2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one original line appears to be missing here
        # (likely an instance-state initialisation) — confirm against VCS.
        self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): an original line is missing here (presumably the
        # self.initialize() call) — confirm against VCS.
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        All console reporting in the extractors is routed through this
        object's to_screen()/report_*() methods.
        """
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # NOTE(review): the original body (presumably `pass`) is missing
        # from this excerpt.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): the original body (presumably `pass`) is missing
        # from this excerpt.
        # NOTE(review): truncated — this appears to be the tail of the
        # IE_NAME accessor, deriving the name by stripping the trailing
        # "IE" from the class name (e.g. YoutubeIE -> "Youtube").
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): several original lines are missing from this excerpt
        # (the `if note is None:` / `try:` / `if errnote is None:` guards);
        # the surviving code is reproduced as found.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of e.g. "text/html; charset=utf-8".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): the branch structure is missing lines in this excerpt
        # (likely `if m:` and a fallback encoding) — reproduced as found.
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        # 'replace' keeps going on undecodable bytes instead of raising.
        return webpage_bytes.decode(encoding, 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): lines are missing throughout this class preamble in the
    # excerpt, including the `_VALID_URL = r'''...'''` opening line for the
    # verbose regex below and most of the _video_extensions /
    # _video_dimensions dict bodies; surviving lines reproduced as found.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (dict body truncated in this excerpt)
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display dimensions (dict body missing from this excerpt)
    _video_dimensions = {
200 def suitable(cls, url):
201 """Receives a URL and returns True if suitable for this IE."""
202 if YoutubePlaylistIE.suitable(url): return False
203 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
205 def report_lang(self):
206 """Report attempt to set language."""
207 self._downloader.to_screen(u'[youtube] Setting language')
    def report_login(self):
        """Report attempt to log in."""
        # Console output is routed through the attached downloader.
        self._downloader.to_screen(u'[youtube] Logging in')
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        # Emitted just before POSTing the verify_age form (see _real_initialize).
        self._downloader.to_screen(u'[youtube] Confirming age')
    def report_video_webpage_download(self, video_id):
        """Report attempt to download the watch page for *video_id*."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download the get_video_info page for *video_id*."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        # NOTE(review): the previous docstring was copy-pasted from the
        # video-info reporter; the message below is what this method does.
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (*sub_lang*, *format*)."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
233 def report_video_subtitles_available(self, video_id, sub_lang_list):
234 """Report available subtitles."""
235 sub_lang = ",".join(list(sub_lang_list.keys()))
236 self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
    def report_information_extraction(self, video_id):
        """Report the start of metadata extraction for *video_id*."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested *format* is not available."""
        # NOTE(review): the previous docstring ("Report extracted video
        # URL.") was copy-pasted from another reporter.
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        # Triggered when video_info advertises an rtmp 'conn' entry.
        self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """List the subtitle tracks for *video_id*.

        On failure returns an (error_message, None) tuple; on success it
        presumably returns the {lang_code: track_name} dict built below
        (the final return is missing from this excerpt).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the `try:` line and the trailing return appear to be
        # missing from this excerpt — reproduced as found.
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Map lang_code -> human-readable track name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for *video_id*."""
        # NOTE(review): on failure _get_available_subtitles() returns an
        # (error, None) tuple, which report_video_subtitles_available()
        # would try to join as a mapping — confirm upstream handling.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Download one subtitle track. Returns a tuple
        (error_message, sub_lang, sub)
        where error_message is None on success."""
        # NOTE(review): several lines are missing from this excerpt (the
        # urlencode payload, the `try:`, and the emptiness check guarding
        # the 'Did not fetch' return) — reproduced as found.
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        # NOTE(review): a few lines are missing from this excerpt (likely a
        # `sub_lang = 'en'` / `else:` pair around the fallback below, and
        # the final `return [subtitle]`) — reproduced as found.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Prefer the user-requested language; otherwise fall back.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): the `subtitles = []` initialiser and the final
        # return appear to be missing from this excerpt.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """Print each itag with its container extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` line is missing from this
        # excerpt.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set language, log in (via --username/--password or .netrc) and
        confirm age with YouTube before extraction."""
        # NOTE(review): this excerpt is missing many lines (`try:`
        # statements, the netrc credential unpacking, most of the
        # login_form_strs literal, the age_form opening, early returns);
        # the surviving lines are reproduced as found.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language (best effort: failure only produces a warning)
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX / dsh tokens out of the login form
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle unicode values
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form reappearing in the response means login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age by POSTing the verification form
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the video id from *url* using _VALID_URL (group 2)."""
        # NOTE(review): the `if mobj is None:` guard and the final
        # `return video_id` are missing from this excerpt.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract metadata and download URLs for a single YouTube video.

        Builds one info dictionary per requested format.
        """
        # NOTE(review): this excerpt is missing many lines (`try:`/`else:`
        # statements, `return` statements after report_error calls, the
        # results-list assembly); the surviving lines are reproduced as
        # found.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (from the itemprop link in the watch page)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalise separators then try several date formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        # duration
        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): ud['sig'] is not guarded by the filter above —
            # a stream entry without a 'sig' key would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap the quality list at the requested limit.
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Assemble one info dict per selected format
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter endpoints used to acknowledge the content disclaimer.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
653 def report_disclaimer(self):
654 """Report disclaimer retrieval."""
655 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        # Shown before POSTing the family-filter form (see _real_initialize).
        self._downloader.to_screen(u'[metacafe] Confirming age')
    def report_download_webpage(self, video_id):
        """Report webpage download for *video_id*."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
    def report_extraction(self, video_id):
        """Report information extraction for *video_id*."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so
        age-restricted videos are served."""
        # NOTE(review): this excerpt is missing lines (`try:` statements and
        # the opening of the disclaimer_form dict) — reproduced as found.
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Metacafe video;
        yt-prefixed ids are handed off to the YouTube extractor."""
        # NOTE(review): this excerpt is missing lines (`if mobj is None:` /
        # `try:` / `return` statements, parts of the gdaKey branch, and the
        # opening of the returned dictionary) — reproduced as found. The
        # .decode('utf-8') calls on str values look like Python-2-era
        # leftovers; confirm against the project's Python 3 port.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Hand the wrapped YouTube id straight to the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

            # Fallback: pull the media URL out of the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.report_error(u'unable to extract media URL')

            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')

            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.report_error(u'unable to extract media URL')

            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 captures the video slug; the id is its leading '_'-token.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
779 def report_extraction(self, video_id):
780 """Report information extraction."""
781 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the best available MP4 stream and metadata for a video."""
        # NOTE(review): this excerpt is missing several lines (`if mobj is
        # None:` guards, the `max_quality = key` selection inside the loop,
        # `else:` branches, and the opening of the returned dictionary) —
        # reproduced as found.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only direct '.flv' links passed via the 'current=' query are matched.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
    def report_download_webpage(self, video_id):
        """Report webpage download for *video_id*."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
    def report_extraction(self, video_id):
        """Report information extraction for *video_id*."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the FLV stream URL, title and uploader for a video."""
        # NOTE(review): this excerpt is missing several lines (`if mobj is
        # None:` guards, `try:`, the video_url assignment, and the opening
        # of the returned dictionary) — reproduced as found. The
        # .decode('utf-8') calls on str look like Python-2-era leftovers.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
    def report_download_webpage(self, video_id):
        """Report webpage download for *video_id*."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
    def report_extraction(self, video_id):
        """Report information extraction for *video_id*."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract metadata for a Yahoo! Video page, first rewriting
        non-/watch/ URLs to their canonical form (one level of recursion,
        gated by *new_video*)."""
        # NOTE(review): this excerpt is missing lines (`if mobj is None:`
        # guards and `try:` statements) and is cut off before the method
        # ends — the surviving lines are reproduced as found.
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.report_error(u'Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.report_error(u'Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.report_error(u'unable to extract video uploader')
        # NOTE(review): group(1) is the (people|profile) path segment; the
        # uploader name is captured by group(2) — this looks like a bug,
        # confirm against upstream history.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.report_error(u'unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.report_error(u'unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.report_error(u'unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.report_error(u'unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
1034 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1035 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1036 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1037 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1038 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1040 self.report_download_webpage(video_id)
1041 webpage = compat_urllib_request.urlopen(request).read()
1042 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1043 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1046 # Extract media URL from playlist XML
1047 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1049 self._downloader.report_error(u'Unable to extract media URL')
1051 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1052 video_url = unescapeHTML(video_url)
1055 'id': video_id.decode('utf-8'),
1057 'uploader': video_uploader,
1058 'upload_date': None,
1059 'title': video_title,
1060 'ext': video_extension.decode('utf-8'),
1061 'thumbnail': video_thumbnail.decode('utf-8'),
1062 'description': video_description,
1066 class VimeoIE(InfoExtractor):
# NOTE(review): numbered excerpt — 'try:' headers, guards, 'return' lines and
# the final dict opening are elided; confirm against the full file.
1067 """Information extractor for vimeo.com."""
1069 # _VALID_URL matches Vimeo URLs
1070 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1073 def __init__(self, downloader=None):
1074 InfoExtractor.__init__(self, downloader)
1076 def report_download_webpage(self, video_id):
1077 """Report webpage download."""
1078 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1080 def report_extraction(self, video_id):
1081 """Report information extraction."""
1082 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1084 def _real_extract(self, url, new_video=True):
1085 # Extract ID from URL
1086 mobj = re.match(self._VALID_URL, url)
1088 self._downloader.report_error(u'Invalid URL: %s' % url)
1091 video_id = mobj.group('id')
# Normalize the URL: force https when no scheme was given, and rewrite the
# play_redirect_hls direct-link form to the canonical watch page.
1092 if not mobj.group('proto'):
1093 url = 'https://' + url
1094 if mobj.group('direct_link'):
1095 url = 'https://vimeo.com/' + video_id
1097 # Retrieve video webpage to extract further information
1098 request = compat_urllib_request.Request(url, None, std_headers)
1100 self.report_download_webpage(video_id)
1101 webpage_bytes = compat_urllib_request.urlopen(request).read()
1102 webpage = webpage_bytes.decode('utf-8')
1103 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1104 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1107 # Now we begin extracting as much information as we can from what we
1108 # retrieved. First we extract the information common to all extractors,
1109 # and latter we extract those that are Vimeo specific.
1110 self.report_extraction(video_id)
1112 # Extract the config JSON
# Brittle by design: slices the inline player config out of the page between
# the ' = {config:' and ',assets:' markers, then parses it as JSON.
1114 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1115 config = json.loads(config)
1117 self._downloader.report_error(u'unable to extract info section')
1121 video_title = config["video"]["title"]
1123 # Extract uploader and uploader_id
1124 video_uploader = config["video"]["owner"]["name"]
1125 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1127 # Extract video thumbnail
1128 video_thumbnail = config["video"]["thumbnail"]
1130 # Extract video description
1131 video_description = get_element_by_attribute("itemprop", "description", webpage)
1132 if video_description: video_description = clean_html(video_description)
1133 else: video_description = ''
1135 # Extract upload date
1136 video_upload_date = None
1137 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1138 if mobj is not None:
1139 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1141 # Vimeo specific: extract request signature and timestamp
1142 sig = config['request']['signature']
1143 timestamp = config['request']['timestamp']
1145 # Vimeo specific: extract video codec and quality information
1146 # First consider quality, then codecs, then take everything
1147 # TODO bind to format param
1148 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1149 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by quality, then take the first entry of the
# best non-empty bucket in hd > sd > other order.
1150 for codec_name, codec_extension in codecs:
1151 if codec_name in config["video"]["files"]:
1152 if 'hd' in config["video"]["files"][codec_name]:
1153 files['hd'].append((codec_name, codec_extension, 'hd'))
1154 elif 'sd' in config["video"]["files"][codec_name]:
1155 files['sd'].append((codec_name, codec_extension, 'sd'))
1157 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1159 for quality in ('hd', 'sd', 'other'):
1160 if len(files[quality]) > 0:
1161 video_quality = files[quality][0][2]
1162 video_codec = files[quality][0][0]
1163 video_extension = files[quality][0][1]
1164 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1167 self._downloader.report_error(u'no known codec found')
# Build the signed play_redirect URL from the request signature/timestamp.
1170 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1171 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1176 'uploader': video_uploader,
1177 'uploader_id': video_uploader_id,
1178 'upload_date': video_upload_date,
1179 'title': video_title,
1180 'ext': video_extension,
1181 'thumbnail': video_thumbnail,
1182 'description': video_description,
1186 class ArteTvIE(InfoExtractor):
# NOTE(review): numbered excerpt — 'try:' headers, grep_webpage's return and
# several call-site argument lines are elided; confirm against the full file.
1187 """arte.tv information extractor."""
1189 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1190 _LIVE_URL = r'index-[0-9]+\.html$'
1192 IE_NAME = u'arte.tv'
1194 def __init__(self, downloader=None):
1195 InfoExtractor.__init__(self, downloader)
1197 def report_download_webpage(self, video_id):
1198 """Report webpage download."""
1199 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1201 def report_extraction(self, video_id):
1202 """Report information extraction."""
1203 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1205 def fetch_webpage(self, url):
# Download url and return the raw page body; network and URL errors are
# routed to the downloader's error reporter.
1206 request = compat_urllib_request.Request(url)
1208 self.report_download_webpage(url)
1209 webpage = compat_urllib_request.urlopen(request).read()
1210 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1211 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1213 except ValueError as err:
1214 self._downloader.report_error(u'Invalid URL: %s' % url)
1218 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch url, search with regex, and build an info dict from matchTuples —
# each tuple is (group_index, key, error_message_if_missing).
1219 page = self.fetch_webpage(url)
1220 mobj = re.search(regex, page, regexFlags)
1224 self._downloader.report_error(u'Invalid URL: %s' % url)
1227 for (i, key, err) in matchTuples:
1228 if mobj.group(i) is None:
1229 self._downloader.trouble(err)
1232 info[key] = mobj.group(i)
1236 def extractLiveStream(self, url):
# Live branch: locate the videothek JS, then scrape the geo-restricted
# stream path, SWF player and base URL from it.
1237 video_lang = url.split('/')[-4]
1238 info = self.grep_webpage(
1240 r'src="(.*?/videothek_js.*?\.js)',
1243 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1246 http_host = url.split('/')[2]
1247 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1248 info = self.grep_webpage(
1250 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1251 '(http://.*?\.swf).*?' +
1255 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1256 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1257 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1260 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1262 def extractPlus7Stream(self, url):
# Plus7 (catch-up) branch: follow videorefFileUrl -> language-specific
# <video ref> -> final XML carrying id/name/date and the hd stream URL.
1263 video_lang = url.split('/')[-3]
1264 info = self.grep_webpage(
1266 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1269 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1272 next_url = compat_urllib_parse.unquote(info.get('url'))
1273 info = self.grep_webpage(
1275 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1278 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1281 next_url = compat_urllib_parse.unquote(info.get('url'))
1283 info = self.grep_webpage(
1285 r'<video id="(.*?)".*?>.*?' +
1286 '<name>(.*?)</name>.*?' +
1287 '<dateVideo>(.*?)</dateVideo>.*?' +
1288 '<url quality="hd">(.*?)</url>',
1291 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1292 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1293 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1294 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1299 'id': info.get('id'),
1300 'url': compat_urllib_parse.unquote(info.get('url')),
1301 'uploader': u'arte.tv',
1302 'upload_date': info.get('date'),
1303 'title': info.get('title').decode('utf-8'),
1309 def _real_extract(self, url):
# Dispatch: URLs ending in index-N.html are live streams, everything else
# goes through the Plus7 extractor.
1310 video_id = url.split('/')[-1]
1311 self.report_extraction(video_id)
1313 if re.search(self._LIVE_URL, video_id) is not None:
1314 self.extractLiveStream(url)
1317 info = self.extractPlus7Stream(url)
1322 class GenericIE(InfoExtractor):
# NOTE(review): numbered excerpt — 'try:' headers, several 'return' lines and
# handler-class bodies are partially elided; confirm against the full file.
1323 """Generic last-resort information extractor."""
1326 IE_NAME = u'generic'
1328 def __init__(self, downloader=None):
1329 InfoExtractor.__init__(self, downloader)
1331 def report_download_webpage(self, video_id):
1332 """Report webpage download."""
1333 if not self._downloader.params.get('test', False):
1334 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1335 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1337 def report_extraction(self, video_id):
1338 """Report information extraction."""
1339 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1341 def report_following_redirect(self, new_url):
1342 """Report information extraction."""
1343 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1345 def _test_redirect(self, url):
1346 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Issues a HEAD request via a purpose-built opener; if the final URL differs
# (shortener/redirect), hands the new URL back to the downloader.
1347 class HeadRequest(compat_urllib_request.Request):
1348 def get_method(self):
1351 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1353 Subclass the HTTPRedirectHandler to make it use our
1354 HeadRequest also on the redirected URL
1356 def redirect_request(self, req, fp, code, msg, headers, newurl):
1357 if code in (301, 302, 303, 307):
1358 newurl = newurl.replace(' ', '%20')
# Drop entity headers when re-issuing the HEAD on the redirect target.
1359 newheaders = dict((k,v) for k,v in req.headers.items()
1360 if k.lower() not in ("content-length", "content-type"))
1361 return HeadRequest(newurl,
1363 origin_req_host=req.get_origin_req_host(),
1366 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1368 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1370 Fallback to GET if HEAD is not allowed (405 HTTP error)
1372 def http_error_405(self, req, fp, code, msg, headers):
1376 newheaders = dict((k,v) for k,v in req.headers.items()
1377 if k.lower() not in ("content-length", "content-type"))
1378 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1380 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with exactly the handlers needed for a HEAD probe.
1384 opener = compat_urllib_request.OpenerDirector()
1385 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1386 HTTPMethodFallback, HEADRedirectHandler,
1387 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1388 opener.add_handler(handler())
1390 response = opener.open(HeadRequest(url))
1391 new_url = response.geturl()
1396 self.report_following_redirect(new_url)
1397 self._downloader.download([new_url])
1400 def _real_extract(self, url):
1401 if self._test_redirect(url): return
1403 video_id = url.split('/')[-1]
1405 webpage = self._download_webpage(url, video_id)
1406 except ValueError as err:
1407 # since this is the last-resort InfoExtractor, if
1408 # this error is thrown, it'll be thrown here
1409 self._downloader.report_error(u'Invalid URL: %s' % url)
1412 self.report_extraction(video_id)
1413 # Start with something easy: JW Player in SWFObject
# Three progressively broader regexes for a direct media URL embedded in the
# page; each subsequent search runs only if the previous one failed.
1414 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1416 # Broaden the search a little bit
1417 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1419 # Broaden the search a little bit: JWPlayer JS loader
1420 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1422 self._downloader.report_error(u'Invalid URL: %s' % url)
1425 # It's possible that one of the regexes
1426 # matched, but returned an empty group:
1427 if mobj.group(1) is None:
1428 self._downloader.report_error(u'Invalid URL: %s' % url)
1431 video_url = compat_urllib_parse.unquote(mobj.group(1))
1432 video_id = os.path.basename(video_url)
1434 # here's a fun little line of code for you:
1435 video_extension = os.path.splitext(video_id)[1][1:]
1436 video_id = os.path.splitext(video_id)[0]
1438 # it's tempting to parse this further, but you would
1439 # have to take into account all the variations like
1440 # Video Title - Site Name
1441 # Site Name | Video Title
1442 # Video Title - Tagline | Site Name
1443 # and so on and so forth; it's just not practical
1444 mobj = re.search(r'<title>(.*)</title>', webpage)
1446 self._downloader.report_error(u'unable to extract title')
1448 video_title = mobj.group(1)
1450 # video uploader is domain name
1451 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1453 self._downloader.report_error(u'unable to extract title')
1455 video_uploader = mobj.group(1)
1460 'uploader': video_uploader,
1461 'upload_date': None,
1462 'title': video_title,
1463 'ext': video_extension,
1467 class YoutubeSearchIE(InfoExtractor):
# NOTE(review): numbered excerpt — guard lines, 'try:' headers and some
# branch bodies are elided; confirm against the full file.
1468 """Information Extractor for YouTube search queries."""
1469 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1470 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1471 _max_youtube_results = 1000
1472 IE_NAME = u'youtube:search'
1474 def __init__(self, downloader=None):
1475 InfoExtractor.__init__(self, downloader)
1477 def report_download_page(self, query, pagenum):
1478 """Report attempt to download search page with given number."""
1479 query = query.decode(preferredencoding())
1480 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1482 def _real_extract(self, query):
# Parse the 'ytsearch<N|all>:<terms>' prefix: no number -> 1 result,
# 'all' -> the extractor's maximum, otherwise the requested count (capped).
1483 mobj = re.match(self._VALID_URL, query)
1485 self._downloader.report_error(u'invalid search query "%s"' % query)
1488 prefix, query = query.split(':')
1490 query = query.encode('utf-8')
1492 self._download_n_results(query, 1)
1494 elif prefix == 'all':
1495 self._download_n_results(query, self._max_youtube_results)
1501 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1503 elif n > self._max_youtube_results:
1504 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1505 n = self._max_youtube_results
1506 self._download_n_results(query, n)
1508 except ValueError: # parsing prefix as integer fails
1509 self._download_n_results(query, 1)
1512 def _download_n_results(self, query, n):
1513 """Downloads a specified number of results for a query"""
# Page through the GData jsonc API 50 results at a time, collect video ids,
# then hand each watch URL to the downloader.
1519 while (50 * pagenum) < limit:
1520 self.report_download_page(query, pagenum+1)
1521 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1522 request = compat_urllib_request.Request(result_url)
1524 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1526 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1528 api_response = json.loads(data)['data']
1530 if not 'items' in api_response:
1531 self._downloader.trouble(u'[youtube] No video results')
1534 new_ids = list(video['id'] for video in api_response['items'])
1535 video_ids += new_ids
# Shrink the page limit to the API's reported total so we stop early.
1537 limit = min(n, api_response['totalItems'])
1540 if len(video_ids) > n:
1541 video_ids = video_ids[:n]
1542 for id in video_ids:
1543 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1547 class GoogleSearchIE(InfoExtractor):
# NOTE(review): numbered excerpt — guard lines, 'try:' headers and loop
# initialization are elided; confirm against the full file. Structure
# parallels YoutubeSearchIE but scrapes HTML result pages instead of an API.
1548 """Information Extractor for Google Video search queries."""
1549 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1550 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1551 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1552 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1553 _max_google_results = 1000
1554 IE_NAME = u'video.google:search'
1556 def __init__(self, downloader=None):
1557 InfoExtractor.__init__(self, downloader)
1559 def report_download_page(self, query, pagenum):
1560 """Report attempt to download playlist page with given number."""
1561 query = query.decode(preferredencoding())
1562 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1564 def _real_extract(self, query):
# Parse the 'gvsearch<N|all>:<terms>' prefix the same way as ytsearch.
1565 mobj = re.match(self._VALID_URL, query)
1567 self._downloader.report_error(u'invalid search query "%s"' % query)
1570 prefix, query = query.split(':')
1572 query = query.encode('utf-8')
1574 self._download_n_results(query, 1)
1576 elif prefix == 'all':
1577 self._download_n_results(query, self._max_google_results)
1583 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1585 elif n > self._max_google_results:
1586 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1587 n = self._max_google_results
1588 self._download_n_results(query, n)
1590 except ValueError: # parsing prefix as integer fails
1591 self._download_n_results(query, 1)
1594 def _download_n_results(self, query, n):
1595 """Downloads a specified number of results for a query"""
# Scrape result pages 10 entries at a time until n ids are collected or the
# "next page" marker disappears; then download each videoplay URL.
1601 self.report_download_page(query, pagenum)
1602 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1603 request = compat_urllib_request.Request(result_url)
1605 page = compat_urllib_request.urlopen(request).read()
1606 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1607 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1610 # Extract video identifiers
1611 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1612 video_id = mobj.group(1)
1613 if video_id not in video_ids:
1614 video_ids.append(video_id)
1615 if len(video_ids) == n:
1616 # Specified n videos reached
1617 for id in video_ids:
1618 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1621 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1622 for id in video_ids:
1623 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1626 pagenum = pagenum + 1
1629 class YahooSearchIE(InfoExtractor):
# NOTE(review): numbered excerpt — guard lines, 'try:' headers and loop
# initialization are elided; confirm against the full file. Same scraping
# pattern as GoogleSearchIE, plus an explicit 'already_seen' dedupe set.
1630 """Information Extractor for Yahoo! Video search queries."""
1633 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1634 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1635 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1636 _MORE_PAGES_INDICATOR = r'\s*Next'
1637 _max_yahoo_results = 1000
1638 IE_NAME = u'video.yahoo:search'
1640 def __init__(self, downloader=None):
1641 InfoExtractor.__init__(self, downloader)
1643 def report_download_page(self, query, pagenum):
1644 """Report attempt to download playlist page with given number."""
1645 query = query.decode(preferredencoding())
1646 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1648 def _real_extract(self, query):
# Parse the 'yvsearch<N|all>:<terms>' prefix the same way as ytsearch.
1649 mobj = re.match(self._VALID_URL, query)
1651 self._downloader.report_error(u'invalid search query "%s"' % query)
1654 prefix, query = query.split(':')
1656 query = query.encode('utf-8')
1658 self._download_n_results(query, 1)
1660 elif prefix == 'all':
1661 self._download_n_results(query, self._max_yahoo_results)
1667 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1669 elif n > self._max_yahoo_results:
1670 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1671 n = self._max_yahoo_results
1672 self._download_n_results(query, n)
1674 except ValueError: # parsing prefix as integer fails
1675 self._download_n_results(query, 1)
1678 def _download_n_results(self, query, n):
1679 """Downloads a specified number of results for a query"""
1682 already_seen = set()
1686 self.report_download_page(query, pagenum)
1687 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1688 request = compat_urllib_request.Request(result_url)
1690 page = compat_urllib_request.urlopen(request).read()
1691 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1692 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1695 # Extract video identifiers
1696 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1697 video_id = mobj.group(1)
1698 if video_id not in already_seen:
1699 video_ids.append(video_id)
1700 already_seen.add(video_id)
1701 if len(video_ids) == n:
1702 # Specified n videos reached
1703 for id in video_ids:
1704 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1707 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1708 for id in video_ids:
1709 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1712 pagenum = pagenum + 1
1715 class YoutubePlaylistIE(InfoExtractor):
# NOTE(review): numbered excerpt — parts of the verbose _VALID_URL regex,
# '_MAX_RESULTS', loop headers and 'try:'/'else:' lines are elided; confirm
# against the full file.
1716 """Information Extractor for YouTube playlists."""
1718 _VALID_URL = r"""(?:
1723 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1724 \? (?:.*?&)*? (?:p|a|list)=
1727 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1730 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1732 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1734 IE_NAME = u'youtube:playlist'
1736 def __init__(self, downloader=None):
1737 InfoExtractor.__init__(self, downloader)
# suitable() is overridden because _VALID_URL is a re.VERBOSE pattern; the
# base-class version would match it without that flag.
1740 def suitable(cls, url):
1741 """Receives a URL and returns True if suitable for this IE."""
1742 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1744 def report_download_page(self, playlist_id, pagenum):
1745 """Report attempt to download playlist page with given number."""
1746 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1748 def _real_extract(self, url):
1749 # Extract playlist id
1750 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1752 self._downloader.report_error(u'invalid url: %s' % url)
1755 # Download playlist videos from API
1756 playlist_id = mobj.group(1) or mobj.group(2)
1761 self.report_download_page(playlist_id, page_num)
1763 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1765 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1766 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1767 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1771 response = json.loads(page)
1772 except ValueError as err:
1773 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1776 if not 'feed' in response or not 'entry' in response['feed']:
1777 self._downloader.report_error(u'Got a malformed response from YouTube API')
# Collect (position, watch-url) pairs; sorting below restores playlist order
# across pages before the positions are discarded.
1779 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1780 for entry in response['feed']['entry']
1781 if 'content' in entry ]
1783 if len(response['feed']['entry']) < self._MAX_RESULTS:
1787 videos = [v[1] for v in sorted(videos)]
# Apply the user's --playlist-start/--playlist-end window (1-based options,
# converted to a 0-based slice; end == -1 means "to the end").
1790 playliststart = self._downloader.params.get('playliststart', 1) - 1
1791 playlistend = self._downloader.params.get('playlistend', -1)
1792 if playlistend == -1:
1793 videos = videos[playliststart:]
1795 videos = videos[playliststart:playlistend]
1797 if len(videos) == total:
1798 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1800 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1802 for video in videos:
1803 self._downloader.download([video])
1807 class YoutubeChannelIE(InfoExtractor):
# NOTE(review): numbered excerpt — loop/variable initialization, 'try:'
# headers and the loop-break line are elided; confirm against the full file.
1808 """Information Extractor for YouTube channels."""
1810 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1811 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1812 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1813 IE_NAME = u'youtube:channel'
1815 def report_download_page(self, channel_id, pagenum):
1816 """Report attempt to download channel page with given number."""
1817 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1819 def _real_extract(self, url):
1820 # Extract channel id
1821 mobj = re.match(self._VALID_URL, url)
1823 self._downloader.report_error(u'invalid url: %s' % url)
1826 # Download channel pages
1827 channel_id = mobj.group(1)
# Page through the channel's /videos listing until the "Next »" marker
# disappears, scraping /watch?v= ids from each page's HTML.
1832 self.report_download_page(channel_id, pagenum)
1833 url = self._TEMPLATE_URL % (channel_id, pagenum)
1834 request = compat_urllib_request.Request(url)
1836 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1837 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1838 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1841 # Extract video identifiers
1843 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1844 if mobj.group(1) not in ids_in_page:
1845 ids_in_page.append(mobj.group(1))
1846 video_ids.extend(ids_in_page)
1848 if self._MORE_PAGES_INDICATOR not in page:
1850 pagenum = pagenum + 1
1852 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1854 for id in video_ids:
1855 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1859 class YoutubeUserIE(InfoExtractor):
# NOTE(review): numbered excerpt — the paging loop header, 'try:' lines and
# the loop-break statement are elided; confirm against the full file.
1860 """Information Extractor for YouTube users."""
1862 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1863 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1864 _GDATA_PAGE_SIZE = 50
1865 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1866 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1867 IE_NAME = u'youtube:user'
1869 def __init__(self, downloader=None):
1870 InfoExtractor.__init__(self, downloader)
1872 def report_download_page(self, username, start_index):
1873 """Report attempt to download user page."""
1874 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1875 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1877 def _real_extract(self, url):
1879 mobj = re.match(self._VALID_URL, url)
1881 self._downloader.report_error(u'invalid url: %s' % url)
1884 username = mobj.group(1)
1886 # Download video ids using YouTube Data API. Result size per
1887 # query is limited (currently to 50 videos) so we need to query
1888 # page by page until there are no video ids - it means we got
# GData uses 1-based start indices, hence the +1 here.
1895 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1896 self.report_download_page(username, start_index)
1898 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1901 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1902 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1903 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1906 # Extract video identifiers
1909 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1910 if mobj.group(1) not in ids_in_page:
1911 ids_in_page.append(mobj.group(1))
1913 video_ids.extend(ids_in_page)
1915 # A little optimization - if current page is not
1916 # "full", ie. does not contain PAGE_SIZE video ids then
1917 # we can assume that this page is the last one - there
1918 # are no more ids on further pages - no need to query
1921 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window (1-based options,
# converted to a 0-based slice; end == -1 means "to the end").
1926 all_ids_count = len(video_ids)
1927 playliststart = self._downloader.params.get('playliststart', 1) - 1
1928 playlistend = self._downloader.params.get('playlistend', -1)
1930 if playlistend == -1:
1931 video_ids = video_ids[playliststart:]
1933 video_ids = video_ids[playliststart:playlistend]
1935 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1936 (username, all_ids_count, len(video_ids)))
1938 for video_id in video_ids:
1939 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to its numeric users_id, then walks the mobile
    AJAX episode listing page by page, collecting video ids and queueing
    each one for download. (The no-op __init__ override that merely
    delegated to InfoExtractor has been removed.)
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12  # videos returned per AJAX page request
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The numeric users_id needed by the AJAX endpoint is embedded
            # in the user page markup.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Fix: urlopen().read() returns bytes; the regular expressions below
    operate on text, and the later .decode('utf-8') calls on str objects
    crash under Python 3. The page is now decoded once at the I/O
    boundary and the redundant decode calls are gone.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode once: the response body is bytes, the regexes need text.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Fixes: (a) the login response is bytes but was searched with a str
    regex (Python 3 TypeError) — it is now decoded first; (b) a missing
    'hd_src' key raised KeyError instead of falling back to 'sd_src' —
    dict.get() now degrades gracefully to the explicit error.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available: proceed anonymously.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode: read() returns bytes, re.search below needs text.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are serialized between these two exact
        # JavaScript fragments in the page source.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream; .get() avoids a KeyError when absent.
        video_url = params.get('hd_src')
        if not video_url:
            video_url = params.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Fix: in the direct-download branch, `title` comes from
    os.path.splitext() and is already text — the old
    `title.decode('UTF-8')` call raises AttributeError under Python 3
    and has been removed.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page whose URL fragment carries
        # the real file reference; resolve it and re-run extraction.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Fix: the invalid-URL branch called self._download.report_error —
    the attribute is self._downloader, so the old code raised
    AttributeError instead of reporting the error. The no-op __init__
    override has also been removed.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # was: self._download.report_error (AttributeError)
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Fix: the local variable `format` shadowed the builtin; renamed to
    `video_format` throughout (the 'format' dictionary key emitted to
    the downloader is unchanged).
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in VERBOSE mode.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut forms (:tds, :colbert, ...) are rewritten to the
        # newest-episode listing URL of the corresponding show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The listing URL redirects to the newest episode; re-match
            # the redirected URL to obtain the concrete episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per episode part; each becomes one info dict.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            video_format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    video_format, rtmp_video_url = f, v

            # Transform the RTMP url into a direct HTTP one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Downloads the video page, pulls description/thumbnail/player URL
    from the <meta> tags, fetches the player configuration (JavaScript
    pseudo-JSON) and takes the stream URL from its playlist.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset advertised in the Content-Type header,
            # defaulting to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in <meta> tags; the player URL in turn carries
        # the configuration URL as a quoted query parameter.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Resolves the moogaloop metadata XML to find the f4m manifest, then
    derives the first-fragment URL from the manifest contents.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # The manifest nodes live in the Adobe f4m namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Scrapes the flv URL, page title and thumbnail straight out of the
    video page markup.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    Fix: the no-op __init__ override (pure delegation to the base class)
    has been removed.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    Fix: error reporting used the deprecated
    self._downloader.trouble(u'ERROR: ...') spelling; replaced with
    report_error() for consistency with SoundcloudIE (report_error adds
    the 'ERROR: ' prefix itself). The no-op __init__ override was
    removed.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): same IE_NAME as SoundcloudIE — presumably intentional
    # at the time; kept as-is to avoid changing user-visible output.
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        # One info dict per track in the set.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The real media path is base64-encoded in the page source
    (jsclassref); it is decoded and appended to the RTMP base URL.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
# NOTE(review): whitespace-mangled, elided paste — each line carries a stray
# original line number and structural lines (`try:` openers, `if ... is None:`
# guards, `break`/`return` statements, dict openers) are missing from this
# view. Code left byte-identical; comments describe visible logic only.
2951 class MixcloudIE(InfoExtractor):
2952 """Information extractor for www.mixcloud.com"""
2954 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2955 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2956 IE_NAME = u'mixcloud'
2958 def __init__(self, downloader=None):
2959 InfoExtractor.__init__(self, downloader)
2961 def report_download_json(self, file_id):
2962 """Report JSON download."""
2963 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2965 def report_extraction(self, file_id):
2966 """Report information extraction."""
2967 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2969 def get_urls(self, jsonData, fmt, bitrate='best'):
2970 """Get urls from 'audio_formats' section in json"""
# Picks the highest bitrate when none/'best'/unknown was requested; the
# surrounding try/except TypeError handles formats with no bitrate level.
2973 bitrate_list = jsonData[fmt]
2974 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2975 bitrate = max(bitrate_list) # select highest
2977 url_list = jsonData[fmt][bitrate]
2978 except TypeError: # we have no bitrate info.
2979 url_list = jsonData[fmt]
2982 def check_urls(self, url_list):
2983 """Returns 1st active url from list"""
# Probes each candidate URL with a HEAD-like open; network failures fall
# through to the next candidate (the except body is elided in this view).
2984 for url in url_list:
2986 compat_urllib_request.urlopen(url)
2988 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2993 def _print_formats(self, formats):
# Human-readable dump of format/bitrate/extension for --list-formats.
2994 print('Available formats:')
2995 for fmt in formats.keys():
2996 for b in formats[fmt]:
2998 ext = formats[fmt][b][0]
2999 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3000 except TypeError: # we have no bitrate info
3001 ext = formats[fmt][0]
3002 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3005 def _real_extract(self, url):
3006 mobj = re.match(self._VALID_URL, url)
3008 self._downloader.report_error(u'invalid URL: %s' % url)
3010 # extract uploader & filename from url
3011 uploader = mobj.group(1).decode('utf-8')
3012 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3014 # construct API request
3015 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3016 # retrieve .json file with links to files
3017 request = compat_urllib_request.Request(file_url)
3019 self.report_download_json(file_url)
3020 jsonData = compat_urllib_request.urlopen(request).read()
3021 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3022 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3026 json_data = json.loads(jsonData)
3027 player_url = json_data['player_swf_url']
3028 formats = dict(json_data['audio_formats'])
3030 req_format = self._downloader.params.get('format', None)
3033 if self._downloader.params.get('listformats', None):
3034 self._print_formats(formats)
# When no specific format is requested, iterate formats until a working
# (reachable) URL is found; otherwise resolve the requested format only.
3037 if req_format is None or req_format == 'best':
3038 for format_param in formats.keys():
3039 url_list = self.get_urls(formats, format_param)
3041 file_url = self.check_urls(url_list)
3042 if file_url is not None:
3045 if req_format not in formats:
3046 self._downloader.report_error(u'format is not available')
3049 url_list = self.get_urls(formats, req_format)
3050 file_url = self.check_urls(url_list)
3051 format_param = req_format
# Result dictionary (opener and return are elided in this view); the
# .decode calls imply Python 2 byte strings — TODO confirm target runtime.
3054 'id': file_id.decode('utf-8'),
3055 'url': file_url.decode('utf-8'),
3056 'uploader': uploader.decode('utf-8'),
3057 'upload_date': None,
3058 'title': json_data['name'],
3059 'ext': file_url.split('.')[-1].decode('utf-8'),
3060 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3061 'thumbnail': json_data['thumbnail_url'],
3062 'description': json_data['description'],
3063 'player_url': player_url.decode('utf-8'),
# NOTE(review): whitespace-mangled, elided paste — stray leading line numbers,
# missing `if mobj is None:` guards, `try:` openers, dict openers and
# `return results` lines. Code left byte-identical; comments only.
3066 class StanfordOpenClassroomIE(InfoExtractor):
3067 """Information extractor for Stanford's Open ClassRoom"""
3069 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3070 IE_NAME = u'stanfordoc'
3072 def report_download_webpage(self, objid):
3073 """Report information extraction."""
3074 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3076 def report_extraction(self, video_id):
3077 """Report information extraction."""
3078 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3080 def _real_extract(self, url):
3081 mobj = re.match(self._VALID_URL, url)
3083 raise ExtractorError(u'Invalid URL: %s' % url)
# Three-way dispatch on the URL: specific video / course page / root page.
3085 if mobj.group('course') and mobj.group('video'): # A specific video
3086 course = mobj.group('course')
3087 video = mobj.group('video')
3089 'id': course + '_' + video,
3091 'upload_date': None,
3094 self.report_extraction(info['id'])
3095 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3096 xmlUrl = baseUrl + video + '.xml'
3098 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3099 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3100 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3102 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3104 info['title'] = mdoc.findall('./title')[0].text
3105 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3107 self._downloader.report_error(u'Invalid metadata XML file')
3109 info['ext'] = info['url'].rpartition('.')[2]
3111 elif mobj.group('course'): # A course page
3112 course = mobj.group('course')
3117 'upload_date': None,
3120 coursepage = self._download_webpage(url, info['id'],
3121 note='Downloading course info page',
3122 errnote='Unable to download course info page')
3124 m = re.search('<h1>([^<]+)</h1>', coursepage)
3126 info['title'] = unescapeHTML(m.group(1))
3128 info['title'] = info['id']
3130 m = re.search('<description>([^<]+)</description>', coursepage)
3132 info['description'] = unescapeHTML(m.group(1))
# Collect per-video links and recursively extract each one via self.extract.
3134 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3137 'type': 'reference',
3138 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3142 for entry in info['list']:
3143 assert entry['type'] == 'reference'
3144 results += self.extract(entry['url'])
3148 'id': 'Stanford OpenClassroom',
3151 'upload_date': None,
3154 self.report_download_webpage(info['id'])
3155 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3157 rootpage = compat_urllib_request.urlopen(rootURL).read()
3158 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3159 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3162 info['title'] = info['id']
3164 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3167 'type': 'reference',
3168 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3173 for entry in info['list']:
3174 assert entry['type'] == 'reference'
3175 results += self.extract(entry['url'])
# NOTE(review): whitespace-mangled, elided paste — stray leading line numbers;
# `if mobj is None:` guards, `return`s, `try:` openers and several info-dict
# entries are missing from this view. Code left byte-identical; comments only.
3178 class MTVIE(InfoExtractor):
3179 """Information extractor for MTV.com"""
3181 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3184 def report_extraction(self, video_id):
3185 """Report information extraction."""
3186 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3188 def _real_extract(self, url):
3189 mobj = re.match(self._VALID_URL, url)
3191 self._downloader.report_error(u'invalid URL: %s' % url)
# Normalize scheme-less URLs before fetching.
3193 if not mobj.group('proto'):
3194 url = 'http://' + url
3195 video_id = mobj.group('videoid')
3197 webpage = self._download_webpage(url, video_id)
3199 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3201 self._downloader.report_error(u'unable to extract song name')
# .decode('iso-8859-1') implies the page bytes are Latin-1 — TODO confirm.
3203 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3204 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3206 self._downloader.report_error(u'unable to extract performer')
3208 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3209 video_title = performer + ' - ' + song_name
3211 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3213 self._downloader.report_error(u'unable to mtvn_uri')
3215 mtvn_uri = mobj.group(1)
3217 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3219 self._downloader.report_error(u'unable to extract content id')
3221 content_id = mobj.group(1)
# Second request: mediaGen XML listing available renditions.
3223 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3224 self.report_extraction(video_id)
3225 request = compat_urllib_request.Request(videogen_url)
3227 metadataXml = compat_urllib_request.urlopen(request).read()
3228 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3229 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3232 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3233 renditions = mdoc.findall('.//rendition')
3235 # For now, always pick the highest quality.
3236 rendition = renditions[-1]
3239 _,_,ext = rendition.attrib['type'].partition('/')
3240 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3241 video_url = rendition.find('./src').text
3243 self._downloader.trouble('Invalid rendition field.')
3249 'uploader': performer,
3250 'upload_date': None,
3251 'title': video_title,
# NOTE(review): whitespace-mangled, elided paste — stray leading line numbers;
# the `def _gen_sid` header, format-selection branches, `try:` openers and
# `return files_info` are missing from this view. Code left byte-identical.
3259 class YoukuIE(InfoExtractor):
3260 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3262 def report_download_webpage(self, file_id):
3263 """Report webpage download."""
3264 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3266 def report_extraction(self, file_id):
3267 """Report information extraction."""
3268 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp plus two random components (the enclosing
# `def _gen_sid(self):` line is elided in this view).
3271 nowTime = int(time.time() * 1000)
3272 random1 = random.randint(1000,1998)
3273 random2 = random.randint(1000,9999)
3275 return "%d%d%d" %(nowTime,random1,random2)
3277 def _get_file_ID_mix_string(self, seed):
# Deterministic LCG-driven shuffle of the character alphabet keyed by `seed`.
3279 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3281 for i in range(len(source)):
3282 seed = (seed * 211 + 30031 ) % 65536
3283 index = math.floor(seed / 65536 * len(source) )
3284 mixed.append(source[int(index)])
3285 source.remove(source[int(index)])
3286 #return ''.join(mixed)
3289 def _get_file_id(self, fileId, seed):
# Decode the obfuscated file id: each '*'-separated number indexes into the
# seed-shuffled alphabet.
3290 mixed = self._get_file_ID_mix_string(seed)
3291 ids = fileId.split('*')
3295 realId.append(mixed[int(ch)])
3296 return ''.join(realId)
3298 def _real_extract(self, url):
3299 mobj = re.match(self._VALID_URL, url)
3301 self._downloader.report_error(u'invalid URL: %s' % url)
3303 video_id = mobj.group('ID')
3305 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3307 request = compat_urllib_request.Request(info_url, None, std_headers)
3309 self.report_download_webpage(video_id)
3310 jsondata = compat_urllib_request.urlopen(request).read()
3311 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3312 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3315 self.report_extraction(video_id)
3317 jsonstr = jsondata.decode('utf-8')
3318 config = json.loads(jsonstr)
3320 video_title = config['data'][0]['title']
3321 seed = config['data'][0]['seed']
3323 format = self._downloader.params.get('format', None)
3324 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format preference ladder (hd2 > ... elided in this view).
3326 if format is None or format == 'best':
3327 if 'hd2' in supported_format:
3332 elif format == 'worst':
3340 fileid = config['data'][0]['streamfileids'][format]
3341 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3342 except (UnicodeDecodeError, ValueError, KeyError):
3343 self._downloader.report_error(u'unable to extract info section')
3347 sid = self._gen_sid()
3348 fileid = self._get_file_id(fileid, seed)
3350 #column 8,9 of fileid represent the segment number
3351 #fileid[7:9] should be changed
3352 for index, key in enumerate(keys):
3354 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3355 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3358 'id': '%s_part%02d' % (video_id, index),
3359 'url': download_url,
3361 'upload_date': None,
3362 'title': video_title,
3365 files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): guards, `try:` opener and the result dict were elided
        # in the pasted source; restored per the sibling extractors' pattern.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # The flv URL is percent-encoded inside the player parameters.
        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
# NOTE(review): whitespace-mangled, elided paste — stray leading line numbers;
# `if mobj is None:` guards, `try:` openers, `return`s and the result-dict
# opener are missing from this view. Code left byte-identical; comments only.
3434 class GooglePlusIE(InfoExtractor):
3435 """Information extractor for plus.google.com."""
3437 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3438 IE_NAME = u'plus.google'
3440 def __init__(self, downloader=None):
3441 InfoExtractor.__init__(self, downloader)
3443 def report_extract_entry(self, url):
3444 """Report downloading extry"""
3445 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3447 def report_date(self, upload_date):
3448 """Report downloading extry"""
3449 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3451 def report_uploader(self, uploader):
3452 """Report downloading extry"""
3453 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3455 def report_title(self, video_title):
3456 """Report downloading extry"""
3457 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3459 def report_extract_vid_page(self, video_page):
3460 """Report information extraction."""
3461 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3463 def _real_extract(self, url):
3464 # Extract id from URL
3465 mobj = re.match(self._VALID_URL, url)
3467 self._downloader.report_error(u'Invalid URL: %s' % url)
3470 post_url = mobj.group(0)
3471 video_id = mobj.group(1)
3473 video_extension = 'flv'
3475 # Step 1, Retrieve post webpage to extract further information
3476 self.report_extract_entry(post_url)
3477 request = compat_urllib_request.Request(post_url)
3479 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3480 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3481 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3484 # Extract update date
3486 pattern = 'title="Timestamp">(.*?)</a>'
3487 mobj = re.search(pattern, webpage)
3489 upload_date = mobj.group(1)
3490 # Convert timestring to a format suitable for filename
3491 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3492 upload_date = upload_date.strftime('%Y%m%d')
3493 self.report_date(upload_date)
3497 pattern = r'rel\="author".*?>(.*?)</a>'
3498 mobj = re.search(pattern, webpage)
3500 uploader = mobj.group(1)
3501 self.report_uploader(uploader)
3504 # Get the first line for title
3506 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3507 mobj = re.search(pattern, webpage)
3509 video_title = mobj.group(1)
3510 self.report_title(video_title)
3512 # Step 2, Stimulate clicking the image box to launch video
3513 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3514 mobj = re.search(pattern, webpage)
3516 self._downloader.report_error(u'unable to extract video page URL')
3518 video_page = mobj.group(1)
3519 request = compat_urllib_request.Request(video_page)
3521 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3522 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3523 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3525 self.report_extract_vid_page(video_page)
3528 # Extract video links on video page
3529 """Extract video links of all sizes"""
3530 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3531 mobj = re.findall(pattern, webpage)
3533 self._downloader.report_error(u'unable to extract video links')
3535 # Sort in resolution
3536 links = sorted(mobj)
3538 # Choose the lowest of the sort, i.e. highest resolution
3539 video_url = links[-1]
3540 # Only get the url. The resolution part in the tuple has no use anymore
3541 video_url = video_url[-1]
3542 # Treat escaped \u0026 style hex
3544 video_url = video_url.decode("unicode_escape")
3545 except AttributeError: # Python 3
3546 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3552 'uploader': uploader,
3553 'upload_date': upload_date,
3554 'title': video_title,
3555 'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com / www.nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        # NOTE(review): pasted source was missing the `if mobj is None:` guard,
        # the `_findProp` else-branch, the info-dict opener and the return;
        # restored per the sibling extractors' pattern.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # CDN URL is derived directly from the page path; no page parsing needed.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            """Return the first unescaped regex group from the page, or default."""
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed: the pasted source used the key 'uploader_date', which is
            # not a field the downloader recognizes (the documented field on
            # the InfoExtractor contract is 'upload_date').
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
# NOTE(review): whitespace-mangled, elided paste — stray leading line numbers;
# guards, `try:` openers, the paging-loop header (`while True:`-style), the
# `break` and `return` lines are missing from this view. Code left
# byte-identical; comments describe visible logic only.
3594 class JustinTVIE(InfoExtractor):
3595 """Information extractor for justin.tv and twitch.tv"""
3596 # TODO: One broadcast may be split into multiple videos. The key
3597 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3598 # starts at 1 and increases. Can we treat all parts as one video?
3600 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3601 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3602 _JUSTIN_PAGE_LIMIT = 100
3603 IE_NAME = u'justin.tv'
3605 def report_extraction(self, file_id):
3606 """Report information extraction."""
3607 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3609 def report_download_page(self, channel, offset):
3610 """Report attempt to download a single page of videos."""
3611 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3612 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3614 # Return count of items, list of *valid* items
3615 def _parse_page(self, url):
3617 urlh = compat_urllib_request.urlopen(url)
3618 webpage_bytes = urlh.read()
3619 webpage = webpage_bytes.decode('utf-8', 'ignore')
3620 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3621 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# The API returns a JSON list on success; anything else is an error object.
3624 response = json.loads(webpage)
3625 if type(response) != list:
3626 error_text = response.get('error', 'unknown error')
3627 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3630 for clip in response:
3631 video_url = clip['video_file_url']
3633 video_extension = os.path.splitext(video_url)[1][1:]
3634 video_date = re.sub('-', '', clip['start_time'][:10])
3635 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3636 video_id = clip['id']
3637 video_title = clip.get('title', video_id)
3641 'title': video_title,
3642 'uploader': clip.get('channel_name', video_uploader_id),
3643 'uploader_id': video_uploader_id,
3644 'upload_date': video_date,
3645 'ext': video_extension,
3647 return (len(response), info)
3649 def _real_extract(self, url):
3650 mobj = re.match(self._VALID_URL, url)
3652 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 1 = channel archives (paged); group 2 = a single broadcast.
3655 api = 'http://api.justin.tv'
3656 video_id = mobj.group(mobj.lastindex)
3658 if mobj.lastindex == 1:
3660 api += '/channel/archives/%s.json'
3662 api += '/broadcast/by_archive/%s.json'
3663 api = api % (video_id,)
3665 self.report_extraction(video_id)
# Paged fetch: stops when a page returns fewer than the page limit (the
# loop header and offset increment are elided in this view).
3669 limit = self._JUSTIN_PAGE_LIMIT
3672 self.report_download_page(video_id, offset)
3673 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3674 page_count, page_info = self._parse_page(page_url)
3675 info.extend(page_info)
3676 if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        # NOTE(review): pasted source was missing the `if ... is None:` guards,
        # the description else-branch, the info-dict opener and the return;
        # restored per the sibling extractors' pattern.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The second <source> element carries the downloadable URL.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
        title = clean_html(m.group('title'))

        # Description is optional on the page.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
# NOTE(review): whitespace-mangled, elided paste — stray leading line numbers;
# the _VALID_URL verbose regex is missing its gameID group line, and the
# per-video dict opener / accumulator / return are missing from this view.
# Code left byte-identical; comments describe visible logic only.
3718 class SteamIE(InfoExtractor):
3719 _VALID_URL = r"""http://store.steampowered.com/
3720 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3722 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because the verbose (?x-less) pattern above needs
# re.VERBOSE at match time.
3726 def suitable(cls, url):
3727 """Receives a URL and returns True if suitable for this IE."""
3728 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3730 def _real_extract(self, url):
3731 m = re.match(self._VALID_URL, url, re.VERBOSE)
3732 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3733 gameID = m.group('gameID')
3734 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3735 webpage = self._download_webpage(videourl, gameID)
# Three parallel scans of the game page: movie entries, titles, thumbnails,
# zipped positionally below — assumes they appear in the same order.
3736 mweb = re.finditer(urlRE, webpage)
3737 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3738 titles = re.finditer(namesRE, webpage)
3739 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3740 thumbs = re.finditer(thumbsRE, webpage)
3742 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3743 video_id = vid.group('videoID')
3744 title = vtitle.group('videoName')
3745 video_url = vid.group('videoURL')
3746 video_thumb = thumb.group('thumbnail')
3748 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3753 'title': unescapeHTML(title),
3754 'thumbnail': video_thumb
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # NOTE(review): the info-dict opener and return were elided in the
        # pasted source; restored per the sibling extractors' pattern.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Direct CDN URL derived from the numeric id; no API call needed.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
# NOTE(review): whitespace-mangled, elided paste — stray leading line numbers;
# the `try:` opener, ext-selection branches (mp4 vs flv), thumbnail
# else-branch and result dict/return are missing from this view. Code left
# byte-identical; comments describe visible logic only.
3781 class WorldStarHipHopIE(InfoExtractor):
3782 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3783 IE_NAME = u'WorldStarHipHop'
3785 def _real_extract(self, url):
# Direct-source URL pattern: matches hosted mp4/flv links in the page.
3786 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3788 webpage_src = compat_urllib_request.urlopen(url).read()
3789 webpage_src = webpage_src.decode('utf-8')
3791 mobj = re.search(_src_url, webpage_src)
3793 m = re.match(self._VALID_URL, url)
3794 video_id = m.group('id')
3796 if mobj is not None:
3797 video_url = mobj.group()
3798 if 'mp4' in video_url:
3803 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3806 _title = r"""<title>(.*)</title>"""
3808 mobj = re.search(_title, webpage_src)
3810 if mobj is not None:
3811 title = mobj.group(1)
3813 title = 'World Start Hip Hop - %s' % time.ctime()
3815 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3816 mobj = re.search(_thumbnail, webpage_src)
3818 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3819 if mobj is not None:
3820 thumbnail = mobj.group(1)
# Fallback for "candy" pages: re-derive the title from candytitles markup.
3822 _title = r"""candytitles.*>(.*)</span>"""
3823 mobj = re.search(_title, webpage_src)
3824 if mobj is not None:
3825 title = mobj.group(1)
3832 'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        # NOTE(review): the `try:` opener, info-dict opener and return were
        # elided in the pasted source; restored per the sibling pattern.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is inlined as JSON in a window.gon script tag.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append a bitrate selector to the Akamai stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
# NOTE(review): whitespace-mangled, elided paste — stray leading line numbers;
# `if ... is None:` guards, the per-link loop header, size/bitrate unpacking,
# format-dict opener/append and several returns are missing from this view.
# Code left byte-identical; comments describe visible logic only.
3873 class YouPornIE(InfoExtractor):
3874 """Information extractor for youporn.com."""
3875 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3877 def _print_formats(self, formats):
3878 """Print all available formats"""
3879 print(u'Available formats:')
3880 print(u'ext\t\tformat')
3881 print(u'---------------------------------')
3882 for format in formats:
3883 print(u'%s\t\t%s' % (format['ext'], format['format']))
3885 def _specific(self, req_format, formats):
# Linear scan for the entry whose 'format' matches the request.
3887 if(x["format"]==req_format):
3891 def _real_extract(self, url):
3892 mobj = re.match(self._VALID_URL, url)
3894 self._downloader.report_error(u'invalid URL: %s' % url)
3897 video_id = mobj.group('videoid')
# Age gate is bypassed by presetting the verification cookie.
3899 req = compat_urllib_request.Request(url)
3900 req.add_header('Cookie', 'age_verified=1')
3901 webpage = self._download_webpage(req, video_id)
3903 # Get the video title
3904 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3906 raise ExtractorError(u'Unable to extract video title')
3907 video_title = result.group('title').strip()
3909 # Get the video date
3910 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3912 self._downloader.report_warning(u'unable to extract video date')
3915 upload_date = result.group('date').strip()
3917 # Get the video uploader
3918 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3920 self._downloader.report_warning(u'unable to extract uploader')
3921 video_uploader = None
3923 video_uploader = result.group('uploader').strip()
3924 video_uploader = clean_html( video_uploader )
3926 # Get all of the formats available
3927 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3928 result = re.search(DOWNLOAD_LIST_RE, webpage)
3930 raise ExtractorError(u'Unable to extract download list')
3931 download_list_html = result.group('download_list').strip()
3933 # Get all of the links from the page
3934 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3935 links = re.findall(LINK_RE, download_list_html)
3936 if(len(links) == 0):
3937 raise ExtractorError(u'ERROR: no known formats available for video')
3939 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3944 # A link looks like this:
3945 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3946 # A path looks like this:
3947 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3948 video_url = unescapeHTML( link )
3949 path = compat_urllib_parse_urlparse( video_url ).path
3950 extension = os.path.splitext( path )[1][1:]
# path component 4 encodes "<size>_<bitrate>_<id>"; first two parts kept.
3951 format = path.split('/')[4].split('_')[:2]
3954 format = "-".join( format )
3955 title = u'%s-%s-%s' % (video_title, size, bitrate)
3960 'uploader': video_uploader,
3961 'upload_date': upload_date,
3966 'description': None,
3970 if self._downloader.params.get('listformats', None):
3971 self._print_formats(formats)
3974 req_format = self._downloader.params.get('format', None)
3975 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Format selection: best/worst/all or an exact format string.
3977 if req_format is None or req_format == 'best':
3979 elif req_format == 'worst':
3980 return [formats[-1]]
3981 elif req_format in ('-1', 'all'):
3984 format = self._specific( req_format, formats )
3986 self._downloader.report_error(u'requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        # NOTE(review): guards, `return`s and the info-dict opener were elided
        # in the pasted source; restored per the sibling extractors' pattern.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # Title comes straight from the URL path, not from the page.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        # NOTE(review): guards, `return`s and the info-dict opener were elided
        # in the pasted source; restored per the sibling extractors' pattern.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page (the real player URL lives there)
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the flash variables
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of songs)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play API track-by-track and return all songs.

        The site only reveals one track per API call; we keep requesting
        the 'next' endpoint until the API flags the last track.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON literal in the page script.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # A random session token is required by the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN video/thumbnail URLs from the id and scrape metadata."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Video and thumbnail live at predictable CDN paths keyed by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        # NOTE(review): these searches raise AttributeError if the page
        # layout changes (no None-check) — confirm whether a friendlier
        # ExtractorError is wanted here.
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader'))

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL = r'''http://www.ted.com/
                     (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                     )
                     (/lang/(.*?))? # The url may contain the language
                     /(?P<name>\w+) # Here goes the name and then ".html"
                     '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regex, so the base class's plain
        # re.match would mis-handle it; override with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
            return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk entry exposes its id and media slug via data-attributes.
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)
        info = []
        # The two finditer streams appear in the same page order, so zip
        # pairs each talk's data-attributes with its title/url anchor.
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url, video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        # talkDetails is a JS object containing the numeric id and the
        # media slug used to build the download URL.
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                      "id":(?P<videoID>[\d]+).*?
                      "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the site's XML metadata endpoint and read url/title from it."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Renamed local from `format` — avoid shadowing the builtin.
            video_format = 'mp4'
        else:
            video_format = format_id_el.text
        # description and thumbnail are optional in the metadata.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Scrape the title from the page, then the file name from the XML feed."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # Per-video XML descriptor lists the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> element is the highest-quality variant.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the direct file URL plus og: metadata from a view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Switched from the deprecated self._downloader.trouble to
            # report_error, matching the rest of this extractor.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            return
        # Strip the site's "LiveLeak.com -" prefix from the og:title.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4360 def gen_extractors():
4361 """ Return a list of an instance of every supported extractor.
4362 The order does matter; the first extractor matched is the one handling the URL.
4365 YoutubePlaylistIE(),
4390 StanfordOpenClassroomIE(),
4400 WorldStarHipHopIE(),