2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 self.report_download_webpage(video_id)
118 elif note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns a tuple (page content as string, URL handle) """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 content = webpage_bytes.decode(encoding, 'replace')
146 return (content, urlh)
148 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
149 """ Returns the data of the page as a string """
150 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
152 def to_screen(self, msg):
153 """Print msg to screen, prefixing it with '[ie_name]'"""
154 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
156 def report_extraction(self, id_or_name):
157 """Report information extraction."""
158 self.to_screen(u'%s: Extracting information' % id_or_name)
160 def report_download_webpage(self, video_id):
161 """Report webpage download."""
162 self.to_screen(u'%s: Downloading webpage' % video_id)
164 def report_age_confirmation(self):
165 """Report attempt to confirm age."""
166 self.to_screen(u'Confirming age')
168 #Methods for following #608
169 #They set the correct value of the '_type' key
170 def video_result(self, video_info):
171 """Returns a video"""
172 video_info['_type'] = 'video'
174 def url_result(self, url, ie=None):
175 """Returns a url that points to a page that should be processed"""
176 #TODO: ie should be the class used for getting the info
177 video_info = {'_type': 'url',
181 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
182 """Returns a playlist"""
183 video_info = {'_type': 'playlist',
186 video_info['id'] = playlist_id
188 video_info['title'] = playlist_title
192 class YoutubeIE(InfoExtractor):
193 """Information extractor for youtube.com."""
197 (?:https?://)? # http(s):// (optional)
198 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
199 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
200 (?:.*?\#/)? # handle anchor (#/) redirect urls
201 (?: # the various things that can precede the ID:
202 (?:(?:v|embed|e)/) # v/ or embed/ or e/
203 |(?: # or the v= param in all its forms
204 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
205 (?:\?|\#!?) # the params delimiter ? or # or #!
206 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 )? # optional -> youtube.com/xxxx is OK
210 )? # all until now is optional -> you can pass the naked ID
211 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
212 (?(1).+)? # if we found the ID, everything can follow
214 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
215 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
216 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
218 _NETRC_MACHINE = 'youtube'
219 # Listed in order of quality
220 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
221 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
222 _video_extensions = {
228 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
234 _video_dimensions = {
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
255 if YoutubePlaylistIE.suitable(url): return False
256 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
258 def report_lang(self):
259 """Report attempt to set language."""
260 self.to_screen(u'Setting language')
262 def report_login(self):
263 """Report attempt to log in."""
264 self.to_screen(u'Logging in')
266 def report_video_webpage_download(self, video_id):
267 """Report attempt to download video webpage."""
268 self.to_screen(u'%s: Downloading video webpage' % video_id)
270 def report_video_info_webpage_download(self, video_id):
271 """Report attempt to download video info webpage."""
272 self.to_screen(u'%s: Downloading video info webpage' % video_id)
274 def report_video_subtitles_download(self, video_id):
275 """Report attempt to download video info webpage."""
276 self.to_screen(u'%s: Checking available subtitles' % video_id)
278 def report_video_subtitles_request(self, video_id, sub_lang, format):
279 """Report attempt to download video info webpage."""
280 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
282 def report_video_subtitles_available(self, video_id, sub_lang_list):
283 """Report available subtitles."""
284 sub_lang = ",".join(list(sub_lang_list.keys()))
285 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
287 def report_information_extraction(self, video_id):
288 """Report attempt to extract video information."""
289 self.to_screen(u'%s: Extracting video information' % video_id)
291 def report_unavailable_format(self, video_id, format):
292 """Report extracted video URL."""
293 self.to_screen(u'%s: Format %s not available' % (video_id, format))
295 def report_rtmp_download(self):
296 """Indicate the download will use the RTMP protocol."""
297 self.to_screen(u'RTMP download detected')
299 def _get_available_subtitles(self, video_id):
300 self.report_video_subtitles_download(video_id)
301 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
303 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
305 return (u'unable to download video subtitles: %s' % compat_str(err), None)
306 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
307 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
308 if not sub_lang_list:
309 return (u'video doesn\'t have subtitles', None)
312 def _list_available_subtitles(self, video_id):
313 sub_lang_list = self._get_available_subtitles(video_id)
314 self.report_video_subtitles_available(video_id, sub_lang_list)
316 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
319 (error_message, sub_lang, sub)
321 self.report_video_subtitles_request(video_id, sub_lang, format)
322 params = compat_urllib_parse.urlencode({
328 url = 'http://www.youtube.com/api/timedtext?' + params
330 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
334 return (u'Did not fetch video subtitles', None, None)
335 return (None, sub_lang, sub)
337 def _extract_subtitle(self, video_id):
339 Return a list with a tuple:
340 [(error_message, sub_lang, sub)]
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
346 if self._downloader.params.get('subtitleslang', False):
347 sub_lang = self._downloader.params.get('subtitleslang')
348 elif 'en' in sub_lang_list:
351 sub_lang = list(sub_lang_list.keys())[0]
352 if not sub_lang in sub_lang_list:
353 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
355 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
358 def _extract_all_subtitles(self, video_id):
359 sub_lang_list = self._get_available_subtitles(video_id)
360 sub_format = self._downloader.params.get('subtitlesformat')
361 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
362 return [(sub_lang_list[0], None, None)]
364 for sub_lang in sub_lang_list:
365 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
366 subtitles.append(subtitle)
369 def _print_formats(self, formats):
370 print('Available formats:')
372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
374 def _real_initialize(self):
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
475 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
477 def _extract_id(self, url):
478 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
480 raise ExtractorError(u'Invalid URL: %s' % url)
481 video_id = mobj.group(2)
484 def _real_extract(self, url):
485 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
486 mobj = re.search(self._NEXT_URL_RE, url)
488 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
489 video_id = self._extract_id(url)
492 self.report_video_webpage_download(video_id)
493 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
494 request = compat_urllib_request.Request(url)
496 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
500 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
502 # Attempt to extract SWF player URL
503 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
505 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
510 self.report_video_info_webpage_download(video_id)
511 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
512 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
513 % (video_id, el_type))
514 video_info_webpage = self._download_webpage(video_info_url, video_id,
516 errnote='unable to download video info webpage')
517 video_info = compat_parse_qs(video_info_webpage)
518 if 'token' in video_info:
520 if 'token' not in video_info:
521 if 'reason' in video_info:
522 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
524 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
526 # Check for "rental" videos
527 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
528 raise ExtractorError(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
534 if 'author' not in video_info:
535 raise ExtractorError(u'Unable to extract uploader name')
536 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
539 video_uploader_id = None
540 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
542 video_uploader_id = mobj.group(1)
544 self._downloader.report_warning(u'unable to extract uploader nickname')
547 if 'title' not in video_info:
548 raise ExtractorError(u'Unable to extract video title')
549 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
552 if 'thumbnail_url' not in video_info:
553 self._downloader.report_warning(u'unable to extract video thumbnail')
555 else: # don't panic if we can't find it
556 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
560 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
562 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
563 upload_date = unified_strdate(upload_date)
566 video_description = get_element_by_id("eow-description", video_webpage)
567 if video_description:
568 video_description = clean_html(video_description)
570 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
572 video_description = unescapeHTML(fd_mobj.group(1))
574 video_description = u''
577 video_subtitles = None
579 if self._downloader.params.get('writesubtitles', False):
580 video_subtitles = self._extract_subtitle(video_id)
582 (sub_error, sub_lang, sub) = video_subtitles[0]
584 self._downloader.report_error(sub_error)
586 if self._downloader.params.get('allsubtitles', False):
587 video_subtitles = self._extract_all_subtitles(video_id)
588 for video_subtitle in video_subtitles:
589 (sub_error, sub_lang, sub) = video_subtitle
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('listsubtitles', False):
594 sub_lang_list = self._list_available_subtitles(video_id)
597 if 'length_seconds' not in video_info:
598 self._downloader.report_warning(u'unable to extract video duration')
601 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
604 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
606 # Decide which formats to download
607 req_format = self._downloader.params.get('format', None)
609 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
610 self.report_rtmp_download()
611 video_url_list = [(None, video_info['conn'][0])]
612 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
613 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
614 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
615 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
616 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
618 format_limit = self._downloader.params.get('format_limit', None)
619 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
620 if format_limit is not None and format_limit in available_formats:
621 format_list = available_formats[available_formats.index(format_limit):]
623 format_list = available_formats
624 existing_formats = [x for x in format_list if x in url_map]
625 if len(existing_formats) == 0:
626 raise ExtractorError(u'no known formats available for video')
627 if self._downloader.params.get('listformats', None):
628 self._print_formats(existing_formats)
630 if req_format is None or req_format == 'best':
631 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
632 elif req_format == 'worst':
633 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
634 elif req_format in ('-1', 'all'):
635 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
637 # Specific formats. We pick the first in a slash-delimeted sequence.
638 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
639 req_formats = req_format.split('/')
640 video_url_list = None
641 for rf in req_formats:
643 video_url_list = [(rf, url_map[rf])]
645 if video_url_list is None:
646 raise ExtractorError(u'requested format not available')
648 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
651 for format_param, video_real_url in video_url_list:
653 video_extension = self._video_extensions.get(format_param, 'flv')
655 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
656 self._video_dimensions.get(format_param, '???'))
660 'url': video_real_url,
661 'uploader': video_uploader,
662 'uploader_id': video_uploader_id,
663 'upload_date': upload_date,
664 'title': video_title,
665 'ext': video_extension,
666 'format': video_format,
667 'thumbnail': video_thumbnail,
668 'description': video_description,
669 'player_url': player_url,
670 'subtitles': video_subtitles,
671 'duration': video_duration
676 class MetacafeIE(InfoExtractor):
677 """Information Extractor for metacafe.com."""
679 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
680 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
681 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
682 IE_NAME = u'metacafe'
684 def report_disclaimer(self):
685 """Report disclaimer retrieval."""
686 self.to_screen(u'Retrieving disclaimer')
688 def _real_initialize(self):
689 # Retrieve disclaimer
690 request = compat_urllib_request.Request(self._DISCLAIMER)
692 self.report_disclaimer()
693 disclaimer = compat_urllib_request.urlopen(request).read()
694 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
695 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
700 'submit': "Continue - I'm over 18",
702 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
704 self.report_age_confirmation()
705 disclaimer = compat_urllib_request.urlopen(request).read()
706 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
707 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
709 def _real_extract(self, url):
710 # Extract id and simplified title from URL
711 mobj = re.match(self._VALID_URL, url)
713 raise ExtractorError(u'Invalid URL: %s' % url)
715 video_id = mobj.group(1)
717 # Check if video comes from YouTube
718 mobj2 = re.match(r'^yt-(.*)$', video_id)
719 if mobj2 is not None:
720 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
722 # Retrieve video webpage to extract further information
723 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
725 # Extract URL, uploader and title from webpage
726 self.report_extraction(video_id)
727 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
729 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
730 video_extension = mediaURL[-3:]
732 # Extract gdaKey if available
733 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
737 gdaKey = mobj.group(1)
738 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
740 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
742 raise ExtractorError(u'Unable to extract media URL')
743 vardict = compat_parse_qs(mobj.group(1))
744 if 'mediaData' not in vardict:
745 raise ExtractorError(u'Unable to extract media URL')
746 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
748 raise ExtractorError(u'Unable to extract media URL')
749 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
750 video_extension = mediaURL[-3:]
751 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
753 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
755 raise ExtractorError(u'Unable to extract title')
756 video_title = mobj.group(1).decode('utf-8')
758 mobj = re.search(r'submitter=(.*?);', webpage)
760 raise ExtractorError(u'Unable to extract uploader nickname')
761 video_uploader = mobj.group(1)
764 'id': video_id.decode('utf-8'),
765 'url': video_url.decode('utf-8'),
766 'uploader': video_uploader.decode('utf-8'),
768 'title': video_title,
769 'ext': video_extension.decode('utf-8'),
772 class DailymotionIE(InfoExtractor):
773 """Information Extractor for Dailymotion"""
775 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
776 IE_NAME = u'dailymotion'
778 def _real_extract(self, url):
779 # Extract id and simplified title from URL
780 mobj = re.match(self._VALID_URL, url)
782 raise ExtractorError(u'Invalid URL: %s' % url)
784 video_id = mobj.group(1).split('_')[0].split('?')[0]
786 video_extension = 'mp4'
788 # Retrieve video webpage to extract further information
789 request = compat_urllib_request.Request(url)
790 request.add_header('Cookie', 'family_filter=off')
791 webpage = self._download_webpage(request, video_id)
793 # Extract URL, uploader and title from webpage
794 self.report_extraction(video_id)
795 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
797 raise ExtractorError(u'Unable to extract media URL')
798 flashvars = compat_urllib_parse.unquote(mobj.group(1))
800 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
803 self.to_screen(u'Using %s' % key)
806 raise ExtractorError(u'Unable to extract video URL')
808 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
810 raise ExtractorError(u'Unable to extract video URL')
812 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
814 # TODO: support choosing qualities
816 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
818 raise ExtractorError(u'Unable to extract title')
819 video_title = unescapeHTML(mobj.group('title'))
821 video_uploader = None
822 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
824 # lookin for official user
825 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
826 if mobj_official is None:
827 self._downloader.report_warning(u'unable to extract uploader nickname')
829 video_uploader = mobj_official.group(1)
831 video_uploader = mobj.group(1)
833 video_upload_date = None
834 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
836 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
841 'uploader': video_uploader,
842 'upload_date': video_upload_date,
843 'title': video_title,
844 'ext': video_extension,
848 class PhotobucketIE(InfoExtractor):
849 """Information extractor for photobucket.com."""
851 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
852 IE_NAME = u'photobucket'
854 def _real_extract(self, url):
855 # Extract id from URL
856 mobj = re.match(self._VALID_URL, url)
858 raise ExtractorError(u'Invalid URL: %s' % url)
860 video_id = mobj.group(1)
862 video_extension = 'flv'
864 # Retrieve video webpage to extract further information
865 request = compat_urllib_request.Request(url)
867 self.report_download_webpage(video_id)
868 webpage = compat_urllib_request.urlopen(request).read()
869 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
870 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
872 # Extract URL, uploader, and title from webpage
873 self.report_extraction(video_id)
874 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
876 raise ExtractorError(u'Unable to extract media URL')
877 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
881 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
883 raise ExtractorError(u'Unable to extract title')
884 video_title = mobj.group(1).decode('utf-8')
886 video_uploader = mobj.group(2).decode('utf-8')
889 'id': video_id.decode('utf-8'),
890 'url': video_url.decode('utf-8'),
891 'uploader': video_uploader,
893 'title': video_title,
894 'ext': video_extension.decode('utf-8'),
898 class YahooIE(InfoExtractor):
899 """Information extractor for video.yahoo.com."""
902 # _VALID_URL matches all Yahoo! Video URLs
903 # _VPAGE_URL matches only the extractable '/watch/' URLs
904 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
905 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
906 IE_NAME = u'video.yahoo'
908 def _real_extract(self, url, new_video=True):
909 # Extract ID from URL
910 mobj = re.match(self._VALID_URL, url)
912 raise ExtractorError(u'Invalid URL: %s' % url)
914 video_id = mobj.group(2)
915 video_extension = 'flv'
917 # Rewrite valid but non-extractable URLs as
918 # extractable English language /watch/ URLs
919 if re.match(self._VPAGE_URL, url) is None:
920 request = compat_urllib_request.Request(url)
922 webpage = compat_urllib_request.urlopen(request).read()
923 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
924 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
926 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
928 raise ExtractorError(u'Unable to extract id field')
929 yahoo_id = mobj.group(1)
931 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
933 raise ExtractorError(u'Unable to extract vid field')
934 yahoo_vid = mobj.group(1)
936 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
937 return self._real_extract(url, new_video=False)
939 # Retrieve video webpage to extract further information
940 request = compat_urllib_request.Request(url)
942 self.report_download_webpage(video_id)
943 webpage = compat_urllib_request.urlopen(request).read()
944 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
945 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
947 # Extract uploader and title from webpage
948 self.report_extraction(video_id)
949 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
951 raise ExtractorError(u'Unable to extract video title')
952 video_title = mobj.group(1).decode('utf-8')
954 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
956 raise ExtractorError(u'Unable to extract video uploader')
957 video_uploader = mobj.group(1).decode('utf-8')
959 # Extract video thumbnail
960 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
962 raise ExtractorError(u'Unable to extract video thumbnail')
963 video_thumbnail = mobj.group(1).decode('utf-8')
965 # Extract video description
966 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
968 raise ExtractorError(u'Unable to extract video description')
969 video_description = mobj.group(1).decode('utf-8')
970 if not video_description:
971 video_description = 'No description available.'
973 # Extract video height and width
974 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
976 raise ExtractorError(u'Unable to extract video height')
977 yv_video_height = mobj.group(1)
979 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
981 raise ExtractorError(u'Unable to extract video width')
982 yv_video_width = mobj.group(1)
984 # Retrieve video playlist to extract media URL
985 # I'm not completely sure what all these options are, but we
986 # seem to need most of them, otherwise the server sends a 401.
987 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
988 yv_bitrate = '700' # according to Wikipedia this is hard-coded
989 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
990 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
991 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
993 self.report_download_webpage(video_id)
994 webpage = compat_urllib_request.urlopen(request).read()
995 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
996 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
998 # Extract media URL from playlist XML
999 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1001 raise ExtractorError(u'Unable to extract media URL')
1002 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1003 video_url = unescapeHTML(video_url)
1006 'id': video_id.decode('utf-8'),
1008 'uploader': video_uploader,
1009 'upload_date': None,
1010 'title': video_title,
1011 'ext': video_extension.decode('utf-8'),
1012 'thumbnail': video_thumbnail.decode('utf-8'),
1013 'description': video_description,
1017 class VimeoIE(InfoExtractor):
1018 """Information extractor for vimeo.com."""
1020 # _VALID_URL matches Vimeo URLs
1021 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1024 def _real_extract(self, url, new_video=True):
1025 # Extract ID from URL
1026 mobj = re.match(self._VALID_URL, url)
1028 raise ExtractorError(u'Invalid URL: %s' % url)
1030 video_id = mobj.group('id')
1031 if not mobj.group('proto'):
1032 url = 'https://' + url
1033 if mobj.group('direct_link'):
1034 url = 'https://vimeo.com/' + video_id
1036 # Retrieve video webpage to extract further information
1037 request = compat_urllib_request.Request(url, None, std_headers)
1038 webpage = self._download_webpage(request, video_id)
1040 # Now we begin extracting as much information as we can from what we
1041 # retrieved. First we extract the information common to all extractors,
1042 # and latter we extract those that are Vimeo specific.
1043 self.report_extraction(video_id)
1045 # Extract the config JSON
1047 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1048 config = json.loads(config)
1050 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1051 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1053 raise ExtractorError(u'Unable to extract info section')
1056 video_title = config["video"]["title"]
1058 # Extract uploader and uploader_id
1059 video_uploader = config["video"]["owner"]["name"]
1060 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1062 # Extract video thumbnail
1063 video_thumbnail = config["video"]["thumbnail"]
1065 # Extract video description
1066 video_description = get_element_by_attribute("itemprop", "description", webpage)
1067 if video_description: video_description = clean_html(video_description)
1068 else: video_description = u''
1070 # Extract upload date
1071 video_upload_date = None
1072 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1073 if mobj is not None:
1074 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1076 # Vimeo specific: extract request signature and timestamp
1077 sig = config['request']['signature']
1078 timestamp = config['request']['timestamp']
1080 # Vimeo specific: extract video codec and quality information
1081 # First consider quality, then codecs, then take everything
1082 # TODO bind to format param
1083 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1084 files = { 'hd': [], 'sd': [], 'other': []}
1085 for codec_name, codec_extension in codecs:
1086 if codec_name in config["video"]["files"]:
1087 if 'hd' in config["video"]["files"][codec_name]:
1088 files['hd'].append((codec_name, codec_extension, 'hd'))
1089 elif 'sd' in config["video"]["files"][codec_name]:
1090 files['sd'].append((codec_name, codec_extension, 'sd'))
1092 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1094 for quality in ('hd', 'sd', 'other'):
1095 if len(files[quality]) > 0:
1096 video_quality = files[quality][0][2]
1097 video_codec = files[quality][0][0]
1098 video_extension = files[quality][0][1]
1099 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1102 raise ExtractorError(u'No known codec found')
1104 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1105 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1110 'uploader': video_uploader,
1111 'uploader_id': video_uploader_id,
1112 'upload_date': video_upload_date,
1113 'title': video_title,
1114 'ext': video_extension,
1115 'thumbnail': video_thumbnail,
1116 'description': video_description,
1120 class ArteTvIE(InfoExtractor):
1121 """arte.tv information extractor."""
1123 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1124 _LIVE_URL = r'index-[0-9]+\.html$'
1126 IE_NAME = u'arte.tv'
1128 def fetch_webpage(self, url):
1129 request = compat_urllib_request.Request(url)
1131 self.report_download_webpage(url)
1132 webpage = compat_urllib_request.urlopen(request).read()
1133 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1134 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1135 except ValueError as err:
1136 raise ExtractorError(u'Invalid URL: %s' % url)
1139 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1140 page = self.fetch_webpage(url)
1141 mobj = re.search(regex, page, regexFlags)
1145 raise ExtractorError(u'Invalid URL: %s' % url)
1147 for (i, key, err) in matchTuples:
1148 if mobj.group(i) is None:
1149 raise ExtractorError(err)
1151 info[key] = mobj.group(i)
1155 def extractLiveStream(self, url):
1156 video_lang = url.split('/')[-4]
1157 info = self.grep_webpage(
1159 r'src="(.*?/videothek_js.*?\.js)',
1162 (1, 'url', u'Invalid URL: %s' % url)
1165 http_host = url.split('/')[2]
1166 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1167 info = self.grep_webpage(
1169 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1170 '(http://.*?\.swf).*?' +
1174 (1, 'path', u'could not extract video path: %s' % url),
1175 (2, 'player', u'could not extract video player: %s' % url),
1176 (3, 'url', u'could not extract video url: %s' % url)
1179 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1181 def extractPlus7Stream(self, url):
1182 video_lang = url.split('/')[-3]
1183 info = self.grep_webpage(
1185 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1188 (1, 'url', u'Invalid URL: %s' % url)
1191 next_url = compat_urllib_parse.unquote(info.get('url'))
1192 info = self.grep_webpage(
1194 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1197 (1, 'url', u'Could not find <video> tag: %s' % url)
1200 next_url = compat_urllib_parse.unquote(info.get('url'))
1202 info = self.grep_webpage(
1204 r'<video id="(.*?)".*?>.*?' +
1205 '<name>(.*?)</name>.*?' +
1206 '<dateVideo>(.*?)</dateVideo>.*?' +
1207 '<url quality="hd">(.*?)</url>',
1210 (1, 'id', u'could not extract video id: %s' % url),
1211 (2, 'title', u'could not extract video title: %s' % url),
1212 (3, 'date', u'could not extract video date: %s' % url),
1213 (4, 'url', u'could not extract video url: %s' % url)
1218 'id': info.get('id'),
1219 'url': compat_urllib_parse.unquote(info.get('url')),
1220 'uploader': u'arte.tv',
1221 'upload_date': unified_strdate(info.get('date')),
1222 'title': info.get('title').decode('utf-8'),
1228 def _real_extract(self, url):
1229 video_id = url.split('/')[-1]
1230 self.report_extraction(video_id)
1232 if re.search(self._LIVE_URL, video_id) is not None:
1233 self.extractLiveStream(url)
1236 info = self.extractPlus7Stream(url)
1241 class GenericIE(InfoExtractor):
1242 """Generic last-resort information extractor."""
1245 IE_NAME = u'generic'
1247 def report_download_webpage(self, video_id):
1248 """Report webpage download."""
1249 if not self._downloader.params.get('test', False):
1250 self._downloader.report_warning(u'Falling back on generic information extractor.')
1251 super(GenericIE, self).report_download_webpage(video_id)
1253 def report_following_redirect(self, new_url):
1254 """Report information extraction."""
1255 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1257 def _test_redirect(self, url):
1258 """Check if it is a redirect, like url shorteners, in case return the new url."""
1259 class HeadRequest(compat_urllib_request.Request):
1260 def get_method(self):
1263 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1265 Subclass the HTTPRedirectHandler to make it use our
1266 HeadRequest also on the redirected URL
1268 def redirect_request(self, req, fp, code, msg, headers, newurl):
1269 if code in (301, 302, 303, 307):
1270 newurl = newurl.replace(' ', '%20')
1271 newheaders = dict((k,v) for k,v in req.headers.items()
1272 if k.lower() not in ("content-length", "content-type"))
1273 return HeadRequest(newurl,
1275 origin_req_host=req.get_origin_req_host(),
1278 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1280 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1282 Fallback to GET if HEAD is not allowed (405 HTTP error)
1284 def http_error_405(self, req, fp, code, msg, headers):
1288 newheaders = dict((k,v) for k,v in req.headers.items()
1289 if k.lower() not in ("content-length", "content-type"))
1290 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1292 origin_req_host=req.get_origin_req_host(),
1296 opener = compat_urllib_request.OpenerDirector()
1297 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1298 HTTPMethodFallback, HEADRedirectHandler,
1299 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1300 opener.add_handler(handler())
1302 response = opener.open(HeadRequest(url))
1303 new_url = response.geturl()
1308 self.report_following_redirect(new_url)
1311 def _real_extract(self, url):
1312 new_url = self._test_redirect(url)
1313 if new_url: return [self.url_result(new_url)]
1315 video_id = url.split('/')[-1]
1317 webpage = self._download_webpage(url, video_id)
1318 except ValueError as err:
1319 # since this is the last-resort InfoExtractor, if
1320 # this error is thrown, it'll be thrown here
1321 raise ExtractorError(u'Invalid URL: %s' % url)
1323 self.report_extraction(video_id)
1324 # Start with something easy: JW Player in SWFObject
1325 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1327 # Broaden the search a little bit
1328 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1330 # Broaden the search a little bit: JWPlayer JS loader
1331 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1333 raise ExtractorError(u'Invalid URL: %s' % url)
1335 # It's possible that one of the regexes
1336 # matched, but returned an empty group:
1337 if mobj.group(1) is None:
1338 raise ExtractorError(u'Invalid URL: %s' % url)
1340 video_url = compat_urllib_parse.unquote(mobj.group(1))
1341 video_id = os.path.basename(video_url)
1343 # here's a fun little line of code for you:
1344 video_extension = os.path.splitext(video_id)[1][1:]
1345 video_id = os.path.splitext(video_id)[0]
1347 # it's tempting to parse this further, but you would
1348 # have to take into account all the variations like
1349 # Video Title - Site Name
1350 # Site Name | Video Title
1351 # Video Title - Tagline | Site Name
1352 # and so on and so forth; it's just not practical
1353 mobj = re.search(r'<title>(.*)</title>', webpage)
1355 raise ExtractorError(u'Unable to extract title')
1356 video_title = mobj.group(1)
1358 # video uploader is domain name
1359 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1361 raise ExtractorError(u'Unable to extract title')
1362 video_uploader = mobj.group(1)
1367 'uploader': video_uploader,
1368 'upload_date': None,
1369 'title': video_title,
1370 'ext': video_extension,
1374 class YoutubeSearchIE(InfoExtractor):
1375 """Information Extractor for YouTube search queries."""
1376 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1377 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1378 _max_youtube_results = 1000
1379 IE_NAME = u'youtube:search'
1381 def report_download_page(self, query, pagenum):
1382 """Report attempt to download search page with given number."""
1383 query = query.decode(preferredencoding())
1384 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1386 def _real_extract(self, query):
1387 mobj = re.match(self._VALID_URL, query)
1389 raise ExtractorError(u'Invalid search query "%s"' % query)
1391 prefix, query = query.split(':')
1393 query = query.encode('utf-8')
1395 return self._get_n_results(query, 1)
1396 elif prefix == 'all':
1397 self._get_n_results(query, self._max_youtube_results)
1402 raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
1403 elif n > self._max_youtube_results:
1404 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1405 n = self._max_youtube_results
1406 return self._get_n_results(query, n)
1407 except ValueError: # parsing prefix as integer fails
1408 return self._get_n_results(query, 1)
1410 def _get_n_results(self, query, n):
1411 """Get a specified number of results for a query"""
1417 while (50 * pagenum) < limit:
1418 self.report_download_page(query, pagenum+1)
1419 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1420 request = compat_urllib_request.Request(result_url)
1422 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1423 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1424 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1425 api_response = json.loads(data)['data']
1427 if not 'items' in api_response:
1428 raise ExtractorError(u'[youtube] No video results')
1430 new_ids = list(video['id'] for video in api_response['items'])
1431 video_ids += new_ids
1433 limit = min(n, api_response['totalItems'])
1436 if len(video_ids) > n:
1437 video_ids = video_ids[:n]
1438 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1442 class GoogleSearchIE(InfoExtractor):
1443 """Information Extractor for Google Video search queries."""
1444 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1445 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1446 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1447 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1448 _max_google_results = 1000
1449 IE_NAME = u'video.google:search'
1451 def report_download_page(self, query, pagenum):
1452 """Report attempt to download playlist page with given number."""
1453 query = query.decode(preferredencoding())
1454 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1456 def _real_extract(self, query):
1457 mobj = re.match(self._VALID_URL, query)
1459 raise ExtractorError(u'Invalid search query "%s"' % query)
1461 prefix, query = query.split(':')
1463 query = query.encode('utf-8')
1465 self._download_n_results(query, 1)
1467 elif prefix == 'all':
1468 self._download_n_results(query, self._max_google_results)
1474 raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
1475 elif n > self._max_google_results:
1476 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1477 n = self._max_google_results
1478 self._download_n_results(query, n)
1480 except ValueError: # parsing prefix as integer fails
1481 self._download_n_results(query, 1)
1484 def _download_n_results(self, query, n):
1485 """Downloads a specified number of results for a query"""
1491 self.report_download_page(query, pagenum)
1492 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1493 request = compat_urllib_request.Request(result_url)
1495 page = compat_urllib_request.urlopen(request).read()
1496 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1497 raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))
1499 # Extract video identifiers
1500 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1501 video_id = mobj.group(1)
1502 if video_id not in video_ids:
1503 video_ids.append(video_id)
1504 if len(video_ids) == n:
1505 # Specified n videos reached
1506 for id in video_ids:
1507 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1510 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1511 for id in video_ids:
1512 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1515 pagenum = pagenum + 1
1518 class YahooSearchIE(InfoExtractor):
1519 """Information Extractor for Yahoo! Video search queries."""
1522 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1523 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1524 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1525 _MORE_PAGES_INDICATOR = r'\s*Next'
1526 _max_yahoo_results = 1000
1527 IE_NAME = u'video.yahoo:search'
1529 def report_download_page(self, query, pagenum):
1530 """Report attempt to download playlist page with given number."""
1531 query = query.decode(preferredencoding())
1532 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1534 def _real_extract(self, query):
1535 mobj = re.match(self._VALID_URL, query)
1537 raise ExtractorError(u'Invalid search query "%s"' % query)
1539 prefix, query = query.split(':')
1541 query = query.encode('utf-8')
1543 self._download_n_results(query, 1)
1545 elif prefix == 'all':
1546 self._download_n_results(query, self._max_yahoo_results)
1552 raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
1553 elif n > self._max_yahoo_results:
1554 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1555 n = self._max_yahoo_results
1556 self._download_n_results(query, n)
1558 except ValueError: # parsing prefix as integer fails
1559 self._download_n_results(query, 1)
1562 def _download_n_results(self, query, n):
1563 """Downloads a specified number of results for a query"""
1566 already_seen = set()
1570 self.report_download_page(query, pagenum)
1571 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1572 request = compat_urllib_request.Request(result_url)
1574 page = compat_urllib_request.urlopen(request).read()
1575 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1576 raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))
1578 # Extract video identifiers
1579 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1580 video_id = mobj.group(1)
1581 if video_id not in already_seen:
1582 video_ids.append(video_id)
1583 already_seen.add(video_id)
1584 if len(video_ids) == n:
1585 # Specified n videos reached
1586 for id in video_ids:
1587 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1590 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1591 for id in video_ids:
1592 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1595 pagenum = pagenum + 1
1598 class YoutubePlaylistIE(InfoExtractor):
1599 """Information Extractor for YouTube playlists."""
1601 _VALID_URL = r"""(?:
1606 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1607 \? (?:.*?&)*? (?:p|a|list)=
1610 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1613 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1615 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1617 IE_NAME = u'youtube:playlist'
1620 def suitable(cls, url):
1621 """Receives a URL and returns True if suitable for this IE."""
1622 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1624 def _real_extract(self, url):
1625 # Extract playlist id
1626 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1628 raise ExtractorError(u'Invalid URL: %s' % url)
1630 # Download playlist videos from API
1631 playlist_id = mobj.group(1) or mobj.group(2)
1636 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1637 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1640 response = json.loads(page)
1641 except ValueError as err:
1642 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1644 if 'feed' not in response:
1645 raise ExtractorError(u'Got a malformed response from YouTube API')
1646 playlist_title = response['feed']['title']['$t']
1647 if 'entry' not in response['feed']:
1648 # Number of videos is a multiple of self._MAX_RESULTS
1651 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1652 for entry in response['feed']['entry']
1653 if 'content' in entry ]
1655 if len(response['feed']['entry']) < self._MAX_RESULTS:
1659 videos = [v[1] for v in sorted(videos)]
1661 url_results = [self.url_result(url, 'Youtube') for url in videos]
1662 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1665 class YoutubeChannelIE(InfoExtractor):
1666 """Information Extractor for YouTube channels."""
1668 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1669 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1670 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1671 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1672 IE_NAME = u'youtube:channel'
1674 def extract_videos_from_page(self, page):
1676 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1677 if mobj.group(1) not in ids_in_page:
1678 ids_in_page.append(mobj.group(1))
1681 def _real_extract(self, url):
1682 # Extract channel id
1683 mobj = re.match(self._VALID_URL, url)
1685 raise ExtractorError(u'Invalid URL: %s' % url)
1687 # Download channel page
1688 channel_id = mobj.group(1)
1692 url = self._TEMPLATE_URL % (channel_id, pagenum)
1693 page = self._download_webpage(url, channel_id,
1694 u'Downloading page #%s' % pagenum)
1696 # Extract video identifiers
1697 ids_in_page = self.extract_videos_from_page(page)
1698 video_ids.extend(ids_in_page)
1700 # Download any subsequent channel pages using the json-based channel_ajax query
1701 if self._MORE_PAGES_INDICATOR in page:
1703 pagenum = pagenum + 1
1705 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1706 page = self._download_webpage(url, channel_id,
1707 u'Downloading page #%s' % pagenum)
1709 page = json.loads(page)
1711 ids_in_page = self.extract_videos_from_page(page['content_html'])
1712 video_ids.extend(ids_in_page)
1714 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1717 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1719 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1720 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1721 return [self.playlist_result(url_entries, channel_id)]
1724 class YoutubeUserIE(InfoExtractor):
1725 """Information Extractor for YouTube users."""
1727 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1728 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1729 _GDATA_PAGE_SIZE = 50
1730 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1731 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1732 IE_NAME = u'youtube:user'
1734 def _real_extract(self, url):
1736 mobj = re.match(self._VALID_URL, url)
1738 raise ExtractorError(u'Invalid URL: %s' % url)
1740 username = mobj.group(1)
1742 # Download video ids using YouTube Data API. Result size per
1743 # query is limited (currently to 50 videos) so we need to query
1744 # page by page until there are no video ids - it means we got
1751 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1753 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1754 page = self._download_webpage(gdata_url, username,
1755 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1757 # Extract video identifiers
1760 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1761 if mobj.group(1) not in ids_in_page:
1762 ids_in_page.append(mobj.group(1))
1764 video_ids.extend(ids_in_page)
1766 # A little optimization - if current page is not
1767 # "full", ie. does not contain PAGE_SIZE video ids then
1768 # we can assume that this page is the last one - there
1769 # are no more ids on further pages - no need to query
1772 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1777 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1778 url_results = [self.url_result(url, 'Youtube') for url in urls]
1779 return [self.playlist_result(url_results, playlist_title = username)]
1782 class BlipTVUserIE(InfoExtractor):
1783 """Information Extractor for blip.tv users."""
1785 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1787 IE_NAME = u'blip.tv:user'
1789 def _real_extract(self, url):
1791 mobj = re.match(self._VALID_URL, url)
1793 raise ExtractorError(u'Invalid URL: %s' % url)
1795 username = mobj.group(1)
1797 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1799 page = self._download_webpage(url, username, u'Downloading user page')
1800 mobj = re.search(r'data-users-id="([^"]+)"', page)
1801 page_base = page_base % mobj.group(1)
1804 # Download video ids using BlipTV Ajax calls. Result size per
1805 # query is limited (currently to 12 videos) so we need to query
1806 # page by page until there are no video ids - it means we got
1813 url = page_base + "&page=" + str(pagenum)
1814 page = self._download_webpage(url, username,
1815 u'Downloading video ids from page %d' % pagenum)
1817 # Extract video identifiers
1820 for mobj in re.finditer(r'href="/([^"]+)"', page):
1821 if mobj.group(1) not in ids_in_page:
1822 ids_in_page.append(unescapeHTML(mobj.group(1)))
1824 video_ids.extend(ids_in_page)
1826 # A little optimization - if current page is not
1827 # "full", ie. does not contain PAGE_SIZE video ids then
1828 # we can assume that this page is the last one - there
1829 # are no more ids on further pages - no need to query
1832 if len(ids_in_page) < self._PAGE_SIZE:
1837 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1838 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1839 return [self.playlist_result(url_entries, playlist_title = username)]
1842 class DepositFilesIE(InfoExtractor):
1843 """Information extractor for depositfiles.com"""
1845 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1847 def _real_extract(self, url):
1848 file_id = url.split('/')[-1]
1849 # Rebuild url in english locale
1850 url = 'http://depositfiles.com/en/files/' + file_id
1852 # Retrieve file webpage with 'Free download' button pressed
1853 free_download_indication = { 'gateway_result' : '1' }
1854 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1856 self.report_download_webpage(file_id)
1857 webpage = compat_urllib_request.urlopen(request).read()
1858 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1859 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1861 # Search for the real file URL
1862 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1863 if (mobj is None) or (mobj.group(1) is None):
1864 # Try to figure out reason of the error.
1865 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1866 if (mobj is not None) and (mobj.group(1) is not None):
1867 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1868 raise ExtractorError(u'%s' % restriction_message)
1870 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1872 file_url = mobj.group(1)
1873 file_extension = os.path.splitext(file_url)[1][1:]
1875 # Search for file title
1876 mobj = re.search(r'<b title="(.*?)">', webpage)
1878 raise ExtractorError(u'Unable to extract title')
1879 file_title = mobj.group(1).decode('utf-8')
1882 'id': file_id.decode('utf-8'),
1883 'url': file_url.decode('utf-8'),
1885 'upload_date': None,
1886 'title': file_title,
1887 'ext': file_extension.decode('utf-8'),
1891 class FacebookIE(InfoExtractor):
1892 """Information Extractor for Facebook"""
1894 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1895 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1896 _NETRC_MACHINE = 'facebook'
1897 IE_NAME = u'facebook'
1899 def report_login(self):
1900 """Report attempt to log in."""
1901 self.to_screen(u'Logging in')
1903 def _real_initialize(self):
1904 if self._downloader is None:
1909 downloader_params = self._downloader.params
1911 # Attempt to use provided username and password or .netrc data
1912 if downloader_params.get('username', None) is not None:
1913 useremail = downloader_params['username']
1914 password = downloader_params['password']
1915 elif downloader_params.get('usenetrc', False):
1917 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1918 if info is not None:
1922 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1923 except (IOError, netrc.NetrcParseError) as err:
1924 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1927 if useremail is None:
1936 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1939 login_results = compat_urllib_request.urlopen(request).read()
1940 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1941 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1943 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1944 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1947 def _real_extract(self, url):
1948 mobj = re.match(self._VALID_URL, url)
1950 raise ExtractorError(u'Invalid URL: %s' % url)
1951 video_id = mobj.group('ID')
1953 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1954 webpage = self._download_webpage(url, video_id)
1956 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1957 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1958 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1960 raise ExtractorError(u'Cannot parse data')
1961 data = dict(json.loads(m.group(1)))
1962 params_raw = compat_urllib_parse.unquote(data['params'])
1963 params = json.loads(params_raw)
1964 video_data = params['video_data'][0]
1965 video_url = video_data.get('hd_src')
1967 video_url = video_data['sd_src']
1969 raise ExtractorError(u'Cannot find video URL')
1970 video_duration = int(video_data['video_duration'])
1971 thumbnail = video_data['thumbnail_src']
1973 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
1975 raise ExtractorError(u'Cannot find title in webpage')
1976 video_title = unescapeHTML(m.group(1))
1980 'title': video_title,
1983 'duration': video_duration,
1984 'thumbnail': thumbnail,
1989 class BlipTVIE(InfoExtractor):
1990 """Information extractor for blip.tv"""
1992 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1993 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1994 IE_NAME = u'blip.tv'
1996 def report_direct_download(self, title):
1997 """Report information extraction."""
1998 self.to_screen(u'%s: Direct download detected' % title)
2000 def _real_extract(self, url):
2001 mobj = re.match(self._VALID_URL, url)
2003 raise ExtractorError(u'Invalid URL: %s' % url)
2005 urlp = compat_urllib_parse_urlparse(url)
2006 if urlp.path.startswith('/play/'):
2007 request = compat_urllib_request.Request(url)
2008 response = compat_urllib_request.urlopen(request)
2009 redirecturl = response.geturl()
2010 rurlp = compat_urllib_parse_urlparse(redirecturl)
2011 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2012 url = 'http://blip.tv/a/a-' + file_id
2013 return self._real_extract(url)
2020 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2021 request = compat_urllib_request.Request(json_url)
2022 request.add_header('User-Agent', 'iTunes/10.6.1')
2023 self.report_extraction(mobj.group(1))
2026 urlh = compat_urllib_request.urlopen(request)
2027 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2028 basename = url.split('/')[-1]
2029 title,ext = os.path.splitext(basename)
2030 title = title.decode('UTF-8')
2031 ext = ext.replace('.', '')
2032 self.report_direct_download(title)
2037 'upload_date': None,
2042 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2043 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2044 if info is None: # Regular URL
2046 json_code_bytes = urlh.read()
2047 json_code = json_code_bytes.decode('utf-8')
2048 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2049 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2052 json_data = json.loads(json_code)
2053 if 'Post' in json_data:
2054 data = json_data['Post']
2058 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2059 video_url = data['media']['url']
2060 umobj = re.match(self._URL_EXT, video_url)
2062 raise ValueError('Can not determine filename extension')
2063 ext = umobj.group(1)
2066 'id': data['item_id'],
2068 'uploader': data['display_name'],
2069 'upload_date': upload_date,
2070 'title': data['title'],
2072 'format': data['media']['mimeType'],
2073 'thumbnail': data['thumbnailUrl'],
2074 'description': data['description'],
2075 'player_url': data['embedUrl'],
2076 'user_agent': 'iTunes/10.6.1',
2078 except (ValueError,KeyError) as err:
2079 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2084 class MyVideoIE(InfoExtractor):
2085 """Information Extractor for myvideo.de."""
2087 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2088 IE_NAME = u'myvideo'
2090 def _real_extract(self,url):
2091 mobj = re.match(self._VALID_URL, url)
2093 raise ExtractorError(u'Invalid URL: %s' % url)
2095 video_id = mobj.group(1)
2098 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2099 webpage = self._download_webpage(webpage_url, video_id)
2101 self.report_extraction(video_id)
2102 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2105 raise ExtractorError(u'Unable to extract media URL')
2106 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2108 mobj = re.search('<title>([^<]+)</title>', webpage)
2110 raise ExtractorError(u'Unable to extract title')
2112 video_title = mobj.group(1)
2118 'upload_date': None,
2119 'title': video_title,
2123 class ComedyCentralIE(InfoExtractor):
2124 """Information extractor for The Daily Show and Colbert Report """
2126 # urls can be abbreviations like :thedailyshow or :colbert
2127 # urls for episodes like:
2128 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2129 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2130 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2131 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2132 |(https?://)?(www\.)?
2133 (?P<showname>thedailyshow|colbertnation)\.com/
2134 (full-episodes/(?P<episode>.*)|
2136 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2137 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2140 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2142 _video_extensions = {
2150 _video_dimensions = {
2160 def suitable(cls, url):
2161 """Receives a URL and returns True if suitable for this IE."""
2162 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2164 def _print_formats(self, formats):
2165 print('Available formats:')
2167 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2170 def _real_extract(self, url):
2171 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2173 raise ExtractorError(u'Invalid URL: %s' % url)
2175 if mobj.group('shortname'):
2176 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2177 url = u'http://www.thedailyshow.com/full-episodes/'
2179 url = u'http://www.colbertnation.com/full-episodes/'
2180 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2181 assert mobj is not None
2183 if mobj.group('clip'):
2184 if mobj.group('showname') == 'thedailyshow':
2185 epTitle = mobj.group('tdstitle')
2187 epTitle = mobj.group('cntitle')
2190 dlNewest = not mobj.group('episode')
2192 epTitle = mobj.group('showname')
2194 epTitle = mobj.group('episode')
2196 self.report_extraction(epTitle)
2197 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2199 url = htmlHandle.geturl()
2200 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2202 raise ExtractorError(u'Invalid redirected URL: ' + url)
2203 if mobj.group('episode') == '':
2204 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2205 epTitle = mobj.group('episode')
2207 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2209 if len(mMovieParams) == 0:
2210 # The Colbert Report embeds the information in a without
2211 # a URL prefix; so extract the alternate reference
2212 # and then add the URL prefix manually.
2214 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2215 if len(altMovieParams) == 0:
2216 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2218 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2220 uri = mMovieParams[0][1]
2221 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2222 indexXml = self._download_webpage(indexUrl, epTitle,
2223 u'Downloading show index',
2224 u'unable to download episode index')
2228 idoc = xml.etree.ElementTree.fromstring(indexXml)
2229 itemEls = idoc.findall('.//item')
2230 for partNum,itemEl in enumerate(itemEls):
2231 mediaId = itemEl.findall('./guid')[0].text
2232 shortMediaId = mediaId.split(':')[-1]
2233 showId = mediaId.split(':')[-2].replace('.com', '')
2234 officialTitle = itemEl.findall('./title')[0].text
2235 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2237 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2238 compat_urllib_parse.urlencode({'uri': mediaId}))
2239 configXml = self._download_webpage(configUrl, epTitle,
2240 u'Downloading configuration for %s' % shortMediaId)
2242 cdoc = xml.etree.ElementTree.fromstring(configXml)
2244 for rendition in cdoc.findall('.//rendition'):
2245 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2249 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2252 if self._downloader.params.get('listformats', None):
2253 self._print_formats([i[0] for i in turls])
2256 # For now, just pick the highest bitrate
2257 format,rtmp_video_url = turls[-1]
2259 # Get the format arg from the arg stream
2260 req_format = self._downloader.params.get('format', None)
2262 # Select format if we can find one
2265 format, rtmp_video_url = f, v
2268 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2270 raise ExtractorError(u'Cannot transform RTMP url')
2271 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2272 video_url = base + m.group('finalid')
2274 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2279 'upload_date': officialDate,
2284 'description': officialTitle,
2286 results.append(info)
2291 class EscapistIE(InfoExtractor):
2292 """Information extractor for The Escapist """
2294 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2295 IE_NAME = u'escapist'
2297 def _real_extract(self, url):
2298 mobj = re.match(self._VALID_URL, url)
2300 raise ExtractorError(u'Invalid URL: %s' % url)
2301 showName = mobj.group('showname')
2302 videoId = mobj.group('episode')
2304 self.report_extraction(showName)
2305 webPage = self._download_webpage(url, showName)
2307 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2308 description = unescapeHTML(descMatch.group(1))
2309 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2310 imgUrl = unescapeHTML(imgMatch.group(1))
2311 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2312 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2313 configUrlMatch = re.search('config=(.*)$', playerUrl)
2314 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2316 configJSON = self._download_webpage(configUrl, showName,
2317 u'Downloading configuration',
2318 u'unable to download configuration')
2320 # Technically, it's JavaScript, not JSON
2321 configJSON = configJSON.replace("'", '"')
2324 config = json.loads(configJSON)
2325 except (ValueError,) as err:
2326 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2328 playlist = config['playlist']
2329 videoUrl = playlist[1]['url']
2334 'uploader': showName,
2335 'upload_date': None,
2338 'thumbnail': imgUrl,
2339 'description': description,
2340 'player_url': playerUrl,
2345 class CollegeHumorIE(InfoExtractor):
2346 """Information extractor for collegehumor.com"""
2349 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2350 IE_NAME = u'collegehumor'
2352 def report_manifest(self, video_id):
2353 """Report information extraction."""
2354 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2356 def _real_extract(self, url):
2357 mobj = re.match(self._VALID_URL, url)
2359 raise ExtractorError(u'Invalid URL: %s' % url)
2360 video_id = mobj.group('videoid')
2365 'upload_date': None,
2368 self.report_extraction(video_id)
2369 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2371 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2372 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2373 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2375 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2377 videoNode = mdoc.findall('./video')[0]
2378 info['description'] = videoNode.findall('./description')[0].text
2379 info['title'] = videoNode.findall('./caption')[0].text
2380 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2381 manifest_url = videoNode.findall('./file')[0].text
2383 raise ExtractorError(u'Invalid metadata XML file')
2385 manifest_url += '?hdcore=2.10.3'
2386 self.report_manifest(video_id)
2388 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2389 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2390 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2392 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2394 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2395 node_id = media_node.attrib['url']
2396 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2397 except IndexError as err:
2398 raise ExtractorError(u'Invalid manifest file')
2400 url_pr = compat_urllib_parse_urlparse(manifest_url)
2401 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2408 class XVideosIE(InfoExtractor):
2409 """Information extractor for xvideos.com"""
2411 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2412 IE_NAME = u'xvideos'
2414 def _real_extract(self, url):
2415 mobj = re.match(self._VALID_URL, url)
2417 raise ExtractorError(u'Invalid URL: %s' % url)
2418 video_id = mobj.group(1)
2420 webpage = self._download_webpage(url, video_id)
2422 self.report_extraction(video_id)
2426 mobj = re.search(r'flv_url=(.+?)&', webpage)
2428 raise ExtractorError(u'Unable to extract video url')
2429 video_url = compat_urllib_parse.unquote(mobj.group(1))
2433 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2435 raise ExtractorError(u'Unable to extract video title')
2436 video_title = mobj.group(1)
2439 # Extract video thumbnail
2440 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2442 raise ExtractorError(u'Unable to extract video thumbnail')
2443 video_thumbnail = mobj.group(0)
2449 'upload_date': None,
2450 'title': video_title,
2452 'thumbnail': video_thumbnail,
2453 'description': None,
2459 class SoundcloudIE(InfoExtractor):
2460 """Information extractor for soundcloud.com
2461 To access the media, the uid of the song and a stream token
2462 must be extracted from the page source and the script must make
2463 a request to media.soundcloud.com/crossdomain.xml. Then
2464 the media can be grabbed by requesting from an url composed
2465 of the stream token and uid
2468 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2469 IE_NAME = u'soundcloud'
2471 def report_resolve(self, video_id):
2472 """Report information extraction."""
2473 self.to_screen(u'%s: Resolving id' % video_id)
2475 def _real_extract(self, url):
2476 mobj = re.match(self._VALID_URL, url)
2478 raise ExtractorError(u'Invalid URL: %s' % url)
2480 # extract uploader (which is in the url)
2481 uploader = mobj.group(1)
2482 # extract simple title (uploader + slug of song title)
2483 slug_title = mobj.group(2)
2484 simple_title = uploader + u'-' + slug_title
2485 full_title = '%s/%s' % (uploader, slug_title)
2487 self.report_resolve(full_title)
2489 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2490 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2491 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2493 info = json.loads(info_json)
2494 video_id = info['id']
2495 self.report_extraction(full_title)
2497 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2498 stream_json = self._download_webpage(streams_url, full_title,
2499 u'Downloading stream definitions',
2500 u'unable to download stream definitions')
2502 streams = json.loads(stream_json)
2503 mediaURL = streams['http_mp3_128_url']
2504 upload_date = unified_strdate(info['created_at'])
2509 'uploader': info['user']['username'],
2510 'upload_date': upload_date,
2511 'title': info['title'],
2513 'description': info['description'],
2516 class SoundcloudSetIE(InfoExtractor):
2517 """Information extractor for soundcloud.com sets
2518 To access the media, the uid of the song and a stream token
2519 must be extracted from the page source and the script must make
2520 a request to media.soundcloud.com/crossdomain.xml. Then
2521 the media can be grabbed by requesting from an url composed
2522 of the stream token and uid
2525 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2526 IE_NAME = u'soundcloud:set'
2528 def report_resolve(self, video_id):
2529 """Report information extraction."""
2530 self.to_screen(u'%s: Resolving id' % video_id)
2532 def _real_extract(self, url):
2533 mobj = re.match(self._VALID_URL, url)
2535 raise ExtractorError(u'Invalid URL: %s' % url)
2537 # extract uploader (which is in the url)
2538 uploader = mobj.group(1)
2539 # extract simple title (uploader + slug of song title)
2540 slug_title = mobj.group(2)
2541 simple_title = uploader + u'-' + slug_title
2542 full_title = '%s/sets/%s' % (uploader, slug_title)
2544 self.report_resolve(full_title)
2546 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2547 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2548 info_json = self._download_webpage(resolv_url, full_title)
2551 info = json.loads(info_json)
2552 if 'errors' in info:
2553 for err in info['errors']:
2554 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2557 self.report_extraction(full_title)
2558 for track in info['tracks']:
2559 video_id = track['id']
2561 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2562 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2564 self.report_extraction(video_id)
2565 streams = json.loads(stream_json)
2566 mediaURL = streams['http_mp3_128_url']
2571 'uploader': track['user']['username'],
2572 'upload_date': unified_strdate(track['created_at']),
2573 'title': track['title'],
2575 'description': track['description'],
2580 class InfoQIE(InfoExtractor):
2581 """Information extractor for infoq.com"""
2582 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2584 def _real_extract(self, url):
2585 mobj = re.match(self._VALID_URL, url)
2587 raise ExtractorError(u'Invalid URL: %s' % url)
2589 webpage = self._download_webpage(url, video_id=url)
2590 self.report_extraction(url)
2593 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2595 raise ExtractorError(u'Unable to extract video url')
2596 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2597 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2600 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2602 raise ExtractorError(u'Unable to extract video title')
2603 video_title = mobj.group(1)
2605 # Extract description
2606 video_description = u'No description available.'
2607 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2608 if mobj is not None:
2609 video_description = mobj.group(1)
2611 video_filename = video_url.split('/')[-1]
2612 video_id, extension = video_filename.split('.')
2618 'upload_date': None,
2619 'title': video_title,
2620 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2622 'description': video_description,
2627 class MixcloudIE(InfoExtractor):
2628 """Information extractor for www.mixcloud.com"""
2630 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2631 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2632 IE_NAME = u'mixcloud'
2634 def report_download_json(self, file_id):
2635 """Report JSON download."""
2636 self.to_screen(u'Downloading json')
2638 def get_urls(self, jsonData, fmt, bitrate='best'):
2639 """Get urls from 'audio_formats' section in json"""
2642 bitrate_list = jsonData[fmt]
2643 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2644 bitrate = max(bitrate_list) # select highest
2646 url_list = jsonData[fmt][bitrate]
2647 except TypeError: # we have no bitrate info.
2648 url_list = jsonData[fmt]
2651 def check_urls(self, url_list):
2652 """Returns 1st active url from list"""
2653 for url in url_list:
2655 compat_urllib_request.urlopen(url)
2657 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2662 def _print_formats(self, formats):
2663 print('Available formats:')
2664 for fmt in formats.keys():
2665 for b in formats[fmt]:
2667 ext = formats[fmt][b][0]
2668 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2669 except TypeError: # we have no bitrate info
2670 ext = formats[fmt][0]
2671 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2674 def _real_extract(self, url):
2675 mobj = re.match(self._VALID_URL, url)
2677 raise ExtractorError(u'Invalid URL: %s' % url)
2678 # extract uploader & filename from url
2679 uploader = mobj.group(1).decode('utf-8')
2680 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2682 # construct API request
2683 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2684 # retrieve .json file with links to files
2685 request = compat_urllib_request.Request(file_url)
2687 self.report_download_json(file_url)
2688 jsonData = compat_urllib_request.urlopen(request).read()
2689 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2690 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2693 json_data = json.loads(jsonData)
2694 player_url = json_data['player_swf_url']
2695 formats = dict(json_data['audio_formats'])
2697 req_format = self._downloader.params.get('format', None)
2700 if self._downloader.params.get('listformats', None):
2701 self._print_formats(formats)
2704 if req_format is None or req_format == 'best':
2705 for format_param in formats.keys():
2706 url_list = self.get_urls(formats, format_param)
2708 file_url = self.check_urls(url_list)
2709 if file_url is not None:
2712 if req_format not in formats:
2713 raise ExtractorError(u'Format is not available')
2715 url_list = self.get_urls(formats, req_format)
2716 file_url = self.check_urls(url_list)
2717 format_param = req_format
2720 'id': file_id.decode('utf-8'),
2721 'url': file_url.decode('utf-8'),
2722 'uploader': uploader.decode('utf-8'),
2723 'upload_date': None,
2724 'title': json_data['name'],
2725 'ext': file_url.split('.')[-1].decode('utf-8'),
2726 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2727 'thumbnail': json_data['thumbnail_url'],
2728 'description': json_data['description'],
2729 'player_url': player_url.decode('utf-8'),
2732 class StanfordOpenClassroomIE(InfoExtractor):
2733 """Information extractor for Stanford's Open ClassRoom"""
2735 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2736 IE_NAME = u'stanfordoc'
2738 def _real_extract(self, url):
2739 mobj = re.match(self._VALID_URL, url)
2741 raise ExtractorError(u'Invalid URL: %s' % url)
2743 if mobj.group('course') and mobj.group('video'): # A specific video
2744 course = mobj.group('course')
2745 video = mobj.group('video')
2747 'id': course + '_' + video,
2749 'upload_date': None,
2752 self.report_extraction(info['id'])
2753 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2754 xmlUrl = baseUrl + video + '.xml'
2756 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2757 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2758 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2759 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2761 info['title'] = mdoc.findall('./title')[0].text
2762 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2764 raise ExtractorError(u'Invalid metadata XML file')
2765 info['ext'] = info['url'].rpartition('.')[2]
2767 elif mobj.group('course'): # A course page
2768 course = mobj.group('course')
2773 'upload_date': None,
2776 coursepage = self._download_webpage(url, info['id'],
2777 note='Downloading course info page',
2778 errnote='Unable to download course info page')
2780 m = re.search('<h1>([^<]+)</h1>', coursepage)
2782 info['title'] = unescapeHTML(m.group(1))
2784 info['title'] = info['id']
2786 m = re.search('<description>([^<]+)</description>', coursepage)
2788 info['description'] = unescapeHTML(m.group(1))
2790 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2793 'type': 'reference',
2794 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2798 for entry in info['list']:
2799 assert entry['type'] == 'reference'
2800 results += self.extract(entry['url'])
2804 'id': 'Stanford OpenClassroom',
2807 'upload_date': None,
2810 self.report_download_webpage(info['id'])
2811 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2813 rootpage = compat_urllib_request.urlopen(rootURL).read()
2814 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2815 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2817 info['title'] = info['id']
2819 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2822 'type': 'reference',
2823 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2828 for entry in info['list']:
2829 assert entry['type'] == 'reference'
2830 results += self.extract(entry['url'])
2833 class MTVIE(InfoExtractor):
2834 """Information extractor for MTV.com"""
2836 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2839 def _real_extract(self, url):
2840 mobj = re.match(self._VALID_URL, url)
2842 raise ExtractorError(u'Invalid URL: %s' % url)
2843 if not mobj.group('proto'):
2844 url = 'http://' + url
2845 video_id = mobj.group('videoid')
2847 webpage = self._download_webpage(url, video_id)
2849 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2851 raise ExtractorError(u'Unable to extract song name')
2852 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2853 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2855 raise ExtractorError(u'Unable to extract performer')
2856 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2857 video_title = performer + ' - ' + song_name
2859 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2861 raise ExtractorError(u'Unable to mtvn_uri')
2862 mtvn_uri = mobj.group(1)
2864 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2866 raise ExtractorError(u'Unable to extract content id')
2867 content_id = mobj.group(1)
2869 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2870 self.report_extraction(video_id)
2871 request = compat_urllib_request.Request(videogen_url)
2873 metadataXml = compat_urllib_request.urlopen(request).read()
2874 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2875 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2877 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2878 renditions = mdoc.findall('.//rendition')
2880 # For now, always pick the highest quality.
2881 rendition = renditions[-1]
2884 _,_,ext = rendition.attrib['type'].partition('/')
2885 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2886 video_url = rendition.find('./src').text
2888 raise ExtractorError('Invalid rendition field.')
2893 'uploader': performer,
2894 'upload_date': None,
2895 'title': video_title,
2903 class YoukuIE(InfoExtractor):
2904 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2907 nowTime = int(time.time() * 1000)
2908 random1 = random.randint(1000,1998)
2909 random2 = random.randint(1000,9999)
2911 return "%d%d%d" %(nowTime,random1,random2)
2913 def _get_file_ID_mix_string(self, seed):
2915 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2917 for i in range(len(source)):
2918 seed = (seed * 211 + 30031 ) % 65536
2919 index = math.floor(seed / 65536 * len(source) )
2920 mixed.append(source[int(index)])
2921 source.remove(source[int(index)])
2922 #return ''.join(mixed)
2925 def _get_file_id(self, fileId, seed):
2926 mixed = self._get_file_ID_mix_string(seed)
2927 ids = fileId.split('*')
2931 realId.append(mixed[int(ch)])
2932 return ''.join(realId)
2934 def _real_extract(self, url):
2935 mobj = re.match(self._VALID_URL, url)
2937 raise ExtractorError(u'Invalid URL: %s' % url)
2938 video_id = mobj.group('ID')
2940 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2942 jsondata = self._download_webpage(info_url, video_id)
2944 self.report_extraction(video_id)
2946 config = json.loads(jsondata)
2948 video_title = config['data'][0]['title']
2949 seed = config['data'][0]['seed']
2951 format = self._downloader.params.get('format', None)
2952 supported_format = list(config['data'][0]['streamfileids'].keys())
2954 if format is None or format == 'best':
2955 if 'hd2' in supported_format:
2960 elif format == 'worst':
2968 fileid = config['data'][0]['streamfileids'][format]
2969 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2970 except (UnicodeDecodeError, ValueError, KeyError):
2971 raise ExtractorError(u'Unable to extract info section')
2974 sid = self._gen_sid()
2975 fileid = self._get_file_id(fileid, seed)
2977 #column 8,9 of fileid represent the segment number
2978 #fileid[7:9] should be changed
2979 for index, key in enumerate(keys):
2981 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2982 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2985 'id': '%s_part%02d' % (video_id, index),
2986 'url': download_url,
2988 'upload_date': None,
2989 'title': video_title,
2992 files_info.append(info)
2997 class XNXXIE(InfoExtractor):
2998 """Information extractor for xnxx.com"""
3000 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3002 VIDEO_URL_RE = r'flv_url=(.*?)&'
3003 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3004 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3006 def _real_extract(self, url):
3007 mobj = re.match(self._VALID_URL, url)
3009 raise ExtractorError(u'Invalid URL: %s' % url)
3010 video_id = mobj.group(1)
3012 # Get webpage content
3013 webpage = self._download_webpage(url, video_id)
3015 result = re.search(self.VIDEO_URL_RE, webpage)
3017 raise ExtractorError(u'Unable to extract video url')
3018 video_url = compat_urllib_parse.unquote(result.group(1))
3020 result = re.search(self.VIDEO_TITLE_RE, webpage)
3022 raise ExtractorError(u'Unable to extract video title')
3023 video_title = result.group(1)
3025 result = re.search(self.VIDEO_THUMB_RE, webpage)
3027 raise ExtractorError(u'Unable to extract video thumbnail')
3028 video_thumbnail = result.group(1)
3034 'upload_date': None,
3035 'title': video_title,
3037 'thumbnail': video_thumbnail,
3038 'description': None,
3042 class GooglePlusIE(InfoExtractor):
3043 """Information extractor for plus.google.com."""
3045 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3046 IE_NAME = u'plus.google'
3048 def report_extract_entry(self, url):
3049 """Report downloading extry"""
3050 self.to_screen(u'Downloading entry: %s' % url)
3052 def report_date(self, upload_date):
3053 """Report downloading extry"""
3054 self.to_screen(u'Entry date: %s' % upload_date)
3056 def report_uploader(self, uploader):
3057 """Report downloading extry"""
3058 self.to_screen(u'Uploader: %s' % uploader)
3060 def report_title(self, video_title):
3061 """Report downloading extry"""
3062 self.to_screen(u'Title: %s' % video_title)
3064 def report_extract_vid_page(self, video_page):
3065 """Report information extraction."""
3066 self.to_screen(u'Extracting video page: %s' % video_page)
3068 def _real_extract(self, url):
3069 # Extract id from URL
3070 mobj = re.match(self._VALID_URL, url)
3072 raise ExtractorError(u'Invalid URL: %s' % url)
3074 post_url = mobj.group(0)
3075 video_id = mobj.group(1)
3077 video_extension = 'flv'
3079 # Step 1, Retrieve post webpage to extract further information
3080 self.report_extract_entry(post_url)
3081 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3083 # Extract update date
3085 pattern = 'title="Timestamp">(.*?)</a>'
3086 mobj = re.search(pattern, webpage)
3088 upload_date = mobj.group(1)
3089 # Convert timestring to a format suitable for filename
3090 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3091 upload_date = upload_date.strftime('%Y%m%d')
3092 self.report_date(upload_date)
3096 pattern = r'rel\="author".*?>(.*?)</a>'
3097 mobj = re.search(pattern, webpage)
3099 uploader = mobj.group(1)
3100 self.report_uploader(uploader)
3103 # Get the first line for title
3105 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3106 mobj = re.search(pattern, webpage)
3108 video_title = mobj.group(1)
3109 self.report_title(video_title)
3111 # Step 2, Stimulate clicking the image box to launch video
3112 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3113 mobj = re.search(pattern, webpage)
3115 self._downloader.report_error(u'unable to extract video page URL')
3117 video_page = mobj.group(1)
3118 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3119 self.report_extract_vid_page(video_page)
3122 # Extract video links on video page
3123 """Extract video links of all sizes"""
3124 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3125 mobj = re.findall(pattern, webpage)
3127 self._downloader.report_error(u'unable to extract video links')
3129 # Sort in resolution
3130 links = sorted(mobj)
3132 # Choose the lowest of the sort, i.e. highest resolution
3133 video_url = links[-1]
3134 # Only get the url. The resolution part in the tuple has no use anymore
3135 video_url = video_url[-1]
3136 # Treat escaped \u0026 style hex
3138 video_url = video_url.decode("unicode_escape")
3139 except AttributeError: # Python 3
3140 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3146 'uploader': uploader,
3147 'upload_date': upload_date,
3148 'title': video_title,
3149 'ext': video_extension,
3152 class NBAIE(InfoExtractor):
3153 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3156 def _real_extract(self, url):
3157 mobj = re.match(self._VALID_URL, url)
3159 raise ExtractorError(u'Invalid URL: %s' % url)
3161 video_id = mobj.group(1)
3162 if video_id.endswith('/index.html'):
3163 video_id = video_id[:-len('/index.html')]
3165 webpage = self._download_webpage(url, video_id)
3167 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3168 def _findProp(rexp, default=None):
3169 m = re.search(rexp, webpage)
3171 return unescapeHTML(m.group(1))
3175 shortened_video_id = video_id.rpartition('/')[2]
3176 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3178 'id': shortened_video_id,
3182 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3183 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3187 class JustinTVIE(InfoExtractor):
3188 """Information extractor for justin.tv and twitch.tv"""
3189 # TODO: One broadcast may be split into multiple videos. The key
3190 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3191 # starts at 1 and increases. Can we treat all parts as one video?
3193 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3195 (?P<channelid>[^/]+)|
3196 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3197 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3201 _JUSTIN_PAGE_LIMIT = 100
3202 IE_NAME = u'justin.tv'
3204 def report_download_page(self, channel, offset):
3205 """Report attempt to download a single page of videos."""
3206 self.to_screen(u'%s: Downloading video information from %d to %d' %
3207 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3209 # Return count of items, list of *valid* items
3210 def _parse_page(self, url, video_id):
3211 webpage = self._download_webpage(url, video_id,
3212 u'Downloading video info JSON',
3213 u'unable to download video info JSON')
3215 response = json.loads(webpage)
3216 if type(response) != list:
3217 error_text = response.get('error', 'unknown error')
3218 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3220 for clip in response:
3221 video_url = clip['video_file_url']
3223 video_extension = os.path.splitext(video_url)[1][1:]
3224 video_date = re.sub('-', '', clip['start_time'][:10])
3225 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3226 video_id = clip['id']
3227 video_title = clip.get('title', video_id)
3231 'title': video_title,
3232 'uploader': clip.get('channel_name', video_uploader_id),
3233 'uploader_id': video_uploader_id,
3234 'upload_date': video_date,
3235 'ext': video_extension,
3237 return (len(response), info)
3239 def _real_extract(self, url):
3240 mobj = re.match(self._VALID_URL, url)
3242 raise ExtractorError(u'invalid URL: %s' % url)
3244 api_base = 'http://api.justin.tv'
3246 if mobj.group('channelid'):
3248 video_id = mobj.group('channelid')
3249 api = api_base + '/channel/archives/%s.json' % video_id
3250 elif mobj.group('chapterid'):
3251 chapter_id = mobj.group('chapterid')
3253 webpage = self._download_webpage(url, chapter_id)
3254 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3256 raise ExtractorError(u'Cannot find archive of a chapter')
3257 archive_id = m.group(1)
3259 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3260 chapter_info_xml = self._download_webpage(api, chapter_id,
3261 note=u'Downloading chapter information',
3262 errnote=u'Chapter information download failed')
3263 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3264 for a in doc.findall('.//archive'):
3265 if archive_id == a.find('./id').text:
3268 raise ExtractorError(u'Could not find chapter in chapter information')
3270 video_url = a.find('./video_file_url').text
3271 video_ext = video_url.rpartition('.')[2] or u'flv'
3273 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3274 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3275 note='Downloading chapter metadata',
3276 errnote='Download of chapter metadata failed')
3277 chapter_info = json.loads(chapter_info_json)
3279 bracket_start = int(doc.find('.//bracket_start').text)
3280 bracket_end = int(doc.find('.//bracket_end').text)
3282 # TODO determine start (and probably fix up file)
3283 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3284 #video_url += u'?start=' + TODO:start_timestamp
3285 # bracket_start is 13290, but we want 51670615
3286 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3287 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3290 'id': u'c' + chapter_id,
3293 'title': chapter_info['title'],
3294 'thumbnail': chapter_info['preview'],
3295 'description': chapter_info['description'],
3296 'uploader': chapter_info['channel']['display_name'],
3297 'uploader_id': chapter_info['channel']['name'],
3301 video_id = mobj.group('videoid')
3302 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3304 self.report_extraction(video_id)
3308 limit = self._JUSTIN_PAGE_LIMIT
3311 self.report_download_page(video_id, offset)
3312 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3313 page_count, page_info = self._parse_page(page_url, video_id)
3314 info.extend(page_info)
3315 if not paged or page_count != limit:
3320 class FunnyOrDieIE(InfoExtractor):
3321 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3323 def _real_extract(self, url):
3324 mobj = re.match(self._VALID_URL, url)
3326 raise ExtractorError(u'invalid URL: %s' % url)
3328 video_id = mobj.group('id')
3329 webpage = self._download_webpage(url, video_id)
3331 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3333 self._downloader.report_error(u'unable to find video information')
3334 video_url = unescapeHTML(m.group('url'))
3336 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3338 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3340 self._downloader.report_error(u'Cannot find video title')
3341 title = clean_html(m.group('title'))
3343 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3345 desc = unescapeHTML(m.group('desc'))
3354 'description': desc,
3358 class SteamIE(InfoExtractor):
3359 _VALID_URL = r"""http://store\.steampowered\.com/
3361 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3363 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3367 def suitable(cls, url):
3368 """Receives a URL and returns True if suitable for this IE."""
3369 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3371 def _real_extract(self, url):
3372 m = re.match(self._VALID_URL, url, re.VERBOSE)
3373 gameID = m.group('gameID')
3374 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3375 self.report_age_confirmation()
3376 webpage = self._download_webpage(videourl, gameID)
3377 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3379 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3380 mweb = re.finditer(urlRE, webpage)
3381 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3382 titles = re.finditer(namesRE, webpage)
3383 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3384 thumbs = re.finditer(thumbsRE, webpage)
3386 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3387 video_id = vid.group('videoID')
3388 title = vtitle.group('videoName')
3389 video_url = vid.group('videoURL')
3390 video_thumb = thumb.group('thumbnail')
3392 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3397 'title': unescapeHTML(title),
3398 'thumbnail': video_thumb
3401 return [self.playlist_result(videos, gameID, game_title)]
3403 class UstreamIE(InfoExtractor):
3404 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3405 IE_NAME = u'ustream'
3407 def _real_extract(self, url):
3408 m = re.match(self._VALID_URL, url)
3409 video_id = m.group('videoID')
3410 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3411 webpage = self._download_webpage(url, video_id)
3412 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3413 title = m.group('title')
3414 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3415 uploader = m.group('uploader')
3421 'uploader': uploader
3425 class WorldStarHipHopIE(InfoExtractor):
3426 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3427 IE_NAME = u'WorldStarHipHop'
3429 def _real_extract(self, url):
3430 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3432 m = re.match(self._VALID_URL, url)
3433 video_id = m.group('id')
3435 webpage_src = self._download_webpage(url, video_id)
3437 mobj = re.search(_src_url, webpage_src)
3439 if mobj is not None:
3440 video_url = mobj.group(1)
3441 if 'mp4' in video_url:
3446 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3448 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3451 raise ExtractorError(u'Cannot determine title')
3452 title = mobj.group(1)
3454 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3455 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3456 if mobj is not None:
3457 thumbnail = mobj.group(1)
3459 _title = r"""candytitles.*>(.*)</span>"""
3460 mobj = re.search(_title, webpage_src)
3461 if mobj is not None:
3462 title = mobj.group(1)
3469 'thumbnail' : thumbnail,
3474 class RBMARadioIE(InfoExtractor):
3475 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3477 def _real_extract(self, url):
3478 m = re.match(self._VALID_URL, url)
3479 video_id = m.group('videoID')
3481 webpage = self._download_webpage(url, video_id)
3482 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3484 raise ExtractorError(u'Cannot find metadata')
3485 json_data = m.group(1)
3488 data = json.loads(json_data)
3489 except ValueError as e:
3490 raise ExtractorError(u'Invalid JSON: ' + str(e))
3492 video_url = data['akamai_url'] + '&cbr=256'
3493 url_parts = compat_urllib_parse_urlparse(video_url)
3494 video_ext = url_parts.path.rpartition('.')[2]
3499 'title': data['title'],
3500 'description': data.get('teaser_text'),
3501 'location': data.get('country_of_origin'),
3502 'uploader': data.get('host', {}).get('name'),
3503 'uploader_id': data.get('host', {}).get('slug'),
3504 'thumbnail': data.get('image', {}).get('large_url_2x'),
3505 'duration': data.get('duration'),
3510 class YouPornIE(InfoExtractor):
3511 """Information extractor for youporn.com."""
3512 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3514 def _print_formats(self, formats):
3515 """Print all available formats"""
3516 print(u'Available formats:')
3517 print(u'ext\t\tformat')
3518 print(u'---------------------------------')
3519 for format in formats:
3520 print(u'%s\t\t%s' % (format['ext'], format['format']))
3522 def _specific(self, req_format, formats):
3524 if(x["format"]==req_format):
3528 def _real_extract(self, url):
3529 mobj = re.match(self._VALID_URL, url)
3531 raise ExtractorError(u'Invalid URL: %s' % url)
3533 video_id = mobj.group('videoid')
3535 req = compat_urllib_request.Request(url)
3536 req.add_header('Cookie', 'age_verified=1')
3537 webpage = self._download_webpage(req, video_id)
3539 # Get the video title
3540 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3542 raise ExtractorError(u'Unable to extract video title')
3543 video_title = result.group('title').strip()
3545 # Get the video date
3546 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3548 self._downloader.report_warning(u'unable to extract video date')
3551 upload_date = unified_strdate(result.group('date').strip())
3553 # Get the video uploader
3554 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3556 self._downloader.report_warning(u'unable to extract uploader')
3557 video_uploader = None
3559 video_uploader = result.group('uploader').strip()
3560 video_uploader = clean_html( video_uploader )
3562 # Get all of the formats available
3563 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3564 result = re.search(DOWNLOAD_LIST_RE, webpage)
3566 raise ExtractorError(u'Unable to extract download list')
3567 download_list_html = result.group('download_list').strip()
3569 # Get all of the links from the page
3570 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3571 links = re.findall(LINK_RE, download_list_html)
3572 if(len(links) == 0):
3573 raise ExtractorError(u'ERROR: no known formats available for video')
3575 self.to_screen(u'Links found: %d' % len(links))
3580 # A link looks like this:
3581 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3582 # A path looks like this:
3583 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3584 video_url = unescapeHTML( link )
3585 path = compat_urllib_parse_urlparse( video_url ).path
3586 extension = os.path.splitext( path )[1][1:]
3587 format = path.split('/')[4].split('_')[:2]
3590 format = "-".join( format )
3591 title = u'%s-%s-%s' % (video_title, size, bitrate)
3596 'uploader': video_uploader,
3597 'upload_date': upload_date,
3602 'description': None,
3606 if self._downloader.params.get('listformats', None):
3607 self._print_formats(formats)
3610 req_format = self._downloader.params.get('format', None)
3611 self.to_screen(u'Format: %s' % req_format)
3613 if req_format is None or req_format == 'best':
3615 elif req_format == 'worst':
3616 return [formats[-1]]
3617 elif req_format in ('-1', 'all'):
3620 format = self._specific( req_format, formats )
3622 raise ExtractorError(u'Requested format not available')
3627 class PornotubeIE(InfoExtractor):
3628 """Information extractor for pornotube.com."""
3629 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3631 def _real_extract(self, url):
3632 mobj = re.match(self._VALID_URL, url)
3634 raise ExtractorError(u'Invalid URL: %s' % url)
3636 video_id = mobj.group('videoid')
3637 video_title = mobj.group('title')
3639 # Get webpage content
3640 webpage = self._download_webpage(url, video_id)
3643 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3644 result = re.search(VIDEO_URL_RE, webpage)
3646 raise ExtractorError(u'Unable to extract video url')
3647 video_url = compat_urllib_parse.unquote(result.group('url'))
3649 #Get the uploaded date
3650 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3651 result = re.search(VIDEO_UPLOADED_RE, webpage)
3653 raise ExtractorError(u'Unable to extract video title')
3654 upload_date = unified_strdate(result.group('date'))
3656 info = {'id': video_id,
3659 'upload_date': upload_date,
3660 'title': video_title,
3666 class YouJizzIE(InfoExtractor):
3667 """Information extractor for youjizz.com."""
3668 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3670 def _real_extract(self, url):
3671 mobj = re.match(self._VALID_URL, url)
3673 raise ExtractorError(u'Invalid URL: %s' % url)
3675 video_id = mobj.group('videoid')
3677 # Get webpage content
3678 webpage = self._download_webpage(url, video_id)
3680 # Get the video title
3681 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3683 raise ExtractorError(u'ERROR: unable to extract video title')
3684 video_title = result.group('title').strip()
3686 # Get the embed page
3687 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3689 raise ExtractorError(u'ERROR: unable to extract embed page')
3691 embed_page_url = result.group(0).strip()
3692 video_id = result.group('videoid')
3694 webpage = self._download_webpage(embed_page_url, video_id)
3697 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3699 raise ExtractorError(u'ERROR: unable to extract video url')
3700 video_url = result.group('source')
3702 info = {'id': video_id,
3704 'title': video_title,
3707 'player_url': embed_page_url}
3711 class EightTracksIE(InfoExtractor):
3713 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3715 def _real_extract(self, url):
3716 mobj = re.match(self._VALID_URL, url)
3718 raise ExtractorError(u'Invalid URL: %s' % url)
3719 playlist_id = mobj.group('id')
3721 webpage = self._download_webpage(url, playlist_id)
3723 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3725 raise ExtractorError(u'Cannot find trax information')
3726 json_like = m.group(1)
3727 data = json.loads(json_like)
3729 session = str(random.randint(0, 1000000000))
3731 track_count = data['tracks_count']
3732 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3733 next_url = first_url
3735 for i in itertools.count():
3736 api_json = self._download_webpage(next_url, playlist_id,
3737 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3738 errnote=u'Failed to download song information')
3739 api_data = json.loads(api_json)
3740 track_data = api_data[u'set']['track']
3742 'id': track_data['id'],
3743 'url': track_data['track_file_stream_url'],
3744 'title': track_data['performer'] + u' - ' + track_data['name'],
3745 'raw_title': track_data['name'],
3746 'uploader_id': data['user']['login'],
3750 if api_data['set']['at_last_track']:
3752 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3755 class KeekIE(InfoExtractor):
3756 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3759 def _real_extract(self, url):
3760 m = re.match(self._VALID_URL, url)
3761 video_id = m.group('videoID')
3762 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3763 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3764 webpage = self._download_webpage(url, video_id)
3765 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3766 title = unescapeHTML(m.group('title'))
3767 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3768 uploader = clean_html(m.group('uploader'))
3774 'thumbnail': thumbnail,
3775 'uploader': uploader
3779 class TEDIE(InfoExtractor):
3780 _VALID_URL=r'''http://www\.ted\.com/
3782 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3784 ((?P<type_talk>talks)) # We have a simple talk
3786 (/lang/(.*?))? # The url may contain the language
3787 /(?P<name>\w+) # Here goes the name and then ".html"
3791 def suitable(cls, url):
3792 """Receives a URL and returns True if suitable for this IE."""
3793 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3795 def _real_extract(self, url):
3796 m=re.match(self._VALID_URL, url, re.VERBOSE)
3797 if m.group('type_talk'):
3798 return [self._talk_info(url)]
3800 playlist_id=m.group('playlist_id')
3801 name=m.group('name')
3802 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3803 return [self._playlist_videos_info(url,name,playlist_id)]
3805 def _talk_video_link(self,mediaSlug):
3806 '''Returns the video link for that mediaSlug'''
3807 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3809 def _playlist_videos_info(self,url,name,playlist_id=0):
3810 '''Returns the videos of the playlist'''
3812 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3813 ([.\s]*?)data-playlist_item_id="(\d+)"
3814 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3816 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3817 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3818 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3819 m_names=re.finditer(video_name_RE,webpage)
3821 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3822 m_playlist = re.search(playlist_RE, webpage)
3823 playlist_title = m_playlist.group('playlist_title')
3825 playlist_entries = []
3826 for m_video, m_name in zip(m_videos,m_names):
3827 video_id=m_video.group('video_id')
3828 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3829 playlist_entries.append(self.url_result(talk_url, 'TED'))
3830 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3832 def _talk_info(self, url, video_id=0):
3833 """Return the video for the talk in the url"""
3834 m=re.match(self._VALID_URL, url,re.VERBOSE)
3835 videoName=m.group('name')
3836 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3837 # If the url includes the language we get the title translated
3838 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3839 title=re.search(title_RE, webpage).group('title')
3840 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3841 "id":(?P<videoID>[\d]+).*?
3842 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3843 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3844 thumb_match=re.search(thumb_RE,webpage)
3845 info_match=re.search(info_RE,webpage,re.VERBOSE)
3846 video_id=info_match.group('videoID')
3847 mediaSlug=info_match.group('mediaSlug')
3848 video_url=self._talk_video_link(mediaSlug)
3854 'thumbnail': thumb_match.group('thumbnail')
3858 class MySpassIE(InfoExtractor):
3859 _VALID_URL = r'http://www.myspass.de/.*'
3861 def _real_extract(self, url):
3862 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3864 # video id is the last path element of the URL
3865 # usually there is a trailing slash, so also try the second but last
3866 url_path = compat_urllib_parse_urlparse(url).path
3867 url_parent_path, video_id = os.path.split(url_path)
3869 _, video_id = os.path.split(url_parent_path)
3872 metadata_url = META_DATA_URL_TEMPLATE % video_id
3873 metadata_text = self._download_webpage(metadata_url, video_id)
3874 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3876 # extract values from metadata
3877 url_flv_el = metadata.find('url_flv')
3878 if url_flv_el is None:
3879 raise ExtractorError(u'Unable to extract download url')
3880 video_url = url_flv_el.text
3881 extension = os.path.splitext(video_url)[1][1:]
3882 title_el = metadata.find('title')
3883 if title_el is None:
3884 raise ExtractorError(u'Unable to extract title')
3885 title = title_el.text
3886 format_id_el = metadata.find('format_id')
3887 if format_id_el is None:
3890 format = format_id_el.text
3891 description_el = metadata.find('description')
3892 if description_el is not None:
3893 description = description_el.text
3896 imagePreview_el = metadata.find('imagePreview')
3897 if imagePreview_el is not None:
3898 thumbnail = imagePreview_el.text
3907 'thumbnail': thumbnail,
3908 'description': description
3912 class SpiegelIE(InfoExtractor):
3913 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3915 def _real_extract(self, url):
3916 m = re.match(self._VALID_URL, url)
3917 video_id = m.group('videoID')
3919 webpage = self._download_webpage(url, video_id)
3920 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3922 raise ExtractorError(u'Cannot find title')
3923 video_title = unescapeHTML(m.group(1))
3925 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3926 xml_code = self._download_webpage(xml_url, video_id,
3927 note=u'Downloading XML', errnote=u'Failed to download XML')
3929 idoc = xml.etree.ElementTree.fromstring(xml_code)
3930 last_type = idoc[-1]
3931 filename = last_type.findall('./filename')[0].text
3932 duration = float(last_type.findall('./duration')[0].text)
3934 video_url = 'http://video2.spiegel.de/flash/' + filename
3935 video_ext = filename.rpartition('.')[2]
3940 'title': video_title,
3941 'duration': duration,
3945 class LiveLeakIE(InfoExtractor):
3947 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3948 IE_NAME = u'liveleak'
3950 def _real_extract(self, url):
3951 mobj = re.match(self._VALID_URL, url)
3953 raise ExtractorError(u'Invalid URL: %s' % url)
3955 video_id = mobj.group('video_id')
3957 webpage = self._download_webpage(url, video_id)
3959 m = re.search(r'file: "(.*?)",', webpage)
3961 raise ExtractorError(u'Unable to find video url')
3962 video_url = m.group(1)
3964 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3966 self._downloader.report_error(u'Cannot find video title')
3967 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3969 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3971 desc = unescapeHTML(m.group('desc'))
3975 m = re.search(r'By:.*?(\w+)</a>', webpage)
3977 uploader = clean_html(m.group(1))
3986 'description': desc,
3987 'uploader': uploader
3992 class ARDIE(InfoExtractor):
3993 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3994 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3995 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3997 def _real_extract(self, url):
3998 # determine video id from url
3999 m = re.match(self._VALID_URL, url)
4001 numid = re.search(r'documentId=([0-9]+)', url)
4003 video_id = numid.group(1)
4005 video_id = m.group('video_id')
4007 # determine title and media streams from webpage
4008 html = self._download_webpage(url, video_id)
4009 title = re.search(self._TITLE, html).group('title')
4010 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4012 assert '"fsk"' in html
4013 raise ExtractorError(u'This video is only available after 8:00 pm')
4015 # choose default media type and highest quality for now
4016 stream = max([s for s in streams if int(s["media_type"]) == 0],
4017 key=lambda s: int(s["quality"]))
4019 # there's two possibilities: RTMP stream or HTTP download
4020 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4021 if stream['rtmp_url']:
4022 self.to_screen(u'RTMP download detected')
4023 assert stream['video_url'].startswith('mp4:')
4024 info["url"] = stream["rtmp_url"]
4025 info["play_path"] = stream['video_url']
4027 assert stream["video_url"].endswith('.mp4')
4028 info["url"] = stream["video_url"]
4031 class TumblrIE(InfoExtractor):
4032 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4034 def _real_extract(self, url):
4035 m_url = re.match(self._VALID_URL, url)
4036 video_id = m_url.group('id')
4037 blog = m_url.group('blog_name')
4039 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4040 webpage = self._download_webpage(url, video_id)
4042 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4043 video = re.search(re_video, webpage)
4045 self.to_screen("No video founded")
4047 video_url = video.group('video_url')
4048 ext = video.group('ext')
4050 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4051 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4053 # The only place where you can get a title, it's not complete,
4054 # but searching in other places doesn't work for all videos
4055 re_title = r'<title>(?P<title>.*?)</title>'
4056 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4058 return [{'id': video_id,
4065 class BandcampIE(InfoExtractor):
4066 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4068 def _real_extract(self, url):
4069 mobj = re.match(self._VALID_URL, url)
4070 title = mobj.group('title')
4071 webpage = self._download_webpage(url, title)
4072 # We get the link to the free download page
4073 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4074 if m_download is None:
4075 raise ExtractorError(u'No free songs founded')
4077 download_link = m_download.group(1)
4078 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4079 webpage, re.MULTILINE|re.DOTALL).group('id')
4081 download_webpage = self._download_webpage(download_link, id,
4082 'Downloading free downloads page')
4083 # We get the dictionary of the track from some javascrip code
4084 info = re.search(r'items: (.*?),$',
4085 download_webpage, re.MULTILINE).group(1)
4086 info = json.loads(info)[0]
4087 # We pick mp3-320 for now, until format selection can be easily implemented.
4088 mp3_info = info[u'downloads'][u'mp3-320']
4089 # If we try to use this url it says the link has expired
4090 initial_url = mp3_info[u'url']
4091 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4092 m_url = re.match(re_url, initial_url)
4093 #We build the url we will use to get the final track url
4094 # This url is build in Bandcamp in the script download_bunde_*.js
4095 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4096 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4097 # If we could correctly generate the .rand field the url would be
4098 #in the "download_url" key
4099 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4101 track_info = {'id':id,
4102 'title' : info[u'title'],
4105 'thumbnail' : info[u'thumb_url'],
4106 'uploader' : info[u'artist']
4111 class RedTubeIE(InfoExtractor):
4112 """Information Extractor for redtube"""
4113 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4115 def _real_extract(self,url):
4116 mobj = re.match(self._VALID_URL, url)
4118 raise ExtractorError(u'Invalid URL: %s' % url)
4120 video_id = mobj.group('id')
4121 video_extension = 'mp4'
4122 webpage = self._download_webpage(url, video_id)
4123 self.report_extraction(video_id)
4124 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4127 raise ExtractorError(u'Unable to extract media URL')
4129 video_url = mobj.group(1)
4130 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4132 raise ExtractorError(u'Unable to extract title')
4133 video_title = mobj.group(1)
4138 'ext': video_extension,
4139 'title': video_title,
4143 def gen_extractors():
4144 """ Return a list of an instance of every supported extractor.
4145 The order does matter; the first extractor matched is the one handling the URL.
4148 YoutubePlaylistIE(),
4173 StanfordOpenClassroomIE(),
4183 WorldStarHipHopIE(),
4202 def get_info_extractor(ie_name):
4203 """Returns the info extractor class with the given ie_name"""
4204 return globals()[ie_name+'IE']