2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this chunk is a line-sampled excerpt — each line starts with its
# original file line number, many intermediate lines (try:, decorators, dict
# bodies) are missing, and indentation has been stripped. Only comments and
# docstrings are touched below; every code token is left byte-identical.
# InfoExtractor: abstract base class for all site-specific extractors. Concrete
# subclasses override _real_initialize()/_real_extract() and set _VALID_URL.
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
# Stores the downloader via set_downloader(); original line 74 (between 73 and
# 75) is missing from this sample — presumably extra state init; TODO confirm.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
# NOTE(review): original line 77 (likely a @classmethod decorator, given the
# `cls` parameter) is missing from this sample — confirm against the full file.
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
# NOTE(review): the `def` line for this property/method (original lines 81-83)
# is missing; only its docstring survives here.
84 """Getter method for _WORKING."""
# Public initialize() entry point; its `def` line is missing from this sample.
# Delegates to the subclass hook _real_initialize().
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
# Public extraction entry point; delegates to the subclass hook.
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
# Attaches the FileDownloader used for output/warnings/errors throughout.
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
# Default hooks: no-ops to be overridden by subclasses (bodies sampled away).
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# IE_NAME derivation: strips the trailing "IE" from the class name; the
# enclosing `def`/@property lines (original 109-111) are missing here.
112 return type(self).__name__[:-2]
# Opens url_or_request and returns the raw response handle. The `if note is
# None:` and `try:` lines (original 116/120) are missing from this sample.
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 self.report_download_webpage(video_id)
118 elif note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
# Re-raises network failures as ExtractorError, preserving the traceback.
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
# Downloads a page and decodes it using the charset from the Content-Type
# header (the fallback-encoding branch around original lines 132-135 is
# missing from this sample — presumably defaults to utf-8; TODO confirm).
127 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns a tuple (page content as string, URL handle) """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
# Debug aid: with --dump-intermediate-pages, echoes the page base64-encoded.
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
# 'replace' keeps going on undecodable bytes instead of raising.
145 content = webpage_bytes.decode(encoding, 'replace')
146 return (content, urlh)
# Convenience wrapper: page content only, discarding the URL handle.
148 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
149 """ Returns the data of the page as a string """
150 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
# All user-visible output funnels through the downloader, tagged with IE_NAME.
152 def to_screen(self, msg):
153 """Print msg to screen, prefixing it with '[ie_name]'"""
154 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
156 def report_extraction(self, id_or_name):
157 """Report information extraction."""
158 self.to_screen(u'%s: Extracting information' % id_or_name)
160 def report_download_webpage(self, video_id):
161 """Report webpage download."""
162 self.to_screen(u'%s: Downloading webpage' % video_id)
164 def report_age_confirmation(self):
165 """Report attempt to confirm age."""
166 self.to_screen(u'Confirming age')
# Methods for following #608.
# They set the correct value of the '_type' key.
170 def video_result(self, video_info):
171 """Returns a video"""
172 video_info['_type'] = 'video'
174 def url_result(self, url, ie=None):
175 """Returns a url that points to a page that should be processed"""
176 #TODO: ie should be the class used for getting the info
# Dict literal continues on missing lines (original 178-180).
177 video_info = {'_type': 'url',
# Builds a playlist result dict; id/title are set only when provided (the
# guarding `if` lines at original 185/187 are missing from this sample).
181 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
182 """Returns a playlist"""
183 video_info = {'_type': 'playlist',
186 video_info['id'] = playlist_id
188 video_info['title'] = playlist_title
# NOTE(review): line-sampled excerpt (embedded original line numbers, missing
# lines, stripped indentation). Only comments/docstrings are changed below.
# YoutubeIE: extractor for youtube.com — handles login, age-gating, subtitle
# download, and itag-based format selection.
192 class YoutubeIE(InfoExtractor):
193 """Information extractor for youtube.com."""
# Fragments of the verbose _VALID_URL regex (its opening line, original
# 194-196, is missing from this sample).
197 (?:https?://)? # http(s):// (optional)
198 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
199 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
200 (?:.*?\#/)? # handle anchor (#/) redirect urls
201 (?: # the various things that can precede the ID:
202 (?:(?:v|embed|e)/) # v/ or embed/ or e/
203 |(?: # or the v= param in all its forms
204 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
205 (?:\?|\#!?) # the params delimiter ? or # or #!
206 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 )? # optional -> youtube.com/xxxx is OK
210 )? # all until now is optional -> you can pass the naked ID
211 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
212 (?(1).+)? # if we found the ID, everything can follow
# Endpoints used for language selection, login, and age verification.
214 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
215 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
216 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
218 _NETRC_MACHINE = 'youtube'
219 # Listed in order of quality
220 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
221 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map (most entries sampled away).
222 _video_extensions = {
228 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display map (entries sampled away).
234 _video_dimensions = {
# Defers playlist URLs to YoutubePlaylistIE; the @classmethod decorator line
# (original 252) is missing from this sample.
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
255 if YoutubePlaylistIE.suitable(url): return False
256 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
258 def report_lang(self):
259 """Report attempt to set language."""
260 self.to_screen(u'Setting language')
262 def report_login(self):
263 """Report attempt to log in."""
264 self.to_screen(u'Logging in')
266 def report_video_webpage_download(self, video_id):
267 """Report attempt to download video webpage."""
268 self.to_screen(u'%s: Downloading video webpage' % video_id)
270 def report_video_info_webpage_download(self, video_id):
271 """Report attempt to download video info webpage."""
272 self.to_screen(u'%s: Downloading video info webpage' % video_id)
274 def report_video_subtitles_download(self, video_id):
275 """Report attempt to check the available subtitles."""
276 self.to_screen(u'%s: Checking available subtitles' % video_id)
278 def report_video_subtitles_request(self, video_id, sub_lang, format):
279 """Report attempt to download subtitles for a language/format."""
280 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
282 def report_video_subtitles_available(self, video_id, sub_lang_list):
283 """Report available subtitles."""
284 sub_lang = ",".join(list(sub_lang_list.keys()))
285 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
287 def report_information_extraction(self, video_id):
288 """Report attempt to extract video information."""
289 self.to_screen(u'%s: Extracting video information' % video_id)
291 def report_unavailable_format(self, video_id, format):
292 """Report that a requested format is not available."""
293 self.to_screen(u'%s: Format %s not available' % (video_id, format))
295 def report_rtmp_download(self):
296 """Indicate the download will use the RTMP protocol."""
297 self.to_screen(u'RTMP download detected')
# Queries the timedtext list endpoint. On failure returns an (error, None)
# tuple; on success presumably returns the {lang_code: name} dict built below
# (the return line, original 310-311, is missing) — TODO confirm.
299 def _get_available_subtitles(self, video_id):
300 self.report_video_subtitles_download(video_id)
301 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
303 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
305 return (u'unable to download video subtitles: %s' % compat_str(err), None)
306 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
# Keyed by lang_code, valued by track name.
307 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
308 if not sub_lang_list:
309 return (u'video doesn\'t have subtitles', None)
312 def _list_available_subtitles(self, video_id):
313 sub_lang_list = self._get_available_subtitles(video_id)
314 self.report_video_subtitles_available(video_id, sub_lang_list)
# Fetches one subtitle track from the timedtext API. Returns a 3-tuple
# (error_message, sub_lang, sub); error_message is None on success.
316 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
319 (error_message, sub_lang, sub)
321 self.report_video_subtitles_request(video_id, sub_lang, format)
# urlencode params dict continues on missing lines (original 323-327).
322 params = compat_urllib_parse.urlencode({
328 url = 'http://www.youtube.com/api/timedtext?' + params
330 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
# Empty body means the API returned nothing for this lang/format.
334 return (u'Did not fetch video subtitles', None, None)
335 return (None, sub_lang, sub)
# Picks one language (user choice > 'en' > first available) and downloads it.
337 def _extract_subtitle(self, video_id):
339 Return a list with a tuple:
340 [(error_message, sub_lang, sub)]
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
346 if self._downloader.params.get('subtitleslang', False):
347 sub_lang = self._downloader.params.get('subtitleslang')
348 elif 'en' in sub_lang_list:
# Fallback branch: first available language (the `else:` line is missing).
351 sub_lang = list(sub_lang_list.keys())[0]
352 if not sub_lang in sub_lang_list:
353 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
355 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Downloads every available subtitle track, accumulating 3-tuples.
358 def _extract_all_subtitles(self, video_id):
359 sub_lang_list = self._get_available_subtitles(video_id)
360 sub_format = self._downloader.params.get('subtitlesformat')
361 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
362 return [(sub_lang_list[0], None, None)]
364 for sub_lang in sub_lang_list:
365 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
366 subtitles.append(subtitle)
# Prints "itag : ext [WxH]" for each format (the loop line is missing).
369 def _print_formats(self, formats):
370 print('Available formats:')
372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Initialization: resolve credentials (explicit > .netrc), set language
# cookie, log in, and confirm age. Many try:/assignment lines are sampled out.
374 def _real_initialize(self):
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set language: best-effort, warnings only on failure.
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
# Log in: fetch the login page to scrape the GALX and dsh hidden fields.
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Fragments of the login_form_strs dict (most entries sampled away).
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# The login form re-appearing in the response means authentication failed.
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm age via the verify_age form; failure here is a hard error.
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
475 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
# Pulls the video ID out of a URL via group 2 of _VALID_URL.
478 def _extract_id(self, url):
479 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
481 self._downloader.report_error(u'invalid URL: %s' % url)
483 video_id = mobj.group(2)
# Main extraction pipeline (many lines sampled out of this method).
486 def _real_extract(self, url):
487 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
488 mobj = re.search(self._NEXT_URL_RE, url)
490 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
491 video_id = self._extract_id(url)
# Get video webpage
494 self.report_video_webpage_download(video_id)
495 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
496 request = compat_urllib_request.Request(url)
498 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
499 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
500 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
503 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
505 # Attempt to extract SWF player URL
506 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Unescape the backslash-escaped URL from the embedded JSON.
508 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Get video info: try several &el= values until one returns a token.
513 self.report_video_info_webpage_download(video_id)
514 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
515 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
516 % (video_id, el_type))
517 video_info_webpage = self._download_webpage(video_info_url, video_id,
519 errnote='unable to download video info webpage')
520 video_info = compat_parse_qs(video_info_webpage)
521 if 'token' in video_info:
523 if 'token' not in video_info:
524 if 'reason' in video_info:
525 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
527 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
530 # Check for "rental" videos
531 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
532 self._downloader.report_error(u'"rental" videos not supported')
535 # Start extracting information
536 self.report_information_extraction(video_id)
# uploader
539 if 'author' not in video_info:
540 self._downloader.report_error(u'unable to extract uploader name')
542 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id: scraped from the watch page; only a warning if absent.
545 video_uploader_id = None
546 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
548 video_uploader_id = mobj.group(1)
550 self._downloader.report_warning(u'unable to extract uploader nickname')
# title
553 if 'title' not in video_info:
554 self._downloader.report_error(u'unable to extract video title')
556 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail
559 if 'thumbnail_url' not in video_info:
560 self._downloader.report_warning(u'unable to extract video thumbnail')
562 else: # don't panic if we can't find it
563 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: normalized via unified_strdate after stripping separators.
567 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
569 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
570 upload_date = unified_strdate(upload_date)
# description: page element first, then the <meta> fallback, else empty.
573 video_description = get_element_by_id("eow-description", video_webpage)
574 if video_description:
575 video_description = clean_html(video_description)
577 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
579 video_description = unescapeHTML(fd_mobj.group(1))
581 video_description = u''
# subtitles (write one / write all / just list, per downloader params)
584 video_subtitles = None
586 if self._downloader.params.get('writesubtitles', False):
587 video_subtitles = self._extract_subtitle(video_id)
589 (sub_error, sub_lang, sub) = video_subtitles[0]
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('allsubtitles', False):
594 video_subtitles = self._extract_all_subtitles(video_id)
595 for video_subtitle in video_subtitles:
596 (sub_error, sub_lang, sub) = video_subtitle
598 self._downloader.report_error(sub_error)
600 if self._downloader.params.get('listsubtitles', False):
601 sub_lang_list = self._list_available_subtitles(video_id)
# duration
604 if 'length_seconds' not in video_info:
605 self._downloader.report_warning(u'unable to extract video duration')
608 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token
611 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
613 # Decide which formats to download
614 req_format = self._downloader.params.get('format', None)
616 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
617 self.report_rtmp_download()
618 video_url_list = [(None, video_info['conn'][0])]
619 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
620 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
621 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
622 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
# itag -> signed URL; assumes every entry carries a 'sig' field.
623 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
625 format_limit = self._downloader.params.get('format_limit', None)
626 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
627 if format_limit is not None and format_limit in available_formats:
628 format_list = available_formats[available_formats.index(format_limit):]
630 format_list = available_formats
631 existing_formats = [x for x in format_list if x in url_map]
632 if len(existing_formats) == 0:
633 raise ExtractorError(u'no known formats available for video')
634 if self._downloader.params.get('listformats', None):
635 self._print_formats(existing_formats)
637 if req_format is None or req_format == 'best':
638 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
639 elif req_format == 'worst':
640 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
641 elif req_format in ('-1', 'all'):
642 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
644 # Specific formats. We pick the first in a slash-delimited sequence.
645 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
646 req_formats = req_format.split('/')
647 video_url_list = None
648 for rf in req_formats:
650 video_url_list = [(rf, url_map[rf])]
652 if video_url_list is None:
653 raise ExtractorError(u'requested format not available')
655 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build the result dict(s), one per selected format.
658 for format_param, video_real_url in video_url_list:
660 video_extension = self._video_extensions.get(format_param, 'flv')
662 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
663 self._video_dimensions.get(format_param, '???'))
667 'url': video_real_url,
668 'uploader': video_uploader,
669 'uploader_id': video_uploader_id,
670 'upload_date': upload_date,
671 'title': video_title,
672 'ext': video_extension,
673 'format': video_format,
674 'thumbnail': video_thumbnail,
675 'description': video_description,
676 'player_url': player_url,
677 'subtitles': video_subtitles,
678 'duration': video_duration
# NOTE(review): line-sampled excerpt — embedded original line numbers, missing
# lines, stripped indentation. Only comments/docstrings are changed below.
# MetacafeIE: extractor for metacafe.com; disables the family filter first.
683 class MetacafeIE(InfoExtractor):
684 """Information Extractor for metacafe.com."""
686 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
687 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
688 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
689 IE_NAME = u'metacafe'
691 def report_disclaimer(self):
692 """Report disclaimer retrieval."""
693 self.to_screen(u'Retrieving disclaimer')
# Fetches the disclaimer page, then POSTs the over-18 filter form (the try:
# lines and the disclaimer_form dict opening are missing from this sample).
695 def _real_initialize(self):
696 # Retrieve disclaimer
697 request = compat_urllib_request.Request(self._DISCLAIMER)
699 self.report_disclaimer()
700 disclaimer = compat_urllib_request.urlopen(request).read()
701 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
702 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
708 'submit': "Continue - I'm over 18",
710 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
712 self.report_age_confirmation()
713 disclaimer = compat_urllib_request.urlopen(request).read()
714 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
715 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
718 def _real_extract(self, url):
719 # Extract id and simplified title from URL
720 mobj = re.match(self._VALID_URL, url)
722 self._downloader.report_error(u'invalid URL: %s' % url)
725 video_id = mobj.group(1)
727 # Check if video comes from YouTube
728 mobj2 = re.match(r'^yt-(.*)$', video_id)
729 if mobj2 is not None:
# "yt-XXXX" ids are delegated to the YouTube extractor.
730 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
732 # Retrieve video webpage to extract further information
733 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
735 # Extract URL, uploader and title from webpage
736 self.report_extraction(video_id)
737 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
739 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
740 video_extension = mediaURL[-3:]
742 # Extract gdaKey if available
743 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
747 gdaKey = mobj.group(1)
748 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob when &mediaURL= is absent.
750 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
752 self._downloader.report_error(u'unable to extract media URL')
754 vardict = compat_parse_qs(mobj.group(1))
755 if 'mediaData' not in vardict:
756 self._downloader.report_error(u'unable to extract media URL')
758 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
760 self._downloader.report_error(u'unable to extract media URL')
# Unescape JSON-style "\/" in the URL.
762 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
763 video_extension = mediaURL[-3:]
764 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
766 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
768 self._downloader.report_error(u'unable to extract title')
# NOTE(review): .decode('utf-8') here and below assumes byte strings
# (Python 2); these calls would fail on Python 3 str — confirm target runtime.
770 video_title = mobj.group(1).decode('utf-8')
772 mobj = re.search(r'submitter=(.*?);', webpage)
774 self._downloader.report_error(u'unable to extract uploader nickname')
776 video_uploader = mobj.group(1)
# Result dict fragments (the surrounding `return [{...}]` lines are missing).
779 'id': video_id.decode('utf-8'),
780 'url': video_url.decode('utf-8'),
781 'uploader': video_uploader.decode('utf-8'),
783 'title': video_title,
784 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-sampled excerpt — embedded original line numbers, missing
# lines, stripped indentation. Only comments/docstrings are changed below.
# DailymotionIE: extractor for dailymotion.com; picks the best quality from
# the page's flashvars and scrapes title/uploader/date from the markup.
787 class DailymotionIE(InfoExtractor):
788 """Information Extractor for Dailymotion"""
790 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
791 IE_NAME = u'dailymotion'
793 def _real_extract(self, url):
794 # Extract id and simplified title from URL
795 mobj = re.match(self._VALID_URL, url)
797 self._downloader.report_error(u'invalid URL: %s' % url)
# Strip the "_title" suffix and any query string from the matched slug.
800 video_id = mobj.group(1).split('_')[0].split('?')[0]
802 video_extension = 'mp4'
804 # Retrieve video webpage to extract further information
805 request = compat_urllib_request.Request(url)
# The family_filter cookie disables Dailymotion's content filter.
806 request.add_header('Cookie', 'family_filter=off')
807 webpage = self._download_webpage(request, video_id)
809 # Extract URL, uploader and title from webpage
810 self.report_extraction(video_id)
811 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
813 self._downloader.report_error(u'unable to extract media URL')
815 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Probe qualities best-first; the loop body that sets max_quality (original
# lines 818-819) is missing from this sample.
817 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
820 self.to_screen(u'Using %s' % key)
823 self._downloader.report_error(u'unable to extract video URL')
826 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
828 self._downloader.report_error(u'unable to extract video URL')
# Unescape JSON-style "\/" in the URL.
831 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
833 # TODO: support choosing qualities
835 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
837 self._downloader.report_error(u'unable to extract title')
839 video_title = unescapeHTML(mobj.group('title'))
841 video_uploader = None
842 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
# Looking for the official user as a fallback.
845 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
846 if mobj_official is None:
847 self._downloader.report_warning(u'unable to extract uploader nickname')
849 video_uploader = mobj_official.group(1)
851 video_uploader = mobj.group(1)
# Upload date: DD-MM-YYYY in the markup, rearranged to YYYYMMDD.
853 video_upload_date = None
854 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
856 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result dict fragments (the surrounding `return [{...}]` lines are missing).
861 'uploader': video_uploader,
862 'upload_date': video_upload_date,
863 'title': video_title,
864 'ext': video_extension,
# NOTE(review): line-sampled excerpt — embedded original line numbers, missing
# lines, stripped indentation. Only comments/docstrings are changed below.
# PhotobucketIE: extractor for photobucket.com FLV links (?current=...flv).
868 class PhotobucketIE(InfoExtractor):
869 """Information extractor for photobucket.com."""
871 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
872 IE_NAME = u'photobucket'
874 def _real_extract(self, url):
875 # Extract id from URL
876 mobj = re.match(self._VALID_URL, url)
878 self._downloader.report_error(u'Invalid URL: %s' % url)
881 video_id = mobj.group(1)
883 video_extension = 'flv'
885 # Retrieve video webpage to extract further information
886 request = compat_urllib_request.Request(url)
888 self.report_download_webpage(video_id)
889 webpage = compat_urllib_request.urlopen(request).read()
890 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
891 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
894 # Extract URL, uploader, and title from webpage
895 self.report_extraction(video_id)
896 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
898 self._downloader.report_error(u'unable to extract media URL')
900 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# Title and uploader come from the same <title> match (groups 1 and 2).
904 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
906 self._downloader.report_error(u'unable to extract title')
# NOTE(review): .decode('utf-8') assumes Python 2 byte strings — confirm
# the target runtime; these calls fail on Python 3 str.
908 video_title = mobj.group(1).decode('utf-8')
910 video_uploader = mobj.group(2).decode('utf-8')
# Result dict fragments (the surrounding `return [{...}]` lines are missing).
913 'id': video_id.decode('utf-8'),
914 'url': video_url.decode('utf-8'),
915 'uploader': video_uploader,
917 'title': video_title,
918 'ext': video_extension.decode('utf-8'),
922 class YahooIE(InfoExtractor):
923 """Information extractor for video.yahoo.com."""
926 # _VALID_URL matches all Yahoo! Video URLs
927 # _VPAGE_URL matches only the extractable '/watch/' URLs
928 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
929 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
930 IE_NAME = u'video.yahoo'
# Yahoo! Video extractor entry point (flv output).
# If the URL is not already a canonical /watch/ page, the page's embedded
# "id"/"vid" JS values are scraped and the method recurses exactly once
# (new_video=False) on the rewritten English-language /watch/ URL.
932 def _real_extract(self, url, new_video=True):
933 # Extract ID from URL
934 mobj = re.match(self._VALID_URL, url)
936 self._downloader.report_error(u'Invalid URL: %s' % url)
939 video_id = mobj.group(2)
940 video_extension = 'flv'
942 # Rewrite valid but non-extractable URLs as
943 # extractable English language /watch/ URLs
944 if re.match(self._VPAGE_URL, url) is None:
945 request = compat_urllib_request.Request(url)
947 webpage = compat_urllib_request.urlopen(request).read()
948 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
949 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
# Both the "id" and "vid" values are embedded in inline JS calls on the
# page; both are needed to build the canonical watch URL below.
952 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
954 self._downloader.report_error(u'Unable to extract id field')
956 yahoo_id = mobj.group(1)
958 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
960 self._downloader.report_error(u'Unable to extract vid field')
962 yahoo_vid = mobj.group(1)
964 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
# new_video=False guards against infinite recursion on the rewritten URL.
965 return self._real_extract(url, new_video=False)
967 # Retrieve video webpage to extract further information
968 request = compat_urllib_request.Request(url)
970 self.report_download_webpage(video_id)
971 webpage = compat_urllib_request.urlopen(request).read()
972 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
973 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
976 # Extract uploader and title from webpage
977 self.report_extraction(video_id)
978 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
980 self._downloader.report_error(u'unable to extract video title')
982 video_title = mobj.group(1).decode('utf-8')
984 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
986 self._downloader.report_error(u'unable to extract video uploader')
# NOTE(review): group(1) of this regex is the '(people|profile)'
# alternation, not the uploader name in the anchor text — group(2)
# looks intended here; confirm against a live page before changing.
988 video_uploader = mobj.group(1).decode('utf-8')
990 # Extract video thumbnail
991 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
993 self._downloader.report_error(u'unable to extract video thumbnail')
995 video_thumbnail = mobj.group(1).decode('utf-8')
997 # Extract video description
998 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1000 self._downloader.report_error(u'unable to extract video description')
1002 video_description = mobj.group(1).decode('utf-8')
1003 if not video_description:
1004 video_description = 'No description available.'
1006 # Extract video height and width
1007 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1009 self._downloader.report_error(u'unable to extract video height')
1011 yv_video_height = mobj.group(1)
1013 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1015 self._downloader.report_error(u'unable to extract video width')
1017 yv_video_width = mobj.group(1)
1019 # Retrieve video playlist to extract media URL
1020 # I'm not completely sure what all these options are, but we
1021 # seem to need most of them, otherwise the server sends a 401.
1022 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1023 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1024 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1025 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1026 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1028 self.report_download_webpage(video_id)
1029 webpage = compat_urllib_request.urlopen(request).read()
1030 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1031 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1034 # Extract media URL from playlist XML
1035 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1037 self._downloader.report_error(u'Unable to extract media URL')
# The final media URL is STREAM APP base + FULLPATH, percent-decoded
# and then HTML-unescaped.
1039 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1040 video_url = unescapeHTML(video_url)
1043 'id': video_id.decode('utf-8'),
1045 'uploader': video_uploader,
1046 'upload_date': None,
1047 'title': video_title,
1048 'ext': video_extension.decode('utf-8'),
1049 'thumbnail': video_thumbnail.decode('utf-8'),
1050 'description': video_description,
1054 class VimeoIE(InfoExtractor):
1055 """Information extractor for vimeo.com."""
1057 # _VALID_URL matches Vimeo URLs
1058 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1061 def _real_extract(self, url, new_video=True):
1062 # Extract ID from URL
1063 mobj = re.match(self._VALID_URL, url)
1065 self._downloader.report_error(u'Invalid URL: %s' % url)
1068 video_id = mobj.group('id')
# Normalize to an https:// canonical page: add the scheme when missing,
# and rewrite play_redirect_hls direct links to the plain video page.
1069 if not mobj.group('proto'):
1070 url = 'https://' + url
1071 if mobj.group('direct_link'):
1072 url = 'https://vimeo.com/' + video_id
1074 # Retrieve video webpage to extract further information
1075 request = compat_urllib_request.Request(url, None, std_headers)
1076 webpage = self._download_webpage(request, video_id)
1078 # Now we begin extracting as much information as we can from what we
1079 # retrieved. First we extract the information common to all extractors,
1080 # and latter we extract those that are Vimeo specific.
1081 self.report_extraction(video_id)
1083 # Extract the config JSON
# The page embeds the player config as inline JS; slice the JSON object
# out between ' = {config:' and ',assets:' instead of parsing the HTML.
1085 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1086 config = json.loads(config)
1088 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1089 self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
1091 self._downloader.report_error(u'unable to extract info section')
1095 video_title = config["video"]["title"]
1097 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1098 video_uploader = config["video"]["owner"]["name"]
1099 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1101 # Extract video thumbnail
1102 video_thumbnail = config["video"]["thumbnail"]
1104 # Extract video description
1105 video_description = get_element_by_attribute("itemprop", "description", webpage)
1106 if video_description: video_description = clean_html(video_description)
1107 else: video_description = u''
1109 # Extract upload date
# Converted from ISO "YYYY-MM-DDT..." to the YYYYMMDD format expected
# by the info dict (see class docs at the top of the file).
1110 video_upload_date = None
1111 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1112 if mobj is not None:
1113 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1115 # Vimeo specific: extract request signature and timestamp
1116 sig = config['request']['signature']
1117 timestamp = config['request']['timestamp']
1119 # Vimeo specific: extract video codec and quality information
1120 # First consider quality, then codecs, then take everything
1121 # TODO bind to format param
# Codec preference order: h264 (mp4) > vp8 (flv) > vp6 (flv); each codec
# is binned by its best advertised quality (hd > sd > whatever is first).
1122 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1123 files = { 'hd': [], 'sd': [], 'other': []}
1124 for codec_name, codec_extension in codecs:
1125 if codec_name in config["video"]["files"]:
1126 if 'hd' in config["video"]["files"][codec_name]:
1127 files['hd'].append((codec_name, codec_extension, 'hd'))
1128 elif 'sd' in config["video"]["files"][codec_name]:
1129 files['sd'].append((codec_name, codec_extension, 'sd'))
1131 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first entry from the best non-empty quality bucket.
1133 for quality in ('hd', 'sd', 'other'):
1134 if len(files[quality]) > 0:
1135 video_quality = files[quality][0][2]
1136 video_codec = files[quality][0][0]
1137 video_extension = files[quality][0][1]
1138 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1141 self._downloader.report_error(u'no known codec found')
# Vimeo's play_redirect endpoint; sig/timestamp authorize the request.
1144 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1145 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1150 'uploader': video_uploader,
1151 'uploader_id': video_uploader_id,
1152 'upload_date': video_upload_date,
1153 'title': video_title,
1154 'ext': video_extension,
1155 'thumbnail': video_thumbnail,
1156 'description': video_description,
1160 class ArteTvIE(InfoExtractor):
1161 """arte.tv information extractor."""
1163 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live-stream pages are recognized by their trailing index-<n>.html name.
1164 _LIVE_URL = r'index-[0-9]+\.html$'
1166 IE_NAME = u'arte.tv'
# Download a page, reporting errors through the downloader on failure.
1168 def fetch_webpage(self, url):
1169 request = compat_urllib_request.Request(url)
1171 self.report_download_webpage(url)
1172 webpage = compat_urllib_request.urlopen(request).read()
1173 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1174 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1176 except ValueError as err:
1177 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch `url`, apply `regex` with `regexFlags`; `matchTuples` is a list of
# (group_index, key, error_message) — each matched group is stored in the
# returned info dict under `key`, with `err` reported when the group is empty.
1181 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1182 page = self.fetch_webpage(url)
1183 mobj = re.search(regex, page, regexFlags)
1187 self._downloader.report_error(u'Invalid URL: %s' % url)
1190 for (i, key, err) in matchTuples:
1191 if mobj.group(i) is None:
1192 self._downloader.report_error(err)
1195 info[key] = mobj.group(i)
# Live streams: locate the videothek JS loader, then scrape the
# geo-restricted stream path, SWF player and RTMP url from it.
# Language is encoded in the URL path (4th component from the end).
1199 def extractLiveStream(self, url):
1200 video_lang = url.split('/')[-4]
1201 info = self.grep_webpage(
1203 r'src="(.*?/videothek_js.*?\.js)',
1206 (1, 'url', u'Invalid URL: %s' % url)
1209 http_host = url.split('/')[2]
1210 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1211 info = self.grep_webpage(
1213 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1214 '(http://.*?\.swf).*?' +
1218 (1, 'path', u'could not extract video path: %s' % url),
1219 (2, 'player', u'could not extract video player: %s' % url),
1220 (3, 'url', u'could not extract video url: %s' % url)
# NOTE(review): video_url is assembled here but no return is visible in
# this listing, and _real_extract below ignores this method's result —
# confirm whether live-stream extraction is actually wired up.
1223 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" catch-up videos: follow videorefFileUrl, pick the <video> ref
# for the page language, then read id/name/date/hd-url from the final XML.
1225 def extractPlus7Stream(self, url):
1226 video_lang = url.split('/')[-3]
1227 info = self.grep_webpage(
1229 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1232 (1, 'url', u'Invalid URL: %s' % url)
1235 next_url = compat_urllib_parse.unquote(info.get('url'))
1236 info = self.grep_webpage(
1238 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1241 (1, 'url', u'Could not find <video> tag: %s' % url)
1244 next_url = compat_urllib_parse.unquote(info.get('url'))
1246 info = self.grep_webpage(
1248 r'<video id="(.*?)".*?>.*?' +
1249 '<name>(.*?)</name>.*?' +
1250 '<dateVideo>(.*?)</dateVideo>.*?' +
1251 '<url quality="hd">(.*?)</url>',
1254 (1, 'id', u'could not extract video id: %s' % url),
1255 (2, 'title', u'could not extract video title: %s' % url),
1256 (3, 'date', u'could not extract video date: %s' % url),
1257 (4, 'url', u'could not extract video url: %s' % url)
1262 'id': info.get('id'),
1263 'url': compat_urllib_parse.unquote(info.get('url')),
1264 'uploader': u'arte.tv',
1265 'upload_date': info.get('date'),
1266 'title': info.get('title').decode('utf-8'),
# Dispatch on URL shape: live index pages vs regular Plus-7 video pages.
1272 def _real_extract(self, url):
1273 video_id = url.split('/')[-1]
1274 self.report_extraction(video_id)
1276 if re.search(self._LIVE_URL, video_id) is not None:
1277 self.extractLiveStream(url)
1280 info = self.extractPlus7Stream(url)
1285 class GenericIE(InfoExtractor):
1286 """Generic last-resort information extractor."""
1289 IE_NAME = u'generic'
1291 def report_download_webpage(self, video_id):
1292 """Report webpage download."""
# Warn the user that the generic fallback is in use (suppressed in tests).
1293 if not self._downloader.params.get('test', False):
1294 self._downloader.report_warning(u'Falling back on generic information extractor.')
1295 super(GenericIE, self).report_download_webpage(video_id)
1297 def report_following_redirect(self, new_url):
1298 """Report information extraction."""
1299 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1301 def _test_redirect(self, url):
1302 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Probe the URL with a HEAD request so the redirect target can be found
# without downloading the body.
1303 class HeadRequest(compat_urllib_request.Request):
1304 def get_method(self):
1307 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1309 Subclass the HTTPRedirectHandler to make it use our
1310 HeadRequest also on the redirected URL
# Re-issue a HeadRequest for the redirect target, dropping body-related
# headers that no longer apply to a HEAD.
1312 def redirect_request(self, req, fp, code, msg, headers, newurl):
1313 if code in (301, 302, 303, 307):
1314 newurl = newurl.replace(' ', '%20')
1315 newheaders = dict((k,v) for k,v in req.headers.items()
1316 if k.lower() not in ("content-length", "content-type"))
1317 return HeadRequest(newurl,
1319 origin_req_host=req.get_origin_req_host(),
1322 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1324 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1326 Fallback to GET if HEAD is not allowed (405 HTTP error)
1328 def http_error_405(self, req, fp, code, msg, headers):
1332 newheaders = dict((k,v) for k,v in req.headers.items()
1333 if k.lower() not in ("content-length", "content-type"))
1334 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1336 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers we need, so our custom
# redirect/fallback handlers are used instead of the defaults.
1340 opener = compat_urllib_request.OpenerDirector()
1341 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1342 HTTPMethodFallback, HEADRedirectHandler,
1343 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1344 opener.add_handler(handler())
1346 response = opener.open(HeadRequest(url))
1347 new_url = response.geturl()
1352 self.report_following_redirect(new_url)
1355 def _real_extract(self, url):
# If the URL is just a redirect (shortener etc.), delegate to the target.
1356 new_url = self._test_redirect(url)
1357 if new_url: return [self.url_result(new_url)]
1359 video_id = url.split('/')[-1]
1361 webpage = self._download_webpage(url, video_id)
1362 except ValueError as err:
1363 # since this is the last-resort InfoExtractor, if
1364 # this error is thrown, it'll be thrown here
1365 self._downloader.report_error(u'Invalid URL: %s' % url)
1368 self.report_extraction(video_id)
1369 # Start with something easy: JW Player in SWFObject
1370 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1372 # Broaden the search a little bit
1373 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1375 # Broaden the search a little bit: JWPlayer JS loader
1376 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1378 self._downloader.report_error(u'Invalid URL: %s' % url)
1381 # It's possible that one of the regexes
1382 # matched, but returned an empty group:
1383 if mobj.group(1) is None:
1384 self._downloader.report_error(u'Invalid URL: %s' % url)
1387 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Derive id and extension from the media URL's basename.
1388 video_id = os.path.basename(video_url)
1390 # here's a fun little line of code for you:
1391 video_extension = os.path.splitext(video_id)[1][1:]
1392 video_id = os.path.splitext(video_id)[0]
1394 # it's tempting to parse this further, but you would
1395 # have to take into account all the variations like
1396 # Video Title - Site Name
1397 # Site Name | Video Title
1398 # Video Title - Tagline | Site Name
1399 # and so on and so forth; it's just not practical
1400 mobj = re.search(r'<title>(.*)</title>', webpage)
1402 self._downloader.report_error(u'unable to extract title')
1404 video_title = mobj.group(1)
1406 # video uploader is domain name
1407 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1409 self._downloader.report_error(u'unable to extract title')
1411 video_uploader = mobj.group(1)
1416 'uploader': video_uploader,
1417 'upload_date': None,
1418 'title': video_title,
1419 'ext': video_extension,
1423 class YoutubeSearchIE(InfoExtractor):
1424 """Information Extractor for YouTube search queries."""
# "ytsearch:<q>" → 1 result; "ytsearchN:<q>" → N results;
# "ytsearchall:<q>" → up to _max_youtube_results.
1425 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1426 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1427 _max_youtube_results = 1000
1428 IE_NAME = u'youtube:search'
1430 def report_download_page(self, query, pagenum):
1431 """Report attempt to download search page with given number."""
1432 query = query.decode(preferredencoding())
1433 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1435 def _real_extract(self, query):
1436 mobj = re.match(self._VALID_URL, query)
1438 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split the "ytsearch<N>" prefix from the actual query text.
1441 prefix, query = query.split(':')
1443 query = query.encode('utf-8')
1445 return self._get_n_results(query, 1)
1446 elif prefix == 'all':
# NOTE(review): unlike the other branches, this call's result is not
# visibly returned — confirm whether 'return' is intended here.
1447 self._get_n_results(query, self._max_youtube_results)
1452 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1454 elif n > self._max_youtube_results:
1455 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1456 n = self._max_youtube_results
1457 return self._get_n_results(query, n)
1458 except ValueError: # parsing prefix as integer fails
1459 return self._get_n_results(query, 1)
1461 def _get_n_results(self, query, n):
1462 """Get a specified number of results for a query"""
# Page through the GData API 50 results at a time until `limit`
# (min of n and totalItems) is reached.
1468 while (50 * pagenum) < limit:
1469 self.report_download_page(query, pagenum+1)
1470 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1471 request = compat_urllib_request.Request(result_url)
1473 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1475 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1477 api_response = json.loads(data)['data']
1479 if not 'items' in api_response:
1480 self._downloader.report_error(u'[youtube] No video results')
1483 new_ids = list(video['id'] for video in api_response['items'])
1484 video_ids += new_ids
1486 limit = min(n, api_response['totalItems'])
# Trim any overshoot from the last 50-result page.
1489 if len(video_ids) > n:
1490 video_ids = video_ids[:n]
1491 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1495 class GoogleSearchIE(InfoExtractor):
1496 """Information Extractor for Google Video search queries."""
# "gvsearch:<q>" → 1 result; "gvsearchN:<q>" → N; "gvsearchall:<q>" → max.
1497 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1498 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1499 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1500 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1501 _max_google_results = 1000
1502 IE_NAME = u'video.google:search'
1504 def report_download_page(self, query, pagenum):
1505 """Report attempt to download playlist page with given number."""
1506 query = query.decode(preferredencoding())
1507 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# NOTE(review): this extractor schedules downloads directly via
# self._downloader.download(...) rather than returning info dicts —
# this predates the url_result/playlist_result style used elsewhere.
1509 def _real_extract(self, query):
1510 mobj = re.match(self._VALID_URL, query)
1512 self._downloader.report_error(u'invalid search query "%s"' % query)
1515 prefix, query = query.split(':')
1517 query = query.encode('utf-8')
1519 self._download_n_results(query, 1)
1521 elif prefix == 'all':
1522 self._download_n_results(query, self._max_google_results)
1528 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1530 elif n > self._max_google_results:
1531 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1532 n = self._max_google_results
1533 self._download_n_results(query, n)
1535 except ValueError: # parsing prefix as integer fails
1536 self._download_n_results(query, 1)
1539 def _download_n_results(self, query, n):
1540 """Downloads a specified number of results for a query"""
# Scrape result pages 10 at a time, collecting docids until n videos
# are found or the "next page" marker disappears.
1546 self.report_download_page(query, pagenum)
1547 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1548 request = compat_urllib_request.Request(result_url)
1550 page = compat_urllib_request.urlopen(request).read()
1551 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1552 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1555 # Extract video identifiers
1556 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1557 video_id = mobj.group(1)
1558 if video_id not in video_ids:
1559 video_ids.append(video_id)
1560 if len(video_ids) == n:
1561 # Specified n videos reached
1562 for id in video_ids:
1563 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No further pages: download whatever was collected.
1566 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1567 for id in video_ids:
1568 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1571 pagenum = pagenum + 1
1574 class YahooSearchIE(InfoExtractor):
1575 """Information Extractor for Yahoo! Video search queries."""
# Same prefix convention as the other search IEs:
# "yvsearch:<q>" → 1; "yvsearchN:<q>" → N; "yvsearchall:<q>" → max.
1578 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1579 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1580 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1581 _MORE_PAGES_INDICATOR = r'\s*Next'
1582 _max_yahoo_results = 1000
1583 IE_NAME = u'video.yahoo:search'
1585 def report_download_page(self, query, pagenum):
1586 """Report attempt to download playlist page with given number."""
1587 query = query.decode(preferredencoding())
1588 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# NOTE(review): like GoogleSearchIE, downloads are triggered directly
# via self._downloader.download(...) instead of returning results.
1590 def _real_extract(self, query):
1591 mobj = re.match(self._VALID_URL, query)
1593 self._downloader.report_error(u'invalid search query "%s"' % query)
1596 prefix, query = query.split(':')
1598 query = query.encode('utf-8')
1600 self._download_n_results(query, 1)
1602 elif prefix == 'all':
1603 self._download_n_results(query, self._max_yahoo_results)
1609 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1611 elif n > self._max_yahoo_results:
1612 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1613 n = self._max_yahoo_results
1614 self._download_n_results(query, n)
1616 except ValueError: # parsing prefix as integer fails
1617 self._download_n_results(query, 1)
1620 def _download_n_results(self, query, n):
1621 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids across result pages.
1624 already_seen = set()
1628 self.report_download_page(query, pagenum)
1629 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1630 request = compat_urllib_request.Request(result_url)
1632 page = compat_urllib_request.urlopen(request).read()
1633 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1634 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1637 # Extract video identifiers
1638 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1639 video_id = mobj.group(1)
1640 if video_id not in already_seen:
1641 video_ids.append(video_id)
1642 already_seen.add(video_id)
1643 if len(video_ids) == n:
1644 # Specified n videos reached
1645 for id in video_ids:
1646 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No further pages: download whatever was collected.
1649 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1650 for id in video_ids:
1651 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1654 pagenum = pagenum + 1
1657 class YoutubePlaylistIE(InfoExtractor):
1658 """Information Extractor for YouTube playlists."""
# _VALID_URL is written with re.VERBOSE (whitespace/comments allowed);
# it matches playlist/course/artist pages and bare PL/EC/UU ids.
1660 _VALID_URL = r"""(?:
1665 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1666 \? (?:.*?&)*? (?:p|a|list)=
1669 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1672 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1674 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1676 IE_NAME = u'youtube:playlist'
1679 def suitable(cls, url):
1680 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base implementation because _VALID_URL needs re.VERBOSE.
1681 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1683 def _real_extract(self, url):
1684 # Extract playlist id
1685 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1687 self._downloader.report_error(u'invalid url: %s' % url)
1690 # Download playlist videos from API
# The two alternatives in _VALID_URL capture into different groups.
1691 playlist_id = mobj.group(1) or mobj.group(2)
1696 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1697 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1700 response = json.loads(page)
1701 except ValueError as err:
1702 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1705 if 'feed' not in response:
1706 self._downloader.report_error(u'Got a malformed response from YouTube API')
1708 playlist_title = response['feed']['title']['$t']
1709 if 'entry' not in response['feed']:
1710 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, url) pairs so entries can be sorted into
# playlist order below; entries without 'content' are skipped.
1713 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1714 for entry in response['feed']['entry']
1715 if 'content' in entry ]
1717 if len(response['feed']['entry']) < self._MAX_RESULTS:
1721 videos = [v[1] for v in sorted(videos)]
1723 url_results = [self.url_result(url, 'Youtube') for url in videos]
1724 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1727 class YoutubeChannelIE(InfoExtractor):
1728 """Information Extractor for YouTube channels."""
1730 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1731 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Marker string whose presence in a page means more pages exist.
1732 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1733 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1734 IE_NAME = u'youtube:channel'
# Return the video ids found in a chunk of channel-page HTML,
# preserving first-seen order and skipping duplicates.
1736 def extract_videos_from_page(self, page):
1738 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1739 if mobj.group(1) not in ids_in_page:
1740 ids_in_page.append(mobj.group(1))
1743 def _real_extract(self, url):
1744 # Extract channel id
1745 mobj = re.match(self._VALID_URL, url)
1747 self._downloader.report_error(u'invalid url: %s' % url)
1750 # Download channel page
1751 channel_id = mobj.group(1)
# Page 1 is plain HTML; subsequent pages come from the JSON ajax
# endpoint below.
1755 url = self._TEMPLATE_URL % (channel_id, pagenum)
1756 page = self._download_webpage(url, channel_id,
1757 u'Downloading page #%s' % pagenum)
1759 # Extract video identifiers
1760 ids_in_page = self.extract_videos_from_page(page)
1761 video_ids.extend(ids_in_page)
1763 # Download any subsequent channel pages using the json-based channel_ajax query
1764 if self._MORE_PAGES_INDICATOR in page:
1766 pagenum = pagenum + 1
1768 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1769 page = self._download_webpage(url, channel_id,
1770 u'Downloading page #%s' % pagenum)
1772 page = json.loads(page)
# The ajax response wraps the same HTML inside 'content_html';
# 'load_more_widget_html' tells us whether more pages remain.
1774 ids_in_page = self.extract_videos_from_page(page['content_html'])
1775 video_ids.extend(ids_in_page)
1777 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1780 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1782 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1783 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1784 return [self.playlist_result(url_entries, channel_id)]
1787 class YoutubeUserIE(InfoExtractor):
1788 """Information Extractor for YouTube users."""
# Matches user pages and the "ytuser:<name>" shorthand.
1790 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1791 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1792 _GDATA_PAGE_SIZE = 50
1793 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1794 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1795 IE_NAME = u'youtube:user'
1797 def _real_extract(self, url):
1799 mobj = re.match(self._VALID_URL, url)
1801 self._downloader.report_error(u'invalid url: %s' % url)
1804 username = mobj.group(1)
1806 # Download video ids using YouTube Data API. Result size per
1807 # query is limited (currently to 50 videos) so we need to query
1808 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1815 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1817 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1818 page = self._download_webpage(gdata_url, username,
1819 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1821 # Extract video identifiers
1824 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1825 if mobj.group(1) not in ids_in_page:
1826 ids_in_page.append(mobj.group(1))
1828 video_ids.extend(ids_in_page)
1830 # A little optimization - if current page is not
1831 # "full", ie. does not contain PAGE_SIZE video ids then
1832 # we can assume that this page is the last one - there
1833 # are no more ids on further pages - no need to query
1836 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1841 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1842 url_results = [self.url_result(url, 'Youtube') for url in urls]
1843 return [self.playlist_result(url_results, playlist_title = username)]
1846 class BlipTVUserIE(InfoExtractor):
1847 """Information Extractor for blip.tv users."""
1849 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1851 IE_NAME = u'blip.tv:user'
1853 def _real_extract(self, url):
1855 mobj = re.match(self._VALID_URL, url)
1857 self._downloader.report_error(u'invalid url: %s' % url)
1860 username = mobj.group(1)
# Episode-list endpoint; the numeric users_id is filled in below after
# scraping it from the user's page.
1862 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1864 page = self._download_webpage(url, username, u'Downloading user page')
1865 mobj = re.search(r'data-users-id="([^"]+)"', page)
# NOTE(review): no None check on this search is visible before
# .group(1) — a page without data-users-id would raise; confirm.
1866 page_base = page_base % mobj.group(1)
1869 # Download video ids using BlipTV Ajax calls. Result size per
1870 # query is limited (currently to 12 videos) so we need to query
1871 # page by page until there are no video ids - it means we got
1878 url = page_base + "&page=" + str(pagenum)
1879 page = self._download_webpage(url, username,
1880 u'Downloading video ids from page %d' % pagenum)
1882 # Extract video identifiers
1885 for mobj in re.finditer(r'href="/([^"]+)"', page):
1886 if mobj.group(1) not in ids_in_page:
1887 ids_in_page.append(unescapeHTML(mobj.group(1)))
1889 video_ids.extend(ids_in_page)
1891 # A little optimization - if current page is not
1892 # "full", ie. does not contain PAGE_SIZE video ids then
1893 # we can assume that this page is the last one - there
1894 # are no more ids on further pages - no need to query
1897 if len(ids_in_page) < self._PAGE_SIZE:
1902 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1903 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1904 return [self.playlist_result(url_entries, playlist_title = username)]
1907 class DepositFilesIE(InfoExtractor):
1908 """Information extractor for depositfiles.com"""
1910 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1912 def _real_extract(self, url):
1913 file_id = url.split('/')[-1]
1914 # Rebuild url in english locale
1915 url = 'http://depositfiles.com/en/files/' + file_id
1917 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the free-download button.
1918 free_download_indication = { 'gateway_result' : '1' }
1919 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1921 self.report_download_webpage(file_id)
1922 webpage = compat_urllib_request.urlopen(request).read()
1923 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1924 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
1927 # Search for the real file URL
1928 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1929 if (mobj is None) or (mobj.group(1) is None):
1930 # Try to figure out reason of the error.
# On failure the page usually carries an "Attention..." restriction
# notice; surface it verbatim (whitespace collapsed) when present.
1931 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1932 if (mobj is not None) and (mobj.group(1) is not None):
1933 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1934 self._downloader.report_error(u'%s' % restriction_message)
1936 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
1939 file_url = mobj.group(1)
1940 file_extension = os.path.splitext(file_url)[1][1:]
1942 # Search for file title
1943 mobj = re.search(r'<b title="(.*?)">', webpage)
1945 self._downloader.report_error(u'unable to extract title')
1947 file_title = mobj.group(1).decode('utf-8')
1950 'id': file_id.decode('utf-8'),
1951 'url': file_url.decode('utf-8'),
1953 'upload_date': None,
1954 'title': file_title,
1955 'ext': file_extension.decode('utf-8'),
1959 class FacebookIE(InfoExtractor):
1960 """Information Extractor for Facebook"""
1962 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1963 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1964 _NETRC_MACHINE = 'facebook'
1965 IE_NAME = u'facebook'
1967 def report_login(self):
1968 """Report attempt to log in."""
1969 self.to_screen(u'Logging in')
# Optional login step: credentials come from --username/--password or,
# with --netrc, from the 'facebook' machine entry in ~/.netrc.
# Login failure is only a warning — extraction proceeds anonymously.
1971 def _real_initialize(self):
1972 if self._downloader is None:
1977 downloader_params = self._downloader.params
1979 # Attempt to use provided username and password or .netrc data
1980 if downloader_params.get('username', None) is not None:
1981 useremail = downloader_params['username']
1982 password = downloader_params['password']
1983 elif downloader_params.get('usenetrc', False):
1985 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1986 if info is not None:
1990 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1991 except (IOError, netrc.NetrcParseError) as err:
1992 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1995 if useremail is None:
2004 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2007 login_results = compat_urllib_request.urlopen(request).read()
# A login form still present in the response means the login failed.
2008 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2009 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2011 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2012 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2015 def _real_extract(self, url):
2016 mobj = re.match(self._VALID_URL, url)
2018 self._downloader.report_error(u'invalid URL: %s' % url)
2020 video_id = mobj.group('ID')
2022 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2023 webpage = self._download_webpage(url, video_id)
# The player parameters are a JSON array wedged between these two exact
# JS fragments in the page source; slice it out rather than parse the JS.
2025 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2026 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2027 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2029 raise ExtractorError(u'Cannot parse data')
2030 data = dict(json.loads(m.group(1)))
2031 params_raw = compat_urllib_parse.unquote(data['params'])
2032 params = json.loads(params_raw)
2033 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD when hd_src is absent/empty.
2034 video_url = video_data.get('hd_src')
2036 video_url = video_data['sd_src']
2038 raise ExtractorError(u'Cannot find video URL')
2039 video_duration = int(video_data['video_duration'])
2040 thumbnail = video_data['thumbnail_src']
2042 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2044 raise ExtractorError(u'Cannot find title in webpage')
2045 video_title = unescapeHTML(m.group(1))
2049 'title': video_title,
2052 'duration': video_duration,
2053 'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Regexp used to pull the filename extension off a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect to a player page whose URL fragment carries the
        # real file id; rewrite to the canonical a/a- URL and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Request the JSON description of the video; the iTunes User-Agent is
        # what the site expects for this API.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            # The server may answer with the media itself instead of JSON.
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode is a py2-ism; on py3 `title` is
                # already str and this raises AttributeError -- confirm runtime.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                    'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

                json_data = json.loads(json_code)
                # Some API responses wrap the record in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']

                # datestamp looks like '08-31-12 09:30AM'; normalized to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2156 class MyVideoIE(InfoExtractor):
2157 """Information Extractor for myvideo.de."""
2159 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2160 IE_NAME = u'myvideo'
2162 def _real_extract(self,url):
2163 mobj = re.match(self._VALID_URL, url)
2165 self._download.report_error(u'invalid URL: %s' % url)
2168 video_id = mobj.group(1)
2171 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2172 webpage = self._download_webpage(webpage_url, video_id)
2174 self.report_extraction(video_id)
2175 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2178 self._downloader.report_error(u'unable to extract media URL')
2180 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2182 mobj = re.search('<title>([^<]+)</title>', webpage)
2184 self._downloader.report_error(u'unable to extract title')
2187 video_title = mobj.group(1)
2193 'upload_date': None,
2194 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    # Known bitrates, lowest to highest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose (multi-line) pattern, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print the bitrate/extension/dimension table for --list-formats."""
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # Shortcuts like :tds / :colbert mean "newest full episode" of the show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            # No explicit episode in the URL means "download the newest".
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
            # Follow the redirect to the concrete newest-episode URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # Find the mtvnservices media URI embedded in the player markup.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS feed lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like '...:<showId>.com:<shortMediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # mediaGen config lists the available renditions per bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                    format, rtmp_video_url = f, v

            # Translate the rtmp URL into the equivalent progressive HTTP URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)

                'upload_date': officialDate,
                'description': officialTitle,

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Description, thumbnail and player all come from <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL query string carries the (urlencoded) config location.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # The second playlist entry holds the actual media URL.
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('videoid')

            'upload_date': None,

        self.report_extraction(video_id)
        # Per-video metadata (title/description/manifest location) is XML.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
            self._downloader.report_error(u'Invalid metadata XML file')

        # Fetch the Adobe HDS (f4m) manifest that names the media segment.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
            # Manifest elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        # Compose the direct segment URL from the manifest host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (urlencoded in the page's flv_url variable).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
            self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from <title>, dropping the trailing site name.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
            self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
            self._downloader.report_error(u'unable to extract video thumbnail')
        # group(0) is the whole matched thumbnail URL, not just the filename.
        video_thumbnail = mobj.group(0)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps a permalink URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Stream definitions include the 128kbps MP3 HTTP stream used below.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set permalink to its API record (includes all tracks).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        # API-level errors come back inside the JSON body, not as HTTP errors.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        # One result entry per track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real media id is base64-encoded in the
        # page's `jsclassref` javascript variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
            self._downloader.report_error(u'unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title from the contentTitle javascript variable.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
            self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the last path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # Marked broken: the site moved to a new API, so tests are skipped.
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
            bitrate_list = jsonData[fmt]
            # 'best' (or an unknown bitrate) falls back to the highest listed.
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
                # Probe the URL; the first one that opens wins.
                compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        """Print the format/bitrate/extension table for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): these .decode('utf-8') calls are py2-isms; on py3 the
        # match groups are already str and this raises -- confirm runtime.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # 'best' (or no preference): take the first format with a live URL.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            # Explicit format requested: it must exist in the JSON.
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page, or a specific video page.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,
                'upload_date': None,
            self.report_extraction(info['id'])
            # Each video has a small XML descriptor next to the media files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                self._downloader.report_error(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
                info['title'] = unescapeHTML(m.group(1))
                # Fall back to the course id when no <h1> title is present.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
                info['description'] = unescapeHTML(m.group(1))

            # Every linked VideoPage becomes a reference entry to re-extract.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            # Root page: enumerate all course pages and recurse into each.
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
2931 class MTVIE(InfoExtractor):
2932 """Information extractor for MTV.com"""
2934 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2937 def _real_extract(self, url):
2938 mobj = re.match(self._VALID_URL, url)
2940 self._downloader.report_error(u'invalid URL: %s' % url)
2942 if not mobj.group('proto'):
2943 url = 'http://' + url
2944 video_id = mobj.group('videoid')
2946 webpage = self._download_webpage(url, video_id)
2948 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2950 self._downloader.report_error(u'unable to extract song name')
2952 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2953 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2955 self._downloader.report_error(u'unable to extract performer')
2957 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2958 video_title = performer + ' - ' + song_name
2960 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2962 self._downloader.report_error(u'unable to mtvn_uri')
2964 mtvn_uri = mobj.group(1)
2966 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2968 self._downloader.report_error(u'unable to extract content id')
2970 content_id = mobj.group(1)
2972 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2973 self.report_extraction(video_id)
2974 request = compat_urllib_request.Request(videogen_url)
2976 metadataXml = compat_urllib_request.urlopen(request).read()
2977 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2978 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
2981 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2982 renditions = mdoc.findall('.//rendition')
2984 # For now, always pick the highest quality.
2985 rendition = renditions[-1]
2988 _,_,ext = rendition.attrib['type'].partition('/')
2989 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2990 video_url = rendition.find('./src').text
2992 self._downloader.report_error('Invalid rendition field.')
2998 'uploader': performer,
2999 'upload_date': None,
3000 'title': video_title,
class YoukuIE(InfoExtractor):
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

        # Session id: millisecond timestamp plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode file ids."""
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # Deterministic PRNG shuffle: each step picks and removes one char.
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # getPlayList returns JSON with title, seed, format and segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Default ('best') prefers hd2 when the server offers it.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
            elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                # One downloadable entry per segment of the video.
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
# xnxx.com extractor: scrapes flv url, title and thumbnail out of the watch
# page with three class-level regexes.
# NOTE(review): elided listing - `if result is None:` guards and the final
# info-dict opener/closer are missing from this view; code kept byte-identical.
3104 class XNXXIE(InfoExtractor):
3105 """Information extractor for xnxx.com"""
3107 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3109 VIDEO_URL_RE = r'flv_url=(.*?)&'
3110 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3111 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3113 def _real_extract(self, url):
3114 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
3116 self._downloader.report_error(u'invalid URL: %s' % url)
3118 video_id = mobj.group(1)
3120 # Get webpage content
3121 webpage = self._download_webpage(url, video_id)
# flv_url query value is percent-encoded in the page, hence the unquote.
3123 result = re.search(self.VIDEO_URL_RE, webpage)
3125 self._downloader.report_error(u'unable to extract video url')
3127 video_url = compat_urllib_parse.unquote(result.group(1))
3129 result = re.search(self.VIDEO_TITLE_RE, webpage)
3131 self._downloader.report_error(u'unable to extract video title')
3133 video_title = result.group(1)
3135 result = re.search(self.VIDEO_THUMB_RE, webpage)
3137 self._downloader.report_error(u'unable to extract video thumbnail')
3139 video_thumbnail = result.group(1)
3145 'upload_date': None,
3146 'title': video_title,
3148 'thumbnail': video_thumbnail,
3149 'description': None,
# Google+ extractor: reads the post page for metadata, then the photo/video
# page for the actual googlevideo.com links, picking the highest resolution.
# NOTE(review): elided listing - several `if mobj is None:` guards and the
# final return/info-dict lines are missing from this view.
3153 class GooglePlusIE(InfoExtractor):
3154 """Information extractor for plus.google.com."""
3156 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3157 IE_NAME = u'plus.google'
3159 def report_extract_entry(self, url):
3160 """Report downloading extry"""
3161 self.to_screen(u'Downloading entry: %s' % url)
3163 def report_date(self, upload_date):
3164 """Report downloading extry"""
3165 self.to_screen(u'Entry date: %s' % upload_date)
3167 def report_uploader(self, uploader):
3168 """Report downloading extry"""
3169 self.to_screen(u'Uploader: %s' % uploader)
3171 def report_title(self, video_title):
3172 """Report downloading extry"""
3173 self.to_screen(u'Title: %s' % video_title)
3175 def report_extract_vid_page(self, video_page):
3176 """Report information extraction."""
3177 self.to_screen(u'Extracting video page: %s' % video_page)
3179 def _real_extract(self, url):
3180 # Extract id from URL
3181 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
3183 self._downloader.report_error(u'Invalid URL: %s' % url)
3186 post_url = mobj.group(0)
3187 video_id = mobj.group(1)
3189 video_extension = 'flv'
3191 # Step 1, Retrieve post webpage to extract further information
3192 self.report_extract_entry(post_url)
3193 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3195 # Extract update date
3197 pattern = 'title="Timestamp">(.*?)</a>'
3198 mobj = re.search(pattern, webpage)
3200 upload_date = mobj.group(1)
3201 # Convert timestring to a format suitable for filename
3202 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3203 upload_date = upload_date.strftime('%Y%m%d')
3204 self.report_date(upload_date)
3208 pattern = r'rel\="author".*?>(.*?)</a>'
3209 mobj = re.search(pattern, webpage)
3211 uploader = mobj.group(1)
3212 self.report_uploader(uploader)
3215 # Get the first line for title
3217 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3218 mobj = re.search(pattern, webpage)
3220 video_title = mobj.group(1)
3221 self.report_title(video_title)
3223 # Step 2, Stimulate clicking the image box to launch video
3224 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3225 mobj = re.search(pattern, webpage)
3227 self._downloader.report_error(u'unable to extract video page URL')
3229 video_page = mobj.group(1)
3230 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3231 self.report_extract_vid_page(video_page)
3234 # Extract video links on video page
3235 """Extract video links of all sizes"""
3236 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3237 mobj = re.findall(pattern, webpage)
3239 self._downloader.report_error(u'unable to extract video links')
# Tuples are (resolution, url); sorting then taking [-1] picks the highest
# resolution, and [-1] again extracts the url from the tuple.
3241 # Sort in resolution
3242 links = sorted(mobj)
3244 # Choose the lowest of the sort, i.e. highest resolution
3245 video_url = links[-1]
3246 # Only get the url. The resolution part in the tuple has no use anymore
3247 video_url = video_url[-1]
# Py2 str has .decode; py3 str raises AttributeError, handled by re-encoding.
3248 # Treat escaped \u0026 style hex
3250 video_url = video_url.decode("unicode_escape")
3251 except AttributeError: # Python 3
3252 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3258 'uploader': uploader,
3259 'upload_date': upload_date,
3260 'title': video_title,
3261 'ext': video_extension,
# NBA.com extractor: the mp4 URL is derived directly from the page path;
# metadata is scraped with the _findProp helper.
# NOTE(review): elided listing - guards, 'url'/'ext' keys and the return are
# missing from this view; code kept byte-identical.
3264 class NBAIE(InfoExtractor):
3265 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3268 def _real_extract(self, url):
3269 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
3271 self._downloader.report_error(u'invalid URL: %s' % url)
3274 video_id = mobj.group(1)
3275 if video_id.endswith('/index.html'):
3276 video_id = video_id[:-len('/index.html')]
3278 webpage = self._download_webpage(url, video_id)
3280 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure over `webpage`: returns unescaped group(1) of rexp, else `default`
# (the fallback branch is elided from this view).
3281 def _findProp(rexp, default=None):
3282 m = re.search(rexp, webpage)
3284 return unescapeHTML(m.group(1))
3288 shortened_video_id = video_id.rpartition('/')[2]
3289 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3291 'id': shortened_video_id,
# BUG(review): 'uploader_date' is not a documented info-dict field - the
# module docstring lists 'upload_date'; this date is silently dropped. Fix
# the key once the full block is in view.
3295 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3296 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# justin.tv / twitch.tv extractor: three URL shapes (channel archive, single
# broadcast, chapter). Channel archives are fetched in pages of 100 via the
# JSON API; chapters go through an XML endpoint.
# NOTE(review): elided listing - loop/initialisation lines and returns are
# missing from this view; code kept byte-identical.
3300 class JustinTVIE(InfoExtractor):
3301 """Information extractor for justin.tv and twitch.tv"""
3302 # TODO: One broadcast may be split into multiple videos. The key
3303 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3304 # starts at 1 and increases. Can we treat all parts as one video?
3306 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3308 (?P<channelid>[^/]+)|
3309 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3310 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3314 _JUSTIN_PAGE_LIMIT = 100
3315 IE_NAME = u'justin.tv'
3317 def report_download_page(self, channel, offset):
3318 """Report attempt to download a single page of videos."""
3319 self.to_screen(u'%s: Downloading video information from %d to %d' %
3320 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3322 # Return count of items, list of *valid* items
3323 def _parse_page(self, url, video_id):
3324 webpage = self._download_webpage(url, video_id,
3325 u'Downloading video info JSON',
3326 u'unable to download video info JSON')
# On error the API returns a dict instead of a list.
3328 response = json.loads(webpage)
3329 if type(response) != list:
3330 error_text = response.get('error', 'unknown error')
3331 raise ExtractorError(u'Justin.tv API: %s' % error_text)
# `info` accumulator is initialised on an elided line; clips lacking a
# video_file_url are presumably skipped by an elided guard.
3333 for clip in response:
3334 video_url = clip['video_file_url']
3336 video_extension = os.path.splitext(video_url)[1][1:]
3337 video_date = re.sub('-', '', clip['start_time'][:10])
3338 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3339 video_id = clip['id']
3340 video_title = clip.get('title', video_id)
3344 'title': video_title,
3345 'uploader': clip.get('channel_name', video_uploader_id),
3346 'uploader_id': video_uploader_id,
3347 'upload_date': video_date,
3348 'ext': video_extension,
3350 return (len(response), info)
3352 def _real_extract(self, url):
3353 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
3355 raise ExtractorError(u'invalid URL: %s' % url)
3357 api_base = 'http://api.justin.tv'
3359 if mobj.group('channelid'):
3361 video_id = mobj.group('channelid')
3362 api = api_base + '/channel/archives/%s.json' % video_id
3363 elif mobj.group('chapterid'):
3364 chapter_id = mobj.group('chapterid')
3366 webpage = self._download_webpage(url, chapter_id)
3367 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3369 raise ExtractorError(u'Cannot find archive of a chapter')
3370 archive_id = m.group(1)
3371 m = re.search(r"<h2 class='js-title'>([^<]*)</h2>", webpage)
3373 raise ExtractorError(u'Cannot find chapter title')
3374 video_title = m.group(1)
3376 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3377 chapter_info_xml = self._download_webpage(api, chapter_id,
3378 note=u'Downloading chapter information',
3379 errnote=u'Chapter information download failed')
# Scan archives for the one matching archive_id; the loop's break and the
# for/else "not found" structure are partly elided.
3380 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3381 for a in doc.findall('.//archive'):
3382 if archive_id == a.find('./id').text:
3385 raise ExtractorError(u'Could not find chapter in chapter information')
3387 video_url = a.find('./video_file_url').text
3388 video_ext = video_url.rpartition('.')[2] or u'flv'
3390 # TODO determine start (and probably fix up file)
3391 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3392 #video_url += u'?start=' + a.find('./start_timestamp').text
3393 self._downloader.report_warning(u'Chapter detected, but we do not know how to calculate start position. Downloading the whole file ... (See https://github.com/rg3/youtube-dl/issues/810 )')
3396 'id': u'c' + chapter_id,
3399 'title': video_title,
3403 video_id = mobj.group('videoid')
3404 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3406 self.report_extraction(video_id)
# Paged fetch: stop when a page comes back short (or paging is disabled);
# `info`, `offset` and `paged` are initialised on elided lines.
3410 limit = self._JUSTIN_PAGE_LIMIT
3413 self.report_download_page(video_id, offset)
3414 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3415 page_count, page_info = self._parse_page(page_url, video_id)
3416 info.extend(page_info)
3417 if not paged or page_count != limit:
# funnyordie.com extractor: video URL from the <video>/<source> markup, title
# from the player h1 with a <title>-tag fallback.
# NOTE(review): elided listing - guards and the final info dict/return are
# missing from this view; code kept byte-identical.
3422 class FunnyOrDieIE(InfoExtractor):
3423 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3425 def _real_extract(self, url):
3426 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
3428 raise ExtractorError(u'invalid URL: %s' % url)
3430 video_id = mobj.group('id')
3431 webpage = self._download_webpage(url, video_id)
3433 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3435 self._downloader.report_error(u'unable to find video information')
3436 video_url = unescapeHTML(m.group('url'))
3438 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback title source when the player h1 is absent.
3440 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3442 self._downloader.report_error(u'Cannot find video title')
3443 title = clean_html(m.group('title'))
3445 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3447 desc = unescapeHTML(m.group('desc'))
3456 'description': desc,
# Steam store extractor: bypasses the age gate with fixed query params, then
# zips per-movie url/title/thumbnail regex hits into a playlist.
# NOTE(review): elided listing - `videos` accumulator initialisation and the
# info-dict opener are missing from this view; code kept byte-identical.
3460 class SteamIE(InfoExtractor):
3461 _VALID_URL = r"""http://store\.steampowered\.com/
3463 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3465 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3469 def suitable(cls, url):
3470 """Receives a URL and returns True if suitable for this IE."""
3471 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3473 def _real_extract(self, url):
3474 m = re.match(self._VALID_URL, url, re.VERBOSE)
3475 gameID = m.group('gameID')
# Fixed DOB query string satisfies the age check without a session.
3476 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3477 self.report_age_confirmation()
3478 webpage = self._download_webpage(videourl, gameID)
3479 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3481 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3482 mweb = re.finditer(urlRE, webpage)
3483 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3484 titles = re.finditer(namesRE, webpage)
3485 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3486 thumbs = re.finditer(thumbsRE, webpage)
# zip relies on the three iterators matching up positionally in the page.
3488 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3489 video_id = vid.group('videoID')
3490 title = vtitle.group('videoName')
3491 video_url = vid.group('videoURL')
3492 video_thumb = thumb.group('thumbnail')
# presumably inside an elided `if not video_url:` guard
3494 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3499 'title': unescapeHTML(title),
3500 'thumbnail': video_thumb
3503 return [self.playlist_result(videos, gameID, game_title)]
# ustream.tv recorded-video extractor: CDN URL is built from the numeric id;
# title/uploader scraped from data attributes.
# NOTE(review): elided listing - the info dict opener and return are missing
# from this view; code kept byte-identical.
3505 class UstreamIE(InfoExtractor):
3506 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3507 IE_NAME = u'ustream'
3509 def _real_extract(self, url):
3510 m = re.match(self._VALID_URL, url)
3511 video_id = m.group('videoID')
3512 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3513 webpage = self._download_webpage(url, video_id)
# NOTE(review): both searches are unguarded - m.group raises AttributeError
# if the markup changes; consider guards once the full block is in view.
3514 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3515 title = m.group('title')
3516 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3517 uploader = m.group('uploader')
3523 'uploader': uploader
# worldstarhiphop.com extractor: pulls the flashvars file URL; for "candy"
# pages a second regex replaces the generic <title> with the real one.
# NOTE(review): elided listing - ext selection branches and the final info
# dict/return are missing from this view; code kept byte-identical.
3527 class WorldStarHipHopIE(InfoExtractor):
3528 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3529 IE_NAME = u'WorldStarHipHop'
3531 def _real_extract(self, url):
3532 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3534 m = re.match(self._VALID_URL, url)
3535 video_id = m.group('id')
3537 webpage_src = self._download_webpage(url, video_id)
3539 mobj = re.search(_src_url, webpage_src)
3541 if mobj is not None:
3542 video_url = mobj.group(1)
# ext chosen from the URL ('mp4' branch visible; else-branch elided).
3543 if 'mp4' in video_url:
3548 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3550 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3553 raise ExtractorError(u'Cannot determine title')
3554 title = mobj.group(1)
3556 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3557 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3558 if mobj is not None:
3559 thumbnail = mobj.group(1)
3561 _title = r"""candytitles.*>(.*)</span>"""
3562 mobj = re.search(_title, webpage_src)
3563 if mobj is not None:
3564 title = mobj.group(1)
3571 'thumbnail' : thumbnail,
# rbmaradio.com extractor: show metadata is embedded as JSON in an inline
# <script>; the stream URL comes from 'akamai_url' with a fixed bitrate param.
# NOTE(review): elided listing - the info dict opener and return are missing
# from this view; code kept byte-identical.
3576 class RBMARadioIE(InfoExtractor):
3577 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3579 def _real_extract(self, url):
3580 m = re.match(self._VALID_URL, url)
3581 video_id = m.group('videoID')
3583 webpage = self._download_webpage(url, video_id)
3584 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3586 raise ExtractorError(u'Cannot find metadata')
3587 json_data = m.group(1)
# The json.loads sits in an elided try: paired with the ValueError handler.
3590 data = json.loads(json_data)
3591 except ValueError as e:
3592 raise ExtractorError(u'Invalid JSON: ' + str(e))
# cbr=256 requests the 256 kbps stream.
3594 video_url = data['akamai_url'] + '&cbr=256'
3595 url_parts = compat_urllib_parse_urlparse(video_url)
3596 video_ext = url_parts.path.rpartition('.')[2]
3601 'title': data['title'],
3602 'description': data.get('teaser_text'),
3603 'location': data.get('country_of_origin'),
3604 'uploader': data.get('host', {}).get('name'),
3605 'uploader_id': data.get('host', {}).get('slug'),
3606 'thumbnail': data.get('image', {}).get('large_url_2x'),
3607 'duration': data.get('duration'),
# youporn.com extractor: sets the age_verified cookie, scrapes title/date/
# uploader, then parses the downloadList for all formats and applies the
# requested format selection (best/worst/all/specific).
# NOTE(review): elided listing - several guards, the per-link loop header and
# accumulator setup are missing from this view; code kept byte-identical.
3612 class YouPornIE(InfoExtractor):
3613 """Information extractor for youporn.com."""
3614 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3616 def _print_formats(self, formats):
3617 """Print all available formats"""
3618 print(u'Available formats:')
3619 print(u'ext\t\tformat')
3620 print(u'---------------------------------')
3621 for format in formats:
3622 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the single format dict matching req_format (loop header and
# return lines are elided from this view).
3624 def _specific(self, req_format, formats):
3626 if(x["format"]==req_format):
3630 def _real_extract(self, url):
3631 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
3633 self._downloader.report_error(u'invalid URL: %s' % url)
3636 video_id = mobj.group('videoid')
# Cookie bypasses the age gate for the watch page request.
3638 req = compat_urllib_request.Request(url)
3639 req.add_header('Cookie', 'age_verified=1')
3640 webpage = self._download_webpage(req, video_id)
3642 # Get the video title
3643 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3645 raise ExtractorError(u'Unable to extract video title')
3646 video_title = result.group('title').strip()
3648 # Get the video date
3649 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3651 self._downloader.report_warning(u'unable to extract video date')
3654 upload_date = unified_strdate(result.group('date').strip())
3656 # Get the video uploader
3657 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3659 self._downloader.report_warning(u'unable to extract uploader')
3660 video_uploader = None
3662 video_uploader = result.group('uploader').strip()
3663 video_uploader = clean_html( video_uploader )
3665 # Get all of the formats available
3666 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3667 result = re.search(DOWNLOAD_LIST_RE, webpage)
3669 raise ExtractorError(u'Unable to extract download list')
3670 download_list_html = result.group('download_list').strip()
3672 # Get all of the links from the page
3673 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3674 links = re.findall(LINK_RE, download_list_html)
3675 if(len(links) == 0):
3676 raise ExtractorError(u'ERROR: no known formats available for video')
3678 self.to_screen(u'Links found: %d' % len(links))
# Per-link loop (header elided): size/bitrate are parsed from the path's
# 5th component, e.g. "480p_370k_8004515" - size/bitrate binding elided.
3683 # A link looks like this:
3684 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3685 # A path looks like this:
3686 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3687 video_url = unescapeHTML( link )
3688 path = compat_urllib_parse_urlparse( video_url ).path
3689 extension = os.path.splitext( path )[1][1:]
3690 format = path.split('/')[4].split('_')[:2]
3693 format = "-".join( format )
3694 title = u'%s-%s-%s' % (video_title, size, bitrate)
3699 'uploader': video_uploader,
3700 'upload_date': upload_date,
3705 'description': None,
3709 if self._downloader.params.get('listformats', None):
3710 self._print_formats(formats)
# Format selection: formats are assumed ordered best-first, so 'worst' is
# the last entry; 'all'/-1 returns everything (return lines partly elided).
3713 req_format = self._downloader.params.get('format', None)
3714 self.to_screen(u'Format: %s' % req_format)
3716 if req_format is None or req_format == 'best':
3718 elif req_format == 'worst':
3719 return [formats[-1]]
3720 elif req_format in ('-1', 'all'):
3723 format = self._specific( req_format, formats )
3725 self._downloader.report_error(u'requested format not available')
# pornotube.com extractor: flv URL and upload date scraped from the page;
# title comes from the URL itself.
# NOTE(review): elided listing - `if result is None:` guards and the rest of
# the info dict are missing from this view; code kept byte-identical.
3731 class PornotubeIE(InfoExtractor):
3732 """Information extractor for pornotube.com."""
3733 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3735 def _real_extract(self, url):
3736 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
3738 self._downloader.report_error(u'invalid URL: %s' % url)
3741 video_id = mobj.group('videoid')
3742 video_title = mobj.group('title')
3744 # Get webpage content
3745 webpage = self._download_webpage(url, video_id)
3748 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3749 result = re.search(VIDEO_URL_RE, webpage)
3751 self._downloader.report_error(u'unable to extract video url')
3753 video_url = compat_urllib_parse.unquote(result.group('url'))
3755 #Get the uploaded date
3756 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3757 result = re.search(VIDEO_UPLOADED_RE, webpage)
# BUG(review): copy-pasted error message - this failure is about the upload
# date, not the video title; fix the string once the full block is in view.
3759 self._downloader.report_error(u'unable to extract video title')
3761 upload_date = unified_strdate(result.group('date'))
3763 info = {'id': video_id,
3766 'upload_date': upload_date,
3767 'title': video_title,
# youjizz.com extractor: finds the embed page from the watch page, then pulls
# the flashvars file URL out of the embed page.
# NOTE(review): elided listing - `if result is None:` guards are missing from
# this view; code kept byte-identical.
3773 class YouJizzIE(InfoExtractor):
3774 """Information extractor for youjizz.com."""
3775 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3777 def _real_extract(self, url):
3778 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
3780 self._downloader.report_error(u'invalid URL: %s' % url)
3783 video_id = mobj.group('videoid')
3785 # Get webpage content
3786 webpage = self._download_webpage(url, video_id)
3788 # Get the video title
3789 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3791 raise ExtractorError(u'ERROR: unable to extract video title')
3792 video_title = result.group('title').strip()
3794 # Get the embed page
3795 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3797 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is rebound to the embed page's numeric id from here on.
3799 embed_page_url = result.group(0).strip()
3800 video_id = result.group('videoid')
3802 webpage = self._download_webpage(embed_page_url, video_id)
3805 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3807 raise ExtractorError(u'ERROR: unable to extract video url')
3808 video_url = result.group('source')
3810 info = {'id': video_id,
3812 'title': video_title,
3815 'player_url': embed_page_url}
# 8tracks.com extractor: reads the PAGE.mix JSON blob, then walks the play/
# next API endpoints collecting one track per iteration until at_last_track.
# NOTE(review): elided listing - `mix_id` binding, the res/info accumulator
# setup and the final return are missing from this view; code byte-identical.
3819 class EightTracksIE(InfoExtractor):
3821 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3823 def _real_extract(self, url):
3824 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
3826 raise ExtractorError(u'Invalid URL: %s' % url)
3827 playlist_id = mobj.group('id')
3829 webpage = self._download_webpage(url, playlist_id)
3831 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3833 raise ExtractorError(u'Cannot find trax information')
3834 json_like = m.group(1)
3835 data = json.loads(json_like)
# Random session token required by the play API.
3837 session = str(random.randint(0, 1000000000))
3839 track_count = data['tracks_count']
# `mix_id` is presumably extracted from `data` on an elided line - verify.
3840 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3841 next_url = first_url
3843 for i in itertools.count():
3844 api_json = self._download_webpage(next_url, playlist_id,
3845 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3846 errnote=u'Failed to download song information')
3847 api_data = json.loads(api_json)
3848 track_data = api_data[u'set']['track']
3850 'id': track_data['id'],
3851 'url': track_data['track_file_stream_url'],
3852 'title': track_data['performer'] + u' - ' + track_data['name'],
3853 'raw_title': track_data['name'],
3854 'uploader_id': data['user']['login'],
# Loop terminates when the API flags the final track (break is elided).
3858 if api_data['set']['at_last_track']:
3860 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# keek.com extractor: video and thumbnail URLs are derived from the id; title
# and uploader scraped from the page.
# NOTE(review): elided listing - the info dict opener and return are missing
# from this view; code kept byte-identical.
3863 class KeekIE(InfoExtractor):
3864 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3867 def _real_extract(self, url):
3868 m = re.match(self._VALID_URL, url)
3869 video_id = m.group('videoID')
3870 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3871 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3872 webpage = self._download_webpage(url, video_id)
# NOTE(review): both searches are unguarded - m.group raises AttributeError
# if the markup changes; consider guards once the full block is in view.
3873 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3874 title = unescapeHTML(m.group('title'))
3875 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3876 uploader = clean_html(m.group('uploader'))
3882 'thumbnail': thumbnail,
3883 'uploader': uploader
# ted.com extractor: handles single talks and playlists. Talk download URLs
# are built from the mediaSlug found in the talkDetails inline script.
# NOTE(review): elided listing - parts of the verbose regexes and the final
# info dict/return are missing from this view; code kept byte-identical.
3887 class TEDIE(InfoExtractor):
3888 _VALID_URL=r'''http://www\.ted\.com/
3890 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3892 ((?P<type_talk>talks)) # We have a simple talk
3894 (/lang/(.*?))? # The url may contain the language
3895 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3899 def suitable(cls, url):
3900 """Receives a URL and returns True if suitable for this IE."""
3901 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3903 def _real_extract(self, url):
3904 m=re.match(self._VALID_URL, url, re.VERBOSE)
3905 if m.group('type_talk'):
3906 return [self._talk_info(url)]
# else-branch (playlist) - the `else:` line itself is elided.
3908 playlist_id=m.group('playlist_id')
3909 name=m.group('name')
3910 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3911 return [self._playlist_videos_info(url,name,playlist_id)]
3913 def _talk_video_link(self,mediaSlug):
3914 '''Returns the video link for that mediaSlug'''
3915 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3917 def _playlist_videos_info(self,url,name,playlist_id=0):
3918 '''Returns the videos of the playlist'''
# video_RE is a verbose pattern whose opening r''' line is elided.
3920 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3921 ([.\s]*?)data-playlist_item_id="(\d+)"
3922 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3924 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3925 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3926 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3927 m_names=re.finditer(video_name_RE,webpage)
3929 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3930 m_playlist = re.search(playlist_RE, webpage)
3931 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to this IE via url_result.
3933 playlist_entries = []
3934 for m_video, m_name in zip(m_videos,m_names):
3935 video_id=m_video.group('video_id')
3936 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3937 playlist_entries.append(self.url_result(talk_url, 'TED'))
3938 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3940 def _talk_info(self, url, video_id=0):
3941 """Return the video for the talk in the url"""
3942 m=re.match(self._VALID_URL, url,re.VERBOSE)
3943 videoName=m.group('name')
3944 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3945 # If the url includes the language we get the title translated
3946 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3947 title=re.search(title_RE, webpage).group('title')
3948 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3949 "id":(?P<videoID>[\d]+).*?
3950 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3951 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3952 thumb_match=re.search(thumb_RE,webpage)
3953 info_match=re.search(info_RE,webpage,re.VERBOSE)
3954 video_id=info_match.group('videoID')
3955 mediaSlug=info_match.group('mediaSlug')
3956 video_url=self._talk_video_link(mediaSlug)
3962 'thumbnail': thumb_match.group('thumbnail')
# myspass.de extractor: video id is the last (or second-to-last, given a
# trailing slash) URL path element; all metadata comes from an XML endpoint.
# NOTE(review): elided listing - the trailing-slash check, default bindings
# for format/description/thumbnail and the final info dict/return are missing
# from this view; code kept byte-identical.
3966 class MySpassIE(InfoExtractor):
3967 _VALID_URL = r'http://www.myspass.de/.*'
3969 def _real_extract(self, url):
3970 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3972 # video id is the last path element of the URL
3973 # usually there is a trailing slash, so also try the second but last
3974 url_path = compat_urllib_parse_urlparse(url).path
3975 url_parent_path, video_id = os.path.split(url_path)
# presumably inside an elided `if not video_id:` guard (trailing slash case)
3977 _, video_id = os.path.split(url_parent_path)
3980 metadata_url = META_DATA_URL_TEMPLATE % video_id
3981 metadata_text = self._download_webpage(metadata_url, video_id)
# Re-encoded to bytes because ElementTree on py2 expects encoded input here.
3982 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3984 # extract values from metadata
3985 url_flv_el = metadata.find('url_flv')
3986 if url_flv_el is None:
3987 self._downloader.report_error(u'unable to extract download url')
3989 video_url = url_flv_el.text
3990 extension = os.path.splitext(video_url)[1][1:]
3991 title_el = metadata.find('title')
3992 if title_el is None:
3993 self._downloader.report_error(u'unable to extract title')
3995 title = title_el.text
3996 format_id_el = metadata.find('format_id')
3997 if format_id_el is None:
4000 format = format_id_el.text
4001 description_el = metadata.find('description')
4002 if description_el is not None:
4003 description = description_el.text
4006 imagePreview_el = metadata.find('imagePreview')
4007 if imagePreview_el is not None:
4008 thumbnail = imagePreview_el.text
4017 'thumbnail': thumbnail,
4018 'description': description
# spiegel.de extractor: title from the page, stream list from a per-video XML
# file; the last <type> entry in the XML is used as the chosen variant.
# NOTE(review): elided listing - the final info dict/return lines are missing
# from this view; code kept byte-identical.
4022 class SpiegelIE(InfoExtractor):
4023 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4025 def _real_extract(self, url):
4026 m = re.match(self._VALID_URL, url)
4027 video_id = m.group('videoID')
4029 webpage = self._download_webpage(url, video_id)
4030 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4032 raise ExtractorError(u'Cannot find title')
4033 video_title = unescapeHTML(m.group(1))
4035 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4036 xml_code = self._download_webpage(xml_url, video_id,
4037 note=u'Downloading XML', errnote=u'Failed to download XML')
# idoc[-1]: the last child element of the XML root is the variant used.
4039 idoc = xml.etree.ElementTree.fromstring(xml_code)
4040 last_type = idoc[-1]
4041 filename = last_type.findall('./filename')[0].text
4042 duration = float(last_type.findall('./duration')[0].text)
4044 video_url = 'http://video2.spiegel.de/flash/' + filename
4045 video_ext = filename.rpartition('.')[2]
4050 'title': video_title,
4051 'duration': duration,
# liveleak.com extractor: file URL from the player config, title/description
# from OpenGraph tags, uploader best-effort from the "By:" credit.
# NOTE(review): elided listing - guards, fallback bindings and the final info
# dict/return are missing from this view; code kept byte-identical.
4055 class LiveLeakIE(InfoExtractor):
4057 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4058 IE_NAME = u'liveleak'
4060 def _real_extract(self, url):
4061 mobj = re.match(self._VALID_URL, url)
# presumably inside an elided `if mobj is None:` guard
4063 self._downloader.report_error(u'invalid URL: %s' % url)
4066 video_id = mobj.group('video_id')
4068 webpage = self._download_webpage(url, video_id)
4070 m = re.search(r'file: "(.*?)",', webpage)
4072 self._downloader.report_error(u'unable to find video url')
4074 video_url = m.group(1)
4076 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4078 self._downloader.report_error(u'Cannot find video title')
# Site prefix stripped so the title stands alone.
4079 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4081 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4083 desc = unescapeHTML(m.group('desc'))
4087 m = re.search(r'By:.*?(\w+)</a>', webpage)
4089 uploader = clean_html(m.group(1))
4098 'description': desc,
4099 'uploader': uploader
# ARD Mediathek extractor: collects all mediaCollection.addMediaStream calls,
# picks media_type 0 at the highest quality, and emits either an RTMP stream
# (url + play_path) or a direct HTTP mp4.
# NOTE(review): elided listing - the numeric-id branch structure, the
# empty-streams check around the assert, and the return are missing from this
# view; code kept byte-identical.
4104 class ARDIE(InfoExtractor):
4105 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4106 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4107 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4109 def _real_extract(self, url):
4110 # determine video id from url
4111 m = re.match(self._VALID_URL, url)
# Prefer the numeric documentId query param when present (branching elided).
4113 numid = re.search(r'documentId=([0-9]+)', url)
4115 video_id = numid.group(1)
4117 video_id = m.group('video_id')
4119 # determine title and media streams from webpage
4120 html = self._download_webpage(url, video_id)
4121 title = re.search(self._TITLE, html).group('title')
4122 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams found: page carries the age-restriction ("fsk") marker.
4124 assert '"fsk"' in html
4125 self._downloader.report_error(u'this video is only available after 8:00 pm')
4128 # choose default media type and highest quality for now
4129 stream = max([s for s in streams if int(s["media_type"]) == 0],
4130 key=lambda s: int(s["quality"]))
4132 # there's two possibilities: RTMP stream or HTTP download
4133 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4134 if stream['rtmp_url']:
4135 self.to_screen(u'RTMP download detected')
4136 assert stream['video_url'].startswith('mp4:')
4137 info["url"] = stream["rtmp_url"]
4138 info["play_path"] = stream['video_url']
# HTTP branch (the `else:` line itself is elided).
4140 assert stream["video_url"].endswith('.mp4')
4141 info["url"] = stream["video_url"]
# Tumblr extractor: normalizes the post URL, then pulls the escaped
# video_file URL, extension, first poster thumbnail and <title> text.
# NOTE(review): elided listing - the not-found early return after the
# "No video founded" message and the rest of the return dict are missing from
# this view; code kept byte-identical. The message's "founded" typo is a
# runtime string, so it is only flagged here, not changed.
4144 class TumblrIE(InfoExtractor):
4145 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4147 def _real_extract(self, url):
4148 m_url = re.match(self._VALID_URL, url)
4149 video_id = m_url.group('id')
4150 blog = m_url.group('blog_name')
# Canonical post URL rebuilt so /video/ links resolve the same page.
4152 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4153 webpage = self._download_webpage(url, video_id)
# The player markup is JS-escaped in the page, hence the \x22 quoting.
4155 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4156 video = re.search(re_video, webpage)
# presumably inside an elided `if video is None:` guard
4158 self.to_screen("No video founded")
4160 video_url = video.group('video_url')
4161 ext = video.group('ext')
4163 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4164 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4166 # The only place where you can get a title, it's not complete,
4167 # but searching in other places doesn't work for all videos
4168 re_title = r'<title>(?P<title>.*?)</title>'
4169 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4171 return [{'id': video_id,
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # was: fell through and crashed on m_download.group(1); also
            # fixed the "founded" typo in the user-visible message
            self._downloader.report_error('No free songs found')
            return

        download_link = m_download.group(1)
        # renamed from `id`, which shadowed the builtin
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'format': 'mp3-320',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist'],
                      }
        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Direct MP4 link sits in a <source> tag on the page
        source_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if source_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = source_match.group(1)

        # Title is the page's main <h1>
        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
def gen_extractors():
    """Return a list of an instance of every supported extractor.

    The order does matter; the first extractor matched is the one
    handling the URL.
    """
    # was: the docstring was left unterminated and the instance list had no
    # enclosing `return [...]`, so the function was syntactically broken
    return [
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
    ]
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``.

    E.g. ``get_info_extractor('Youtube')`` returns the ``YoutubeIE`` class.
    Raises KeyError if no such extractor is defined in this module.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]