2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # set True once _real_initialize() has run
    _downloader = None      # FileDownloader instance attached via set_downloader()
    _WORKING = True         # subclasses set False to mark a broken IE

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the (potentially expensive) real initialization only once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        return webpage_bytes.decode(encoding, 'replace')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} for the video, or a (warning, None)
        tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for the video."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Fetch one subtitle track; returns (error, ...) or (None, lang, data)."""
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not sub:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """Return a list with one (error_message, sub_lang, sub) tuple for the
        preferred subtitle language (--subtitleslang, then 'en', then first)."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for the video."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print itag / extension / dimensions for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, when credentials are available,
        log in and confirm age. Best-effort: failures only emit warnings."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden anti-forgery tokens required by the Google login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served again means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video id embedded in *url* (group 2 of
        _VALID_URL), or report an error for unmatched URLs."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the several &el= variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators, then try the known date layouts.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':           video_id,
                'url':          video_real_url,
                'uploader':     video_uploader,
                'uploader_id':  video_uploader_id,
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension,
                'format':       video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age: POST the family-filter opt-out form.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Mirrored YouTube video: hand the canonical URL to the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages embed the media info in the flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): .decode() implies webpage is bytes here (Python 2 path) — confirm
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the slug/query decoration from the id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present, in descending preference order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; recompose as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): .decode() implies webpage is bytes here (Python 2 path) — confirm
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
933 class YahooIE(InfoExtractor):
934 """Information extractor for video.yahoo.com."""
937 # _VALID_URL matches all Yahoo! Video URLs
938 # _VPAGE_URL matches only the extractable '/watch/' URLs
939 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
940 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
941 IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Delegate to the base class so the downloader reference is stored.
        InfoExtractor.__init__(self, downloader)
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Progress message routed through the shared downloader console.
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
    def report_extraction(self, video_id):
        """Report information extraction."""
        # Progress message routed through the shared downloader console.
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
954 def _real_extract(self, url, new_video=True):
955 # Extract ID from URL
956 mobj = re.match(self._VALID_URL, url)
958 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
961 video_id = mobj.group(2)
962 video_extension = 'flv'
964 # Rewrite valid but non-extractable URLs as
965 # extractable English language /watch/ URLs
966 if re.match(self._VPAGE_URL, url) is None:
967 request = compat_urllib_request.Request(url)
969 webpage = compat_urllib_request.urlopen(request).read()
970 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
971 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
974 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
976 self._downloader.trouble(u'ERROR: Unable to extract id field')
978 yahoo_id = mobj.group(1)
980 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
982 self._downloader.trouble(u'ERROR: Unable to extract vid field')
984 yahoo_vid = mobj.group(1)
986 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
987 return self._real_extract(url, new_video=False)
989 # Retrieve video webpage to extract further information
990 request = compat_urllib_request.Request(url)
992 self.report_download_webpage(video_id)
993 webpage = compat_urllib_request.urlopen(request).read()
994 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
995 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
998 # Extract uploader and title from webpage
999 self.report_extraction(video_id)
1000 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1002 self._downloader.trouble(u'ERROR: unable to extract video title')
1004 video_title = mobj.group(1).decode('utf-8')
1006 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1008 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1010 video_uploader = mobj.group(1).decode('utf-8')
1012 # Extract video thumbnail
1013 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1015 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1017 video_thumbnail = mobj.group(1).decode('utf-8')
1019 # Extract video description
1020 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1022 self._downloader.trouble(u'ERROR: unable to extract video description')
1024 video_description = mobj.group(1).decode('utf-8')
1025 if not video_description:
1026 video_description = 'No description available.'
1028 # Extract video height and width
1029 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1031 self._downloader.trouble(u'ERROR: unable to extract video height')
1033 yv_video_height = mobj.group(1)
1035 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1037 self._downloader.trouble(u'ERROR: unable to extract video width')
1039 yv_video_width = mobj.group(1)
1041 # Retrieve video playlist to extract media URL
1042 # I'm not completely sure what all these options are, but we
1043 # seem to need most of them, otherwise the server sends a 401.
1044 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1045 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1046 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1047 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1048 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1050 self.report_download_webpage(video_id)
1051 webpage = compat_urllib_request.urlopen(request).read()
1052 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1053 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1056 # Extract media URL from playlist XML
1057 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1059 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1061 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1062 video_url = unescapeHTML(video_url)
1065 'id': video_id.decode('utf-8'),
1067 'uploader': video_uploader,
1068 'upload_date': None,
1069 'title': video_title,
1070 'ext': video_extension.decode('utf-8'),
1071 'thumbnail': video_thumbnail.decode('utf-8'),
1072 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the page's embedded config JSON for title/uploader/thumbnail,
    then selects a codec/quality pair (hd > sd > other) and builds a
    play_redirect URL signed with the request signature and timestamp.
    """
    # NOTE(review): guard/early-return lines appear elided in this excerpt —
    # confirm control flow against the upstream source before editing.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            # No scheme in the input URL: default to https.
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize play_redirect_hls links to the canonical page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (embedded between ' = {config:' and ',assets:').
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD from the ISO dateCreated meta tag)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # fallback: first advertised quality for this codec
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket, in preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Result dictionary (list-of-dict return shape per InfoExtractor).
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Supports two page types: live-stream index pages (matched by
    _LIVE_URL) and regular "Plus 7" catch-up pages. Both are scraped by
    chaining grep_webpage() calls that each fetch a page and pull named
    groups out of a regex.
    """
    # NOTE(review): return statements and `if mobj is None:` guards appear
    # elided in this excerpt — confirm against the upstream source.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download and return the raw page body for url, reporting errors
        through the downloader."""
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex with regexFlags, and collect the groups
        described by matchTuples into an info dict.

        matchTuples is a list of (group_index, key, error_message); each
        missing group reports its error_message via trouble().
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve a live-stream index page to its media URL."""
        # Language code is the 4th-from-last path component on live URLs.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a "Plus 7" catch-up page to an info dictionary."""
        # Language code is the 3rd-from-last path component on Plus 7 URLs.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)

        # Result dictionary built from the grepped fields.
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus 7 extractor based on URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First follows HTTP redirects/URL shorteners (via HEAD requests), then
    scrapes the page with progressively broader regexes looking for a
    JW Player / SWFObject style direct media URL.
    """
    # NOTE(review): several guard/return lines and method bodies appear
    # elided in this excerpt — confirm against the upstream source.

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET.
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL.
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Strip entity headers that don't apply to the new request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error).
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener with our HEAD-aware handlers installed.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        self.report_following_redirect(new_url)

    def _real_extract(self, url):
        """Extract a direct media URL from an arbitrary web page."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1)

        # Result dictionary (list-of-dict return shape per InfoExtractor).
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts queries of the form ``ytsearch[N|all]:terms`` and enqueues up
    to N results (capped at _max_youtube_results) via the GData API.
    """
    # NOTE(review): try/return/guard lines appear elided in this excerpt —
    # confirm control flow against the upstream source.

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result; 'all' -> cap; numeric -> that many
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query."""
        # Page through the API 50 results at a time until the limit is met.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API reports as available.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts ``gvsearch[N|all]:terms`` and enqueues up to N results
    (capped at _max_google_results) by scraping the HTML result pages.
    """
    # NOTE(review): try/return/guard lines appear elided in this excerpt —
    # confirm control flow against the upstream source.

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result; 'all' -> cap; numeric -> that many
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query."""
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No "next" link means we exhausted the result pages.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts ``yvsearch[N|all]:terms`` and enqueues up to N results
    (capped at _max_yahoo_results) by scraping the HTML result pages.
    """
    # NOTE(review): try/return/guard lines appear elided in this excerpt —
    # confirm control flow against the upstream source.

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result; 'all' -> cap; numeric -> that many
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query."""
        # already_seen de-duplicates ids across result pages.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No "Next" link means we exhausted the result pages.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the GData playlist feed, sorts entries by their
    yt$position, applies --playlist-start/--playlist-end, and returns a
    playlist_result of url_results.
    """
    # NOTE(review): try/return/guard lines appear elided in this excerpt —
    # confirm control flow against the upstream source.

    # Verbose regex: matches course/view_play_list/my_playlists/artist/
    # playlist/watch URLs as well as bare PL/EC/UU ids.
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Extract the playlist's video URLs via the GData API."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        self.report_download_page(playlist_id, page_num)

        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = compat_urllib_request.urlopen(url).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        response = json.loads(page)
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))

        if not 'feed' in response or not 'entry' in response['feed']:
            self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
        # Keep (position, url) pairs so we can sort by playlist position.
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        # A short page means this was the last one.
        if len(response['feed']['entry']) < self._MAX_RESULTS:

        videos = [v[1] for v in sorted(videos)]

        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        url_results = [self.url_result(url) for url in videos]
        return [self.playlist_result(url_results, playlist_id)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Pages through the channel's /videos listing, collecting watch-page
    video ids until the "Next" pagination marker disappears.
    """
    # NOTE(review): try/return/guard lines appear elided in this excerpt —
    # confirm control flow against the upstream source.

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids from a channel and return a playlist."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Missing pagination marker means this was the last page.
        if self._MORE_PAGES_INDICATOR not in page:
        pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the user's uploads feed via the GData API
    (_GDATA_PAGE_SIZE results per request), applies
    --playlist-start/--playlist-end, and returns a playlist of watch URLs.
    """
    # NOTE(review): try/return/guard lines appear elided in this excerpt —
    # confirm control flow against the upstream source.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect a user's uploaded video ids and return a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1954 class BlipTVUserIE(InfoExtractor):
1955 """Information Extractor for blip.tv users."""
# NOTE(review): this chunk is an elided/sampled dump — guard lines such as
# "if mobj is None:", "try:" and the pagination "while True:" are missing.
# Recover the full source before changing any logic; comments only annotate.
1957 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1959 IE_NAME = u'blip.tv:user'
1961 def __init__(self, downloader=None):
1962 InfoExtractor.__init__(self, downloader)
1964 def report_download_page(self, username, pagenum):
1965 """Report attempt to download user page."""
1966 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1967 (self.IE_NAME, username, pagenum))
1969 def _real_extract(self, url):
# Extract the username from group 1 of _VALID_URL.
1971 mobj = re.match(self._VALID_URL, url)
1973 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1976 username = mobj.group(1)
1978 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1980 request = compat_urllib_request.Request(url)
1983 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# The numeric users_id is scraped from the profile page and substituted
# into the episode-list API base URL.
1984 mobj = re.search(r'data-users-id="([^"]+)"', page)
1985 page_base = page_base % mobj.group(1)
1986 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1987 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1991 # Download video ids using BlipTV Ajax calls. Result size per
1992 # query is limited (currently to 12 videos) so we need to query
1993 # page by page until there are no video ids - it means we got
2000 self.report_download_page(username, pagenum)
2001 url = page_base + "&page=" + str(pagenum)
2002 request = compat_urllib_request.Request( url )
2004 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
2005 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2006 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2009 # Extract video identifiers
2012 for mobj in re.finditer(r'href="/([^"]+)"', page):
2013 if mobj.group(1) not in ids_in_page:
2014 ids_in_page.append(unescapeHTML(mobj.group(1)))
2016 video_ids.extend(ids_in_page)
2018 # A little optimization - if current page is not
2019 # "full", ie. does not contain PAGE_SIZE video ids then
2020 # we can assume that this page is the last one - there
2021 # are no more ids on further pages - no need to query
2024 if len(ids_in_page) < self._PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window before
# building result URLs. playlistend == -1 means "to the end".
2029 all_ids_count = len(video_ids)
2030 playliststart = self._downloader.params.get('playliststart', 1) - 1
2031 playlistend = self._downloader.params.get('playlistend', -1)
2033 if playlistend == -1:
2034 video_ids = video_ids[playliststart:]
2036 video_ids = video_ids[playliststart:playlistend]
2038 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2039 (self.IE_NAME, username, all_ids_count, len(video_ids)))
2041 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2042 url_entries = [self.url_result(url) for url in urls]
2043 return [self.playlist_result(url_entries, playlist_title = username)]
2046 class DepositFilesIE(InfoExtractor):
2047 """Information extractor for depositfiles.com"""
# NOTE(review): elided dump — "try:", "if mobj is None:", "return" and dict
# delimiters are missing here; recover full source before touching logic.
2049 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2051 def report_download_webpage(self, file_id):
2052 """Report webpage download."""
2053 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2055 def report_extraction(self, file_id):
2056 """Report information extraction."""
2057 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2059 def _real_extract(self, url):
2060 file_id = url.split('/')[-1]
2061 # Rebuild url in english locale
2062 url = 'http://depositfiles.com/en/files/' + file_id
2064 # Retrieve file webpage with 'Free download' button pressed
2065 free_download_indication = { 'gateway_result' : '1' }
2066 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2068 self.report_download_webpage(file_id)
# NOTE(review): read() returns bytes but the regexes below use str
# patterns — presumably this predates the py3 port; confirm upstream.
2069 webpage = compat_urllib_request.urlopen(request).read()
2070 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2071 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2074 # Search for the real file URL
2075 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2076 if (mobj is None) or (mobj.group(1) is None):
2077 # Try to figure out reason of the error.
2078 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2079 if (mobj is not None) and (mobj.group(1) is not None):
2080 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2081 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2083 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2086 file_url = mobj.group(1)
2087 file_extension = os.path.splitext(file_url)[1][1:]
2089 # Search for file title
2090 mobj = re.search(r'<b title="(.*?)">', webpage)
2092 self._downloader.trouble(u'ERROR: unable to extract title')
2094 file_title = mobj.group(1).decode('utf-8')
2097 'id': file_id.decode('utf-8'),
2098 'url': file_url.decode('utf-8'),
2100 'upload_date': None,
2101 'title': file_title,
2102 'ext': file_extension.decode('utf-8'),
2106 class FacebookIE(InfoExtractor):
2107 """Information Extractor for Facebook"""
# NOTE(review): elided dump — login-form construction, "try:" lines and
# the final return-dict delimiters are missing; do not edit logic blindly.
2109 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2110 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2111 _NETRC_MACHINE = 'facebook'
2112 IE_NAME = u'facebook'
2114 def report_login(self):
2115 """Report attempt to log in."""
2116 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2118 def _real_initialize(self):
2119 if self._downloader is None:
2124 downloader_params = self._downloader.params
2126 # Attempt to use provided username and password or .netrc data
2127 if downloader_params.get('username', None) is not None:
2128 useremail = downloader_params['username']
2129 password = downloader_params['password']
2130 elif downloader_params.get('usenetrc', False):
2132 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2133 if info is not None:
2137 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2138 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are warnings, not fatal: extraction may still work
# for public videos without credentials.
2139 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2142 if useremail is None:
2151 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2154 login_results = compat_urllib_request.urlopen(request).read()
2155 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2156 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2158 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2159 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2162 def _real_extract(self, url):
2163 mobj = re.match(self._VALID_URL, url)
2165 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2167 video_id = mobj.group('ID')
2169 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2170 webpage = self._download_webpage(url, video_id)
# The swf parameter blob between BEFORE and AFTER is a JSON array of
# [key, value] pairs; dict() of the parsed list yields the params map.
2172 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2173 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2174 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2176 raise ExtractorError(u'Cannot parse data')
2177 data = dict(json.loads(m.group(1)))
2178 params_raw = compat_urllib_parse.unquote(data['params'])
2179 params = json.loads(params_raw)
# Prefer the HD stream; fall back to SD (elided branch) before failing.
2180 video_url = params['hd_src']
2182 video_url = params['sd_src']
2184 raise ExtractorError(u'Cannot find video URL')
2185 video_duration = int(params['video_duration'])
2187 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2189 raise ExtractorError(u'Cannot find title in webpage')
2190 video_title = unescapeHTML(m.group(1))
2194 'title': video_title,
2197 'duration': video_duration,
2198 'thumbnail': params['thumbnail_src'],
2203 class BlipTVIE(InfoExtractor):
2204 """Information extractor for blip.tv"""
# NOTE(review): elided dump — several "try:"/"if" lines and the info-dict
# delimiters are missing; recover full source before changing logic.
2206 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2207 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2208 IE_NAME = u'blip.tv'
2210 def report_extraction(self, file_id):
2211 """Report information extraction."""
2212 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2214 def report_direct_download(self, title):
2215 """Report information extraction."""
2216 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2218 def _real_extract(self, url):
2219 mobj = re.match(self._VALID_URL, url)
2221 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# /play/ URLs redirect to a player whose fragment carries the real file
# id; resolve it and recurse once with the canonical /a/a-<id> URL.
2224 urlp = compat_urllib_parse_urlparse(url)
2225 if urlp.path.startswith('/play/'):
2226 request = compat_urllib_request.Request(url)
2227 response = compat_urllib_request.urlopen(request)
2228 redirecturl = response.geturl()
2229 rurlp = compat_urllib_parse_urlparse(redirecturl)
2230 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2231 url = 'http://blip.tv/a/a-' + file_id
2232 return self._real_extract(url)
2239 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2240 request = compat_urllib_request.Request(json_url)
# blip.tv serves different (JSON) metadata to the iTunes user agent.
2241 request.add_header('User-Agent', 'iTunes/10.6.1')
2242 self.report_extraction(mobj.group(1))
2245 urlh = compat_urllib_request.urlopen(request)
2246 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2247 basename = url.split('/')[-1]
2248 title,ext = os.path.splitext(basename)
2249 title = title.decode('UTF-8')
2250 ext = ext.replace('.', '')
2251 self.report_direct_download(title)
2256 'upload_date': None,
2261 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2262 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2263 if info is None: # Regular URL
2265 json_code_bytes = urlh.read()
2266 json_code = json_code_bytes.decode('utf-8')
2267 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2268 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2272 json_data = json.loads(json_code)
2273 if 'Post' in json_data:
2274 data = json_data['Post']
# blip.tv timestamps look like "08-15-12 10:30AM"; normalize to YYYYMMDD.
2278 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2279 video_url = data['media']['url']
2280 umobj = re.match(self._URL_EXT, video_url)
2282 raise ValueError('Can not determine filename extension')
2283 ext = umobj.group(1)
2286 'id': data['item_id'],
2288 'uploader': data['display_name'],
2289 'upload_date': upload_date,
2290 'title': data['title'],
2292 'format': data['media']['mimeType'],
2293 'thumbnail': data['thumbnailUrl'],
2294 'description': data['description'],
2295 'player_url': data['embedUrl'],
2296 'user_agent': 'iTunes/10.6.1',
2298 except (ValueError,KeyError) as err:
2299 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Builds the direct FLV URL from the thumbnail server path embedded in
    the watch page, and takes the title from the page <title> element.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was "self._download.trouble" (AttributeError) —
            # the downloader attribute is "_downloader".
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image_src link points at the media server directory for this
        # movie; appending "/<id>.flv" yields the direct video URL.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date': None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2354 class ComedyCentralIE(InfoExtractor):
2355 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): elided dump — format tables, "try:" lines, loop headers and
# the per-part info dict are missing; recover full source before editing.
2357 # urls can be abbreviations like :thedailyshow or :colbert
2358 # urls for episodes like:
2359 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2360 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2361 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2362 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2363 |(https?://)?(www\.)?
2364 (?P<showname>thedailyshow|colbertnation)\.com/
2365 (full-episodes/(?P<episode>.*)|
2367 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2368 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2371 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2373 _video_extensions = {
2381 _video_dimensions = {
2391 def suitable(cls, url):
2392 """Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is a verbose regex, so this override must pass re.VERBOSE
# (the base-class suitable() does not).
2393 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2395 def report_extraction(self, episode_id):
2396 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2398 def report_config_download(self, episode_id, media_id):
2399 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2401 def report_index_download(self, episode_id):
2402 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2404 def _print_formats(self, formats):
2405 print('Available formats:')
2407 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2410 def _real_extract(self, url):
2411 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2413 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname URLs (":tds", ":colbert") are rewritten to the shows'
# full-episodes index and re-matched against _VALID_URL.
2416 if mobj.group('shortname'):
2417 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2418 url = u'http://www.thedailyshow.com/full-episodes/'
2420 url = u'http://www.colbertnation.com/full-episodes/'
2421 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2422 assert mobj is not None
2424 if mobj.group('clip'):
2425 if mobj.group('showname') == 'thedailyshow':
2426 epTitle = mobj.group('tdstitle')
2428 epTitle = mobj.group('cntitle')
2431 dlNewest = not mobj.group('episode')
2433 epTitle = mobj.group('showname')
2435 epTitle = mobj.group('episode')
2437 req = compat_urllib_request.Request(url)
2438 self.report_extraction(epTitle)
2440 htmlHandle = compat_urllib_request.urlopen(req)
2441 html = htmlHandle.read()
2442 webpage = html.decode('utf-8')
2443 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2444 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The index page may redirect to the newest episode; re-match the final
# URL so the episode group reflects what was actually fetched.
2447 url = htmlHandle.geturl()
2448 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2450 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2452 if mobj.group('episode') == '':
2453 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2455 epTitle = mobj.group('episode')
2457 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2459 if len(mMovieParams) == 0:
2460 # The Colbert Report embeds the information in a without
2461 # a URL prefix; so extract the alternate reference
2462 # and then add the URL prefix manually.
2464 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2465 if len(altMovieParams) == 0:
2466 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2469 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2471 uri = mMovieParams[0][1]
2472 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2473 self.report_index_download(epTitle)
2475 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2476 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2477 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# Each MRSS <item> is one part of the episode; its <guid> carries the
# media id used for the per-part configuration request.
2482 idoc = xml.etree.ElementTree.fromstring(indexXml)
2483 itemEls = idoc.findall('.//item')
2484 for partNum,itemEl in enumerate(itemEls):
2485 mediaId = itemEl.findall('./guid')[0].text
2486 shortMediaId = mediaId.split(':')[-1]
2487 showId = mediaId.split(':')[-2].replace('.com', '')
2488 officialTitle = itemEl.findall('./title')[0].text
2489 officialDate = itemEl.findall('./pubDate')[0].text
2491 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2492 compat_urllib_parse.urlencode({'uri': mediaId}))
2493 configReq = compat_urllib_request.Request(configUrl)
2494 self.report_config_download(epTitle, shortMediaId)
2496 configXml = compat_urllib_request.urlopen(configReq).read()
2497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2498 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2501 cdoc = xml.etree.ElementTree.fromstring(configXml)
2503 for rendition in cdoc.findall('.//rendition'):
2504 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2508 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2511 if self._downloader.params.get('listformats', None):
2512 self._print_formats([i[0] for i in turls])
2515 # For now, just pick the highest bitrate
2516 format,rtmp_video_url = turls[-1]
2518 # Get the format arg from the arg stream
2519 req_format = self._downloader.params.get('format', None)
2521 # Select format if we can find one
2524 format, rtmp_video_url = f, v
# The RTMP URL is rewritten to a plain HTTP mirror on llnwd.net;
# only URLs containing gsp.comedystor can be transformed.
2527 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2529 raise ExtractorError(u'Cannot transform RTMP url')
2530 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2531 video_url = base + m.group('finalid')
2533 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2538 'upload_date': officialDate,
2543 'description': officialTitle,
2545 results.append(info)
2550 class EscapistIE(InfoExtractor):
2551 """Information extractor for The Escapist """
# NOTE(review): elided dump — "try:", "if ... is None:" and the return-dict
# delimiters are missing; comments below only annotate visible intent.
2553 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2554 IE_NAME = u'escapist'
2556 def report_extraction(self, showName):
2557 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2559 def report_config_download(self, showName):
2560 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2562 def _real_extract(self, url):
2563 mobj = re.match(self._VALID_URL, url)
2565 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2567 showName = mobj.group('showname')
2568 videoId = mobj.group('episode')
2570 self.report_extraction(showName)
# Decode the page with the charset advertised in Content-Type,
# defaulting to UTF-8 when no charset is declared.
2572 webPage = compat_urllib_request.urlopen(url)
2573 webPageBytes = webPage.read()
2574 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2575 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2576 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2577 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Description, thumbnail and player URL come from the page's meta tags;
# the config URL is the url-encoded "config=" query of the player URL.
2580 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2581 description = unescapeHTML(descMatch.group(1))
2582 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2583 imgUrl = unescapeHTML(imgMatch.group(1))
2584 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2585 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2586 configUrlMatch = re.search('config=(.*)$', playerUrl)
2587 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2589 self.report_config_download(showName)
2591 configJSON = compat_urllib_request.urlopen(configUrl)
2592 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2593 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2594 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2595 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2598 # Technically, it's JavaScript, not JSON
2599 configJSON = configJSON.replace("'", '"')
2602 config = json.loads(configJSON)
2603 except (ValueError,) as err:
2604 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2607 playlist = config['playlist']
2608 videoUrl = playlist[1]['url']
2613 'uploader': showName,
2614 'upload_date': None,
2617 'thumbnail': imgUrl,
2618 'description': description,
2619 'player_url': playerUrl,
2624 class CollegeHumorIE(InfoExtractor):
2625 """Information extractor for collegehumor.com"""
# NOTE(review): elided dump — "try:" lines, the info-dict opener and
# returns are missing; recover full source before changing logic.
2628 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2629 IE_NAME = u'collegehumor'
2631 def report_manifest(self, video_id):
2632 """Report information extraction."""
2633 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2635 def report_extraction(self, video_id):
2636 """Report information extraction."""
2637 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2639 def _real_extract(self, url):
2640 mobj = re.match(self._VALID_URL, url)
2642 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2644 video_id = mobj.group('videoid')
2649 'upload_date': None,
2652 self.report_extraction(video_id)
# Step 1: fetch the moogaloop metadata XML for title/description/
# thumbnail and the F4M manifest URL.
2653 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2655 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2656 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2657 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2660 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2662 videoNode = mdoc.findall('./video')[0]
2663 info['description'] = videoNode.findall('./description')[0].text
2664 info['title'] = videoNode.findall('./caption')[0].text
2665 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2666 manifest_url = videoNode.findall('./file')[0].text
2668 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Step 2: fetch the Adobe HDS (f4m) manifest and derive the media URL
# from its first <media> node.
2671 manifest_url += '?hdcore=2.10.3'
2672 self.report_manifest(video_id)
2674 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2675 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2676 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2679 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2681 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2682 node_id = media_node.attrib['url']
2683 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2684 except IndexError as err:
2685 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2688 url_pr = compat_urllib_parse_urlparse(manifest_url)
2689 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2696 class XVideosIE(InfoExtractor):
2697 """Information extractor for xvideos.com"""
# NOTE(review): elided dump — "if mobj is None:" guards and the return-dict
# delimiters are missing from this view.
2699 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2700 IE_NAME = u'xvideos'
2702 def report_extraction(self, video_id):
2703 """Report information extraction."""
2704 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2706 def _real_extract(self, url):
2707 mobj = re.match(self._VALID_URL, url)
2709 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2711 video_id = mobj.group(1)
2713 webpage = self._download_webpage(url, video_id)
2715 self.report_extraction(video_id)
# The flash player receives the percent-encoded video URL in the
# "flv_url" query parameter embedded in the page.
2719 mobj = re.search(r'flv_url=(.+?)&', webpage)
2721 self._downloader.trouble(u'ERROR: unable to extract video url')
2723 video_url = compat_urllib_parse.unquote(mobj.group(1))
2727 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2729 self._downloader.trouble(u'ERROR: unable to extract video title')
2731 video_title = mobj.group(1)
2734 # Extract video thumbnail
2735 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2737 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2739 video_thumbnail = mobj.group(0)
2745 'upload_date': None,
2746 'title': video_title,
2748 'thumbnail': video_thumbnail,
2749 'description': None,
2755 class SoundcloudIE(InfoExtractor):
2756 """Information extractor for soundcloud.com
2757 To access the media, the uid of the song and a stream token
2758 must be extracted from the page source and the script must make
2759 a request to media.soundcloud.com/crossdomain.xml. Then
2760 the media can be grabbed by requesting from an url composed
2761 of the stream token and uid
# NOTE(review): elided dump — "try:" lines, guards and the return-dict
# delimiters are missing from this view; recover full source before editing.
2764 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2765 IE_NAME = u'soundcloud'
2767 def __init__(self, downloader=None):
2768 InfoExtractor.__init__(self, downloader)
2770 def report_resolve(self, video_id):
2771 """Report information extraction."""
2772 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2774 def report_extraction(self, video_id):
2775 """Report information extraction."""
2776 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2778 def _real_extract(self, url):
2779 mobj = re.match(self._VALID_URL, url)
2781 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2784 # extract uploader (which is in the url)
2785 uploader = mobj.group(1)
2786 # extract simple title (uploader + slug of song title)
2787 slug_title = mobj.group(2)
2788 simple_title = uploader + u'-' + slug_title
2790 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json maps the public page URL to the API track record
# (including the numeric track id) using a hard-coded client_id.
2792 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2793 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2794 request = compat_urllib_request.Request(resolv_url)
2796 info_json_bytes = compat_urllib_request.urlopen(request).read()
2797 info_json = info_json_bytes.decode('utf-8')
2798 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2799 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2802 info = json.loads(info_json)
2803 video_id = info['id']
2804 self.report_extraction('%s/%s' % (uploader, slug_title))
# The streams endpoint lists the media URLs; the 128kbps MP3 HTTP
# stream is the one used below.
2806 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2807 request = compat_urllib_request.Request(streams_url)
2809 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2810 stream_json = stream_json_bytes.decode('utf-8')
2811 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2812 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2815 streams = json.loads(stream_json)
2816 mediaURL = streams['http_mp3_128_url']
2821 'uploader': info['user']['username'],
2822 'upload_date': info['created_at'],
2823 'title': info['title'],
2825 'description': info['description'],
2829 class InfoQIE(InfoExtractor):
2830 """Information extractor for infoq.com"""
# NOTE(review): elided dump — guards and the return-dict delimiters are
# missing from this view.
2831 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2833 def report_extraction(self, video_id):
2834 """Report information extraction."""
2835 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2837 def _real_extract(self, url):
2838 mobj = re.match(self._VALID_URL, url)
2840 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2843 webpage = self._download_webpage(url, video_id=url)
2844 self.report_extraction(url)
# The page embeds the media path base64-encoded (and url-quoted) in the
# player's jsclassref attribute; decoding yields the RTMP stream name.
2847 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2849 self._downloader.trouble(u'ERROR: unable to extract video url')
2851 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2852 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2855 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2857 self._downloader.trouble(u'ERROR: unable to extract video title')
2859 video_title = mobj.group(1)
2861 # Extract description
2862 video_description = u'No description available.'
2863 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2864 if mobj is not None:
2865 video_description = mobj.group(1)
# The media filename's basename provides both the final video id and
# its extension.
2867 video_filename = video_url.split('/')[-1]
2868 video_id, extension = video_filename.split('.')
2874 'upload_date': None,
2875 'title': video_title,
2876 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2878 'description': video_description,
2883 class MixcloudIE(InfoExtractor):
2884 """Information extractor for www.mixcloud.com"""
# NOTE(review): elided dump — "try:" lines, loop headers and several
# guards are missing; this IE is also marked _WORKING = False.
2886 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2887 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2888 IE_NAME = u'mixcloud'
2890 def __init__(self, downloader=None):
2891 InfoExtractor.__init__(self, downloader)
2893 def report_download_json(self, file_id):
2894 """Report JSON download."""
2895 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2897 def report_extraction(self, file_id):
2898 """Report information extraction."""
2899 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2901 def get_urls(self, jsonData, fmt, bitrate='best'):
2902 """Get urls from 'audio_formats' section in json"""
# A format entry is either a {bitrate: [urls]} mapping or a bare url
# list; TypeError from indexing by bitrate distinguishes the two.
2905 bitrate_list = jsonData[fmt]
2906 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2907 bitrate = max(bitrate_list) # select highest
2909 url_list = jsonData[fmt][bitrate]
2910 except TypeError: # we have no bitrate info.
2911 url_list = jsonData[fmt]
2914 def check_urls(self, url_list):
2915 """Returns 1st active url from list"""
2916 for url in url_list:
2918 compat_urllib_request.urlopen(url)
2920 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2925 def _print_formats(self, formats):
2926 print('Available formats:')
2927 for fmt in formats.keys():
2928 for b in formats[fmt]:
2930 ext = formats[fmt][b][0]
2931 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2932 except TypeError: # we have no bitrate info
2933 ext = formats[fmt][0]
2934 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2937 def _real_extract(self, url):
2938 mobj = re.match(self._VALID_URL, url)
2940 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2942 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on re groups fails on Python 3 str —
# presumably a leftover from the Python 2 code path; confirm upstream.
2943 uploader = mobj.group(1).decode('utf-8')
2944 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2946 # construct API request
2947 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2948 # retrieve .json file with links to files
2949 request = compat_urllib_request.Request(file_url)
2951 self.report_download_json(file_url)
2952 jsonData = compat_urllib_request.urlopen(request).read()
2953 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2954 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2958 json_data = json.loads(jsonData)
2959 player_url = json_data['player_swf_url']
2960 formats = dict(json_data['audio_formats'])
2962 req_format = self._downloader.params.get('format', None)
2965 if self._downloader.params.get('listformats', None):
2966 self._print_formats(formats)
# With no requested format (or "best"), probe each format's URLs and
# take the first live one; otherwise honor the requested format.
2969 if req_format is None or req_format == 'best':
2970 for format_param in formats.keys():
2971 url_list = self.get_urls(formats, format_param)
2973 file_url = self.check_urls(url_list)
2974 if file_url is not None:
2977 if req_format not in formats:
2978 self._downloader.trouble(u'ERROR: format is not available')
2981 url_list = self.get_urls(formats, req_format)
2982 file_url = self.check_urls(url_list)
2983 format_param = req_format
2986 'id': file_id.decode('utf-8'),
2987 'url': file_url.decode('utf-8'),
2988 'uploader': uploader.decode('utf-8'),
2989 'upload_date': None,
2990 'title': json_data['name'],
2991 'ext': file_url.split('.')[-1].decode('utf-8'),
2992 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2993 'thumbnail': json_data['thumbnail_url'],
2994 'description': json_data['description'],
2995 'player_url': player_url.decode('utf-8'),
# Extractor for openclassroom.stanford.edu. The URL regex distinguishes three
# cases handled in _real_extract: a single video page, a course page (playlist
# of VideoPage references), and the site root (playlist of CoursePage refs).
# NOTE(review): lossy excerpt — `if mobj is None:`/`try:`/`return results` and
# dict-literal opener/closer lines are elided; code kept verbatim.
2998 class StanfordOpenClassroomIE(InfoExtractor):
2999 """Information extractor for Stanford's Open ClassRoom"""
3001 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3002 IE_NAME = u'stanfordoc'
3004 def report_download_webpage(self, objid):
3005 """Report information extraction."""
3006 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3008 def report_extraction(self, video_id):
3009 """Report information extraction."""
3010 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3012 def _real_extract(self, url):
3013 mobj = re.match(self._VALID_URL, url)
3015 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: course + video groups present -> fetch the per-video XML metadata.
3017 if mobj.group('course') and mobj.group('video'): # A specific video
3018 course = mobj.group('course')
3019 video = mobj.group('video')
3021 'id': course + '_' + video,
3023 'upload_date': None,
3026 self.report_extraction(info['id'])
3027 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3028 xmlUrl = baseUrl + video + '.xml'
3030 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3031 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3032 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3034 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3036 info['title'] = mdoc.findall('./title')[0].text
3037 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3039 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3041 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: only a course -> scrape the course page for VideoPage links and
# recursively extract each of them.
3043 elif mobj.group('course'): # A course page
3044 course = mobj.group('course')
3049 'upload_date': None,
3052 coursepage = self._download_webpage(url, info['id'],
3053 note='Downloading course info page',
3054 errnote='Unable to download course info page')
3056 m = re.search('<h1>([^<]+)</h1>', coursepage)
3058 info['title'] = unescapeHTML(m.group(1))
3060 info['title'] = info['id']
3062 m = re.search('<description>([^<]+)</description>', coursepage)
3064 info['description'] = unescapeHTML(m.group(1))
3066 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3069 'type': 'reference',
3070 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3074 for entry in info['list']:
3075 assert entry['type'] == 'reference'
3076 results += self.extract(entry['url'])
# Case 3: site root -> scrape the home page for CoursePage links and recurse.
3080 'id': 'Stanford OpenClassroom',
3083 'upload_date': None,
3086 self.report_download_webpage(info['id'])
3087 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3089 rootpage = compat_urllib_request.urlopen(rootURL).read()
3090 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3091 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3094 info['title'] = info['id']
3096 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3099 'type': 'reference',
3100 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3105 for entry in info['list']:
3106 assert entry['type'] == 'reference'
3107 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes <meta> tags for song name,
# performer and the mtvn_uri, then fetches a mediaGen XML document and picks
# the last <rendition> (assumed highest quality) as the download URL.
# NOTE(review): lossy excerpt — `if mobj is None:`/`try:`/`return` lines and
# parts of the final info dict are elided; code kept verbatim.
3110 class MTVIE(InfoExtractor):
3111 """Information extractor for MTV.com"""
3113 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3116 def report_extraction(self, video_id):
3117 """Report information extraction."""
3118 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3120 def _real_extract(self, url):
3121 mobj = re.match(self._VALID_URL, url)
3123 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; default to http for the fetch.
3125 if not mobj.group('proto'):
3126 url = 'http://' + url
3127 video_id = mobj.group('videoid')
3129 webpage = self._download_webpage(url, video_id)
3131 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3133 self._downloader.trouble(u'ERROR: unable to extract song name')
3135 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3136 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3138 self._downloader.trouble(u'ERROR: unable to extract performer')
3140 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3141 video_title = performer + ' - ' + song_name
3143 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3145 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3147 mtvn_uri = mobj.group(1)
3149 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3151 self._downloader.trouble(u'ERROR: unable to extract content id')
3153 content_id = mobj.group(1)
3155 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3156 self.report_extraction(video_id)
3157 request = compat_urllib_request.Request(videogen_url)
3159 metadataXml = compat_urllib_request.urlopen(request).read()
3160 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3161 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3164 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3165 renditions = mdoc.findall('.//rendition')
3167 # For now, always pick the highest quality.
3168 rendition = renditions[-1]
# Derive the extension from the MIME type ('video/mp4' -> 'mp4') and build a
# human-readable format string ext-WxH_bitrate.
3171 _,_,ext = rendition.attrib['type'].partition('/')
3172 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3173 video_url = rendition.find('./src').text
3175 self._downloader.trouble('Invalid rendition field.')
3181 'uploader': performer,
3182 'upload_date': None,
3183 'title': video_title,
# Extractor for v.youku.com. Youku serves videos as numbered FLV segments;
# the segment file ids are derived from an obfuscated 'fileid' string and a
# per-video 'seed' via the mix-string scheme below.
# NOTE(review): lossy excerpt — several guard/`try:`/`return` lines and parts
# of the format-selection branches are elided; code kept verbatim.
3191 class YoukuIE(InfoExtractor):
3192 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3194 def report_download_webpage(self, file_id):
3195 """Report webpage download."""
3196 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3198 def report_extraction(self, file_id):
3199 """Report information extraction."""
3200 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: current ms timestamp concatenated with two random numbers.
# NOTE(review): the enclosing `def _gen_sid(self):` line is elided here.
3203 nowTime = int(time.time() * 1000)
3204 random1 = random.randint(1000,1998)
3205 random2 = random.randint(1000,9999)
3207 return "%d%d%d" %(nowTime,random1,random2)
3209 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the source alphabet driven by 'seed'
# (linear-congruential step: seed = (seed*211 + 30031) % 65536).
3211 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3213 for i in range(len(source)):
3214 seed = (seed * 211 + 30031 ) % 65536
3215 index = math.floor(seed / 65536 * len(source) )
3216 mixed.append(source[int(index)])
3217 source.remove(source[int(index)])
3218 #return ''.join(mixed)
3221 def _get_file_id(self, fileId, seed):
# Translate the '*'-separated index list into characters of the mixed alphabet.
3222 mixed = self._get_file_ID_mix_string(seed)
3223 ids = fileId.split('*')
3227 realId.append(mixed[int(ch)])
3228 return ''.join(realId)
3230 def _real_extract(self, url):
3231 mobj = re.match(self._VALID_URL, url)
3233 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3235 video_id = mobj.group('ID')
3237 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3239 request = compat_urllib_request.Request(info_url, None, std_headers)
3241 self.report_download_webpage(video_id)
3242 jsondata = compat_urllib_request.urlopen(request).read()
3243 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3244 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3247 self.report_extraction(video_id)
3249 jsonstr = jsondata.decode('utf-8')
3250 config = json.loads(jsonstr)
3252 video_title = config['data'][0]['title']
3253 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 when offered; 'worst' takes the other end.
3255 format = self._downloader.params.get('format', None)
3256 supported_format = list(config['data'][0]['streamfileids'].keys())
3258 if format is None or format == 'best':
3259 if 'hd2' in supported_format:
3264 elif format == 'worst':
3272 fileid = config['data'][0]['streamfileids'][format]
3273 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3274 except (UnicodeDecodeError, ValueError, KeyError):
3275 self._downloader.trouble(u'ERROR: unable to extract info section')
3279 sid = self._gen_sid()
3280 fileid = self._get_file_id(fileid, seed)
3282 #column 8,9 of fileid represent the segment number
3283 #fileid[7:9] should be changed
3284 for index, key in enumerate(keys):
# Splice the hex segment index into positions 8-9 of the decoded file id.
3286 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3287 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3290 'id': '%s_part%02d' % (video_id, index),
3291 'url': download_url,
3293 'upload_date': None,
3294 'title': video_title,
3297 files_info.append(info)
# Extractor for video.xnxx.com: downloads the page and pulls the flv URL,
# title and thumbnail out of it with the three class-level regexes.
# NOTE(review): lossy excerpt — `if ... is None:`/`try:`/`return` lines and
# the info-dict opener are elided; code kept verbatim.
3302 class XNXXIE(InfoExtractor):
3303 """Information extractor for xnxx.com"""
3305 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3307 VIDEO_URL_RE = r'flv_url=(.*?)&'
3308 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3309 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3311 def report_webpage(self, video_id):
3312 """Report information extraction"""
3313 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3315 def report_extraction(self, video_id):
3316 """Report information extraction"""
3317 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3319 def _real_extract(self, url):
3320 mobj = re.match(self._VALID_URL, url)
3322 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3324 video_id = mobj.group(1)
3326 self.report_webpage(video_id)
3328 # Get webpage content
3330 webpage_bytes = compat_urllib_request.urlopen(url).read()
3331 webpage = webpage_bytes.decode('utf-8')
3332 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3333 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# The flv URL is percent-encoded inside the page; unquote before use.
3336 result = re.search(self.VIDEO_URL_RE, webpage)
3338 self._downloader.trouble(u'ERROR: unable to extract video url')
3340 video_url = compat_urllib_parse.unquote(result.group(1))
3342 result = re.search(self.VIDEO_TITLE_RE, webpage)
3344 self._downloader.trouble(u'ERROR: unable to extract video title')
3346 video_title = result.group(1)
3348 result = re.search(self.VIDEO_THUMB_RE, webpage)
3350 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3352 video_thumbnail = result.group(1)
3358 'upload_date': None,
3359 'title': video_title,
3361 'thumbnail': video_thumbnail,
3362 'description': None,
# Extractor for Google+ post videos: scrapes the post page for date, uploader
# and title, follows the embedded photos page, collects all resolution
# variants and picks the last (highest-resolution) link.
# NOTE(review): lossy excerpt — `if mobj is None:`/`try:`/`return` lines are
# elided throughout; code kept verbatim.
3366 class GooglePlusIE(InfoExtractor):
3367 """Information extractor for plus.google.com."""
3369 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3370 IE_NAME = u'plus.google'
3372 def __init__(self, downloader=None):
3373 InfoExtractor.__init__(self, downloader)
3375 def report_extract_entry(self, url):
3376 """Report downloading extry"""
3377 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3379 def report_date(self, upload_date):
3380 """Report downloading extry"""
3381 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3383 def report_uploader(self, uploader):
3384 """Report downloading extry"""
3385 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3387 def report_title(self, video_title):
3388 """Report downloading extry"""
3389 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3391 def report_extract_vid_page(self, video_page):
3392 """Report information extraction."""
3393 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3395 def _real_extract(self, url):
3396 # Extract id from URL
3397 mobj = re.match(self._VALID_URL, url)
3399 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3402 post_url = mobj.group(0)
3403 video_id = mobj.group(1)
3405 video_extension = 'flv'
3407 # Step 1, Retrieve post webpage to extract further information
3408 self.report_extract_entry(post_url)
3409 request = compat_urllib_request.Request(post_url)
3411 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3412 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3413 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3416 # Extract update date
3418 pattern = 'title="Timestamp">(.*?)</a>'
3419 mobj = re.search(pattern, webpage)
3421 upload_date = mobj.group(1)
3422 # Convert timestring to a format suitable for filename
3423 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3424 upload_date = upload_date.strftime('%Y%m%d')
3425 self.report_date(upload_date)
# Extract the uploader name from the rel="author" anchor.
3429 pattern = r'rel\="author".*?>(.*?)</a>'
3430 mobj = re.search(pattern, webpage)
3432 uploader = mobj.group(1)
3433 self.report_uploader(uploader)
3436 # Get the first line for title
3438 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3439 mobj = re.search(pattern, webpage)
3441 video_title = mobj.group(1)
3442 self.report_title(video_title)
3444 # Step 2, Stimulate clicking the image box to launch video
3445 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3446 mobj = re.search(pattern, webpage)
3448 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3450 video_page = mobj.group(1)
3451 request = compat_urllib_request.Request(video_page)
3453 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3454 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3455 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3457 self.report_extract_vid_page(video_page)
3460 # Extract video links on video page
3461 """Extract video links of all sizes"""
3462 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3463 mobj = re.findall(pattern, webpage)
3465 self._downloader.trouble(u'ERROR: unable to extract video links')
3467 # Sort in resolution
3468 links = sorted(mobj)
3470 # Choose the lowest of the sort, i.e. highest resolution
3471 video_url = links[-1]
3472 # Only get the url. The resolution part in the tuple has no use anymore
3473 video_url = video_url[-1]
3474 # Treat escaped \u0026 style hex
3476 video_url = video_url.decode("unicode_escape")
3477 except AttributeError: # Python 3
3478 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3484 'uploader': uploader,
3485 'upload_date': upload_date,
3486 'title': video_title,
3487 'ext': video_extension,
# Extractor for nba.com video pages: the download URL is constructed directly
# from the path-derived video id; metadata is scraped via the _findProp helper.
# NOTE(review): lossy excerpt — guard/`return` lines and parts of the info
# dict are elided; code kept verbatim.
3490 class NBAIE(InfoExtractor):
3491 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3494 def _real_extract(self, url):
3495 mobj = re.match(self._VALID_URL, url)
3497 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3500 video_id = mobj.group(1)
3501 if video_id.endswith('/index.html'):
3502 video_id = video_id[:-len('/index.html')]
3504 webpage = self._download_webpage(url, video_id)
3506 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure over `webpage`: return the first regex group (HTML-unescaped),
# or `default` when the pattern does not match.
3507 def _findProp(rexp, default=None):
3508 m = re.search(rexp, webpage)
3510 return unescapeHTML(m.group(1))
3514 shortened_video_id = video_id.rpartition('/')[2]
3515 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3517 'id': shortened_video_id,
3521 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3522 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv using the public api.justin.tv JSON API.
# Channel URLs are paged through _JUSTIN_PAGE_LIMIT clips at a time; single
# broadcast URLs fetch one page.
# NOTE(review): lossy excerpt — `try:`/`return` lines, the per-clip info-dict
# opener and the paging loop header are elided; code kept verbatim.
3526 class JustinTVIE(InfoExtractor):
3527 """Information extractor for justin.tv and twitch.tv"""
3528 # TODO: One broadcast may be split into multiple videos. The key
3529 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3530 # starts at 1 and increases. Can we treat all parts as one video?
3532 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3533 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3534 _JUSTIN_PAGE_LIMIT = 100
3535 IE_NAME = u'justin.tv'
3537 def report_extraction(self, file_id):
3538 """Report information extraction."""
3539 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3541 def report_download_page(self, channel, offset):
3542 """Report attempt to download a single page of videos."""
3543 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3544 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3546 # Return count of items, list of *valid* items
3547 def _parse_page(self, url):
3549 urlh = compat_urllib_request.urlopen(url)
3550 webpage_bytes = urlh.read()
3551 webpage = webpage_bytes.decode('utf-8', 'ignore')
3552 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3553 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# A non-list response is the API's error envelope.
3556 response = json.loads(webpage)
3557 if type(response) != list:
3558 error_text = response.get('error', 'unknown error')
3559 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3562 for clip in response:
3563 video_url = clip['video_file_url']
3565 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD...' -> 'YYYYMMDD' upload date.
3566 video_date = re.sub('-', '', clip['start_time'][:10])
3567 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3568 video_id = clip['id']
3569 video_title = clip.get('title', video_id)
3573 'title': video_title,
3574 'uploader': clip.get('channel_name', video_uploader_id),
3575 'uploader_id': video_uploader_id,
3576 'upload_date': video_date,
3577 'ext': video_extension,
3579 return (len(response), info)
3581 def _real_extract(self, url):
3582 mobj = re.match(self._VALID_URL, url)
3584 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# lastindex == 1 means only the channel group matched (channel archive);
# otherwise group 2 matched a single broadcast id.
3587 api = 'http://api.justin.tv'
3588 video_id = mobj.group(mobj.lastindex)
3590 if mobj.lastindex == 1:
3592 api += '/channel/archives/%s.json'
3594 api += '/broadcast/by_archive/%s.json'
3595 api = api % (video_id,)
3597 self.report_extraction(video_id)
3601 limit = self._JUSTIN_PAGE_LIMIT
3604 self.report_download_page(video_id, offset)
3605 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3606 page_count, page_info = self._parse_page(page_url)
3607 info.extend(page_info)
# A short page means the channel archive is exhausted.
3608 if not paged or page_count != limit:
# Extractor for funnyordie.com: pulls the <source> URL from the <video> tag,
# the title from the player heading and the description from og:description.
# NOTE(review): lossy excerpt — `if m is None:`/`return` lines and the final
# info dict opener/closer are elided; code kept verbatim.
3613 class FunnyOrDieIE(InfoExtractor):
3614 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3616 def _real_extract(self, url):
3617 mobj = re.match(self._VALID_URL, url)
3619 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3622 video_id = mobj.group('id')
3623 webpage = self._download_webpage(url, video_id)
3625 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3627 self._downloader.trouble(u'ERROR: unable to find video information')
3628 video_url = unescapeHTML(m.group('url'))
3630 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3632 self._downloader.trouble(u'Cannot find video title')
3633 title = unescapeHTML(m.group('title'))
3635 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3637 desc = unescapeHTML(m.group('desc'))
3646 'description': desc,
# Extractor for store.steampowered.com game trailer pages. One game page can
# carry several movies; video URLs, titles and thumbnails are matched in
# lockstep with zip(). _VALID_URL is verbose, hence the re.VERBOSE override
# of suitable().
# NOTE(review): lossy excerpt — the gameID part of _VALID_URL and the videos
# list/return lines are elided; code kept verbatim.
3650 class SteamIE(InfoExtractor):
3651 _VALID_URL = r"""http://store.steampowered.com/
3652 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3654 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3658 def suitable(cls, url):
3659 """Receives a URL and returns True if suitable for this IE."""
3660 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3662 def _real_extract(self, url):
3663 m = re.match(self._VALID_URL, url, re.VERBOSE)
3664 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3665 gameID = m.group('gameID')
3666 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3667 webpage = self._download_webpage(videourl, gameID)
3668 mweb = re.finditer(urlRE, webpage)
3669 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3670 titles = re.finditer(namesRE, webpage)
3671 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3672 thumbs = re.finditer(thumbsRE, webpage)
3674 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3675 video_id = vid.group('videoID')
3676 title = vtitle.group('videoName')
3677 video_url = vid.group('videoURL')
3678 video_thumb = thumb.group('thumbnail')
3680 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3685 'title': unescapeHTML(title),
3686 'thumbnail': video_thumb
# Extractor for ustream.tv recorded videos: the download URL is derived
# straight from the video id; title and uploader are scraped from data-*
# attributes in the page.
# NOTE(review): lossy excerpt — the info dict opener/closer and return are
# elided; code kept verbatim.
3691 class UstreamIE(InfoExtractor):
3692 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3693 IE_NAME = u'ustream'
3695 def _real_extract(self, url):
3696 m = re.match(self._VALID_URL, url)
3697 video_id = m.group('videoID')
3698 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3699 webpage = self._download_webpage(url, video_id)
3700 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3701 title = m.group('title')
3702 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3703 uploader = m.group('uploader')
3709 'uploader': uploader
# Extractor for rbmaradio.com shows: the show metadata is embedded as JSON in
# an inline <script> (gon.show=...), and the stream URL is the 'akamai_url'
# with a fixed 256 kbps constant-bitrate query appended.
# NOTE(review): lossy excerpt — the `try:` before json.loads and the info dict
# opener/return are elided; code kept verbatim.
3713 class RBMARadioIE(InfoExtractor):
3714 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3716 def _real_extract(self, url):
3717 m = re.match(self._VALID_URL, url)
3718 video_id = m.group('videoID')
3720 webpage = self._download_webpage(url, video_id)
3721 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3723 raise ExtractorError(u'Cannot find metadata')
3724 json_data = m.group(1)
3727 data = json.loads(json_data)
3728 except ValueError as e:
3729 raise ExtractorError(u'Invalid JSON: ' + str(e))
3731 video_url = data['akamai_url'] + '&cbr=256'
# Derive the extension from the URL path suffix.
3732 url_parts = compat_urllib_parse_urlparse(video_url)
3733 video_ext = url_parts.path.rpartition('.')[2]
3738 'title': data['title'],
3739 'description': data.get('teaser_text'),
3740 'location': data.get('country_of_origin'),
3741 'uploader': data.get('host', {}).get('name'),
3742 'uploader_id': data.get('host', {}).get('slug'),
3743 'thumbnail': data.get('image', {}).get('large_url_2x'),
3744 'duration': data.get('duration'),
# Extractor for youporn.com: sets the age_verified cookie, scrapes title/date/
# uploader, then parses the download list into one format entry per link.
# Format strings are built from the URL path ('<size>p_<bitrate>k').
# NOTE(review): lossy excerpt — several `if result is None:`/`for`/`return`
# lines and the per-format dict opener are elided; code kept verbatim.
3749 class YouPornIE(InfoExtractor):
3750 """Information extractor for youporn.com."""
3751 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3753 def _print_formats(self, formats):
3754 """Print all available formats"""
3755 print(u'Available formats:')
3756 print(u'ext\t\tformat')
3757 print(u'---------------------------------')
3758 for format in formats:
3759 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the entry whose 'format' equals req_format (helper for --format).
3761 def _specific(self, req_format, formats):
3763 if(x["format"]==req_format):
3767 def _real_extract(self, url):
3768 mobj = re.match(self._VALID_URL, url)
3770 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3773 video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie rather than a form post.
3775 req = compat_urllib_request.Request(url)
3776 req.add_header('Cookie', 'age_verified=1')
3777 webpage = self._download_webpage(req, video_id)
3779 # Get the video title
3780 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3782 raise ExtractorError(u'Unable to extract video title')
3783 video_title = result.group('title').strip()
3785 # Get the video date
3786 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3788 self._downloader.report_warning(u'unable to extract video date')
3791 upload_date = result.group('date').strip()
3793 # Get the video uploader
3794 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3796 self._downloader.report_warning(u'unable to extract uploader')
3797 video_uploader = None
3799 video_uploader = result.group('uploader').strip()
3800 video_uploader = clean_html( video_uploader )
3802 # Get all of the formats available
3803 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3804 result = re.search(DOWNLOAD_LIST_RE, webpage)
3806 raise ExtractorError(u'Unable to extract download list')
3807 download_list_html = result.group('download_list').strip()
3809 # Get all of the links from the page
3810 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3811 links = re.findall(LINK_RE, download_list_html)
3812 if(len(links) == 0):
3813 raise ExtractorError(u'ERROR: no known formats available for video')
3815 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3820 # A link looks like this:
3821 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3822 # A path looks like this:
3823 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3824 video_url = unescapeHTML( link )
3825 path = compat_urllib_parse_urlparse( video_url ).path
3826 extension = os.path.splitext( path )[1][1:]
3827 format = path.split('/')[4].split('_')[:2]
3830 format = "-".join( format )
3831 title = u'%s-%s-%s' % (video_title, size, bitrate)
3836 'uploader': video_uploader,
3837 'upload_date': upload_date,
3842 'description': None,
# Format selection mirrors the Mixcloud extractor: list / best / worst /
# all / one specific format.
3846 if self._downloader.params.get('listformats', None):
3847 self._print_formats(formats)
3850 req_format = self._downloader.params.get('format', None)
3851 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3853 if req_format is None or req_format == 'best':
3855 elif req_format == 'worst':
3856 return [formats[-1]]
3857 elif req_format in ('-1', 'all'):
3860 format = self._specific( req_format, formats )
3862 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com: the title comes from the URL itself; the flv
# URL and upload date are scraped out of the page with two regexes.
# NOTE(review): lossy excerpt — `if ... is None:`/`return` lines and the tail
# of the info dict are elided; code kept verbatim.
3868 class PornotubeIE(InfoExtractor):
3869 """Information extractor for pornotube.com."""
3870 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3872 def _real_extract(self, url):
3873 mobj = re.match(self._VALID_URL, url)
3875 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3878 video_id = mobj.group('videoid')
3879 video_title = mobj.group('title')
3881 # Get webpage content
3882 webpage = self._download_webpage(url, video_id)
3885 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3886 result = re.search(VIDEO_URL_RE, webpage)
3888 self._downloader.trouble(u'ERROR: unable to extract video url')
3890 video_url = compat_urllib_parse.unquote(result.group('url'))
3892 #Get the uploaded date
3893 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3894 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): the error text says 'title' but this regex extracts the date —
# looks like a copy-paste slip in the original; confirm before changing.
3896 self._downloader.trouble(u'ERROR: unable to extract video title')
3898 upload_date = result.group('date')
3900 info = {'id': video_id,
3903 'upload_date': upload_date,
3904 'title': video_title,
# Extractor for youjizz.com: reads the title from <title>, follows the embed
# page, and pulls the flv URL out of the player's addVariable("file", ...).
# NOTE(review): lossy excerpt — `if result is None:`/`return` lines and parts
# of the info dict are elided; code kept verbatim.
3910 class YouJizzIE(InfoExtractor):
3911 """Information extractor for youjizz.com."""
3912 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3914 def _real_extract(self, url):
3915 mobj = re.match(self._VALID_URL, url)
3917 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3920 video_id = mobj.group('videoid')
3922 # Get webpage content
3923 webpage = self._download_webpage(url, video_id)
3925 # Get the video title
3926 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3928 raise ExtractorError(u'ERROR: unable to extract video title')
3929 video_title = result.group('title').strip()
3931 # Get the embed page
3932 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3934 raise ExtractorError(u'ERROR: unable to extract embed page')
3936 embed_page_url = result.group(0).strip()
3937 video_id = result.group('videoid')
3939 webpage = self._download_webpage(embed_page_url, video_id)
# The real stream URL lives in the flash player's "file" variable.
3942 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3944 raise ExtractorError(u'ERROR: unable to extract video url')
3945 video_url = result.group('source')
3947 info = {'id': video_id,
3949 'title': video_title,
3952 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON blob from the page,
# then walks the play/next API with a random session id, yielding one entry
# per track until 'at_last_track' is set.
# NOTE(review): lossy excerpt — `mix_id = ...`, the per-track info opener and
# `res.append(...)`/`return res` lines are elided; code kept verbatim.
3956 class EightTracksIE(InfoExtractor):
3958 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3960 def _real_extract(self, url):
3961 mobj = re.match(self._VALID_URL, url)
3963 raise ExtractorError(u'Invalid URL: %s' % url)
3964 playlist_id = mobj.group('id')
3966 webpage = self._download_webpage(url, playlist_id)
3968 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3970 raise ExtractorError(u'Cannot find trax information')
3971 json_like = m.group(1)
3972 data = json.loads(json_like)
# The API requires a client-chosen numeric session token.
3974 session = str(random.randint(0, 1000000000))
3976 track_count = data['tracks_count']
3977 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3978 next_url = first_url
3980 for i in itertools.count():
3981 api_json = self._download_webpage(next_url, playlist_id,
3982 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3983 errnote=u'Failed to download song information')
3984 api_data = json.loads(api_json)
3985 track_data = api_data[u'set']['track']
3987 'id': track_data['id'],
3988 'url': track_data['track_file_stream_url'],
3989 'title': track_data['performer'] + u' - ' + track_data['name'],
3990 'raw_title': track_data['name'],
3991 'uploader_id': data['user']['login'],
# Stop once the API flags the final track; otherwise ask for the next one.
3995 if api_data['set']['at_last_track']:
3997 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: video and thumbnail URLs are derived directly from
# the video id on cdn.keek.com; title and uploader come from page markup.
# NOTE(review): lossy excerpt — the info dict opener/return are elided; code
# kept verbatim.
4000 class KeekIE(InfoExtractor):
4001 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4004 def _real_extract(self, url):
4005 m = re.match(self._VALID_URL, url)
4006 video_id = m.group('videoID')
4007 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4008 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4009 webpage = self._download_webpage(url, video_id)
4010 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
4011 title = unescapeHTML(m.group('title'))
4012 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4013 uploader = clean_html(m.group('uploader'))
4019 'thumbnail': thumbnail,
4020 'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    Handles two URL shapes (see _VALID_URL): a single talk page, or a
    playlist page whose talks are each resolved via _talk_info().
    """
    # NOTE(review): the surrounding alternation markers and closing quotes of
    # this verbose triple-quoted regex appear to be elided from this chunk.
    _VALID_URL=r'''http://www.ted.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden (vs. the base class) because _VALID_URL is a verbose
        # regex and must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # Single talk: wrap the one info dict in a list.
            return [self._talk_info(url)]
        # Otherwise the URL matched the playlist alternative of _VALID_URL.
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the opening of the verbose video_RE raw string (and
        # its closing quotes) appear to be elided from this chunk; the three
        # lines below are the interior of that regex.
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Pair up each matched talk entry with its matched title/link entry;
        # the two regexes are assumed to yield entries in the same page order.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        # NOTE(review): the initialization of `info` (presumably info=[]) and
        # the final `return info` appear to be elided from this chunk.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Each playlist entry is resolved like a standalone talk page.
            info.append(self._talk_info(talk_url,video_id))

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # Pull the numeric id and the media slug out of the embedded
        # talkDetails JavaScript object.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # The direct mp4 download URL is derived from the media slug.
        video_url=self._talk_video_link(mediaSlug)
        # NOTE(review): the lines that open the returned info dict (id, url,
        # ext, title, ...) appear to be elided from this chunk; only the
        # trailing entry below is visible.
        'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    Derives the video id from the URL path, then reads the real download
    URL, title, format and other fields from the site's XML metadata API.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # NOTE(review): the guard around this fallback (presumably
        # `if not video_id:`) appears to be elided from this chunk.
        _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        # Re-encode to bytes before XML parsing since the page was decoded
        # to text by _download_webpage.
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            # NOTE(review): an early `return` likely followed here; elided.
        video_url = url_flv_el.text
        # File extension is taken from the download URL's suffix.
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            # NOTE(review): an early `return` likely followed here; elided.
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): this branch's body (a fallback format value) and
            # the matching `else:` appear to be elided from this chunk.
        format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        # NOTE(review): the `else` branch (description default) is elided.
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # NOTE(review): the lines that open the returned info dict (id, url,
        # ext, title, format, ...) appear to be elided from this chunk; only
        # the trailing entries below are visible.
        'thumbnail': thumbnail,
        'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos.

    Scrapes the title from the HTML page, then reads the media filename and
    duration from a per-video XML document; the last entry of that XML is
    assumed to be the best-quality variant.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        # NOTE(review): the guard around this raise (presumably `if not m:`)
        # appears to be elided from this chunk.
        raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))
        # Per-video XML document listing the available media variants.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Take the last variant listed — presumably the highest quality;
        # TODO confirm against the XML format.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        # Extension is everything after the last dot of the filename.
        video_ext = filename.rpartition('.')[2]
        # NOTE(review): the lines that open the returned info dict (id, url,
        # ext, ...) appear to be elided from this chunk; only the trailing
        # entries below are visible.
        'title': video_title,
        'duration': duration,
4187 def gen_extractors():
4188 """ Return a list of an instance of every supported extractor.
4189 The order does matter; the first extractor matched is the one handling the URL.
4192 YoutubePlaylistIE(),
4216 StanfordOpenClassroomIE(),