2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this chunk is fragmented -- the class attributes that followed
# this docstring in the original (_ready, _downloader, _WORKING) are missing
# from view; verify against the complete file.
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
    # NOTE(review): the two strings below are orphaned docstrings of the
    # working() and initialize() methods -- their `def` lines (and bodies)
    # are missing from this chunk; do not edit in isolation.
    """Getter method for _WORKING."""
    """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the original presumably called self.initialize() here
        # (the line is missing from this chunk) -- TODO confirm.
        return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): orphaned body of the IE_NAME property -- its @property
    # and `def` lines are missing from this chunk.  Strips the trailing
    # 'IE' from the class name (e.g. YoutubeIE -> Youtube).
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): several original lines are missing from this chunk
        # (the `if note is None:` guard, the `try:` opener and the
        # `if errnote is None:` guard) -- fragments kept verbatim below.
        note = u'Downloading video webpage'
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # chain the low-level error into an ExtractorError with traceback
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # charset from the Content-Type header, e.g. "text/html; charset=utf-8"
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): the `if m:` / fallback-encoding lines are missing
        # from this chunk.
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a plain string (no get_full_url) -- the
            # surrounding try for this AttributeError is missing here
            url = url_or_request.get_full_url()
            except AttributeError:
            self._downloader.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump printable regardless of page content
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')
    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): `return video_info` is missing from this chunk

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): the 'url'/'ie_key' entries and the return line are
        # missing from this chunk

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): the 'entries' item, the closing brace and the
        # `if playlist_id is not None:` / `if playlist_title is not None:`
        # guards are missing from this chunk
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): the `_VALID_URL = r'''^(` opener of the verbose regex
    # below is missing from this chunk, as is its closing `)$'''` -- the
    # visible lines are the interior of that raw-string literal and must
    # not be edited as code.
                     (?:https?://)? # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                         )? # optional -> youtube.com/xxxx is OK
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): most entries of the two dicts below (and their closing
    # braces) are missing from this chunk.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {
    # NOTE(review): the @classmethod decorator present in the original is
    # missing from this chunk.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # playlist URLs also match the video regex; defer them to YoutubePlaylistIE
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles in the given language/format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _get_available_subtitles(self, video_id):
        # Query Google's timedtext listing endpoint for the subtitle tracks
        # of *video_id*; on failure returns an (error_message, None) tuple.
        # NOTE(review): the `try:` opener and the final `return sub_lang_list`
        # are missing from this chunk.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # map lang_code -> track name, e.g. {'en': 'English'}
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
295 def _list_available_subtitles(self, video_id):
296 sub_lang_list = self._get_available_subtitles(video_id)
297 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        # NOTE(review): the docstring delimiters and the urlencode'd params
        # dict entries (plus the `try:` opener and the `if not sub:` guard)
        # are missing from this chunk; fragments kept verbatim.
        (error_message, sub_lang, sub)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            return (u'Did not fetch video subtitles', None, None)
        # success: (no error, language code, subtitle file contents)
        return (None, sub_lang, sub)
    def _extract_subtitle(self, video_id):
        # NOTE(review): the docstring delimiters are missing in this chunk;
        # the original documented the return shape as:
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # NOTE(review): `sub_lang = 'en'` and the `else:` line are
            # missing from this chunk
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # NOTE(review): `return [subtitle]` is missing from this chunk
    def _extract_all_subtitles(self, video_id):
        # Fetch every available subtitle track for *video_id*.
        # NOTE(review): the `subtitles = []` initializer and the final
        # `return subtitles` are missing from this chunk.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """Print the available formats as 'itag : ext [dimensions]' lines."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` line is missing from this chunk
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        # Sets the YouTube UI language to English, then (optionally) logs in
        # and confirms age.  NOTE(review): many original lines are missing
        # from this chunk (early returns, `try:` openers, credential
        # unpacking, the login_form_strs/age_form dict openers) -- the
        # surviving fragments are kept verbatim below.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language (best effort: failure only produces a warning)
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Google's sign-in form embeds hidden anti-forgery tokens (GALX, dsh)
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # fragments of the login_form_strs dict (its opener is missing here):
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # if the sign-in form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age (fragment: the age_form dict opener is missing)
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        # Extract the video id (group 2 of _VALID_URL) from *url*.
        # NOTE(review): the `if mobj is None:` guard and `return video_id`
        # are missing from this chunk.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        # NOTE(review): heavily fragmented in this chunk -- `try:` openers,
        # `if mobj is not None:` guards, `break`/`return` lines, `else:`
        # branches and the results-list handling are missing; the surviving
        # fragments are kept verbatim.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # un-escape the backslash-escaped URL found in the page's JS
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants of get_video_info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best effort, scraped from the watch page)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalise separators, then try several date formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): ud['sig'] is accessed without a presence check --
            # entries lacking 'sig' would raise KeyError; verify upstream.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                self._video_dimensions.get(format_param, '???'))

            # fragment of the results dict (the `results.append({` opener,
            # the 'id' entry, the closing brace and `return results` are
            # missing from this chunk):
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        # Delegates straight to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        # Fetch the family-filter disclaimer page, then POST the filter-off
        # confirmation so later requests see unfiltered content.
        # NOTE(review): the `try:` openers and the disclaimer_form dict
        # opener are missing from this chunk.
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (fragment of the POSTed form dict)
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        # NOTE(review): fragmented in this chunk -- `if mobj is None:`
        # guards, `if mobj is not None:` branches and the results-list
        # handling are missing; fragments kept verbatim.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # delegate yt-prefixed ids to the YouTube extractor
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # file extension is assumed to be the last three URL characters
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.report_error(u'unable to extract media URL')

            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')

            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                self._downloader.report_error(u'unable to extract media URL')

            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.report_error(u'unable to extract title')

        # NOTE(review): .decode('utf-8') on these values assumes Python 2
        # byte strings -- would raise on Python 3 str; verify target runtime.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.report_error(u'unable to extract uploader nickname')

        video_uploader = mobj.group(1)

        # fragment of the returned info dict (opener/closing lines missing):
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        # Delegates straight to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # NOTE(review): fragmented in this chunk -- several `if mobj is
        # None:` guards, the quality-selection loop body (`if key in
        # flashvars:` / `max_quality = key` / `break`), `else:` branches and
        # the results list are missing; fragments kept verbatim.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # strip any '_title' suffix and query string from the id segment
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Dailymotion serves family-filtered pages by default; disable it
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # qualities listed in descending order of preference
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # page shows DD-MM-YYYY; reassemble as YYYYMMDD
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # fragment of the returned info dict (opener and 'id'/'url' entries missing):
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        # Delegates straight to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # NOTE(review): fragmented in this chunk -- guards, the `try:`
        # opener, the `video_url = mediaURL` assignment and parts of the
        # returned dict are missing; fragments kept verbatim.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.report_error(u'unable to extract media URL')

        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.report_error(u'unable to extract title')

        # NOTE(review): .decode('utf-8') on these values assumes Python 2
        # byte strings -- verify target runtime.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # fragment of the returned info dict (opener/closing lines missing):
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        # Delegates straight to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
965 def _real_extract(self, url, new_video=True):
966 # Extract ID from URL
967 mobj = re.match(self._VALID_URL, url)
969 self._downloader.report_error(u'Invalid URL: %s' % url)
972 video_id = mobj.group(2)
973 video_extension = 'flv'
975 # Rewrite valid but non-extractable URLs as
976 # extractable English language /watch/ URLs
977 if re.match(self._VPAGE_URL, url) is None:
978 request = compat_urllib_request.Request(url)
980 webpage = compat_urllib_request.urlopen(request).read()
981 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
982 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
985 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
987 self._downloader.report_error(u'Unable to extract id field')
989 yahoo_id = mobj.group(1)
991 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
993 self._downloader.report_error(u'Unable to extract vid field')
995 yahoo_vid = mobj.group(1)
997 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
998 return self._real_extract(url, new_video=False)
1000 # Retrieve video webpage to extract further information
1001 request = compat_urllib_request.Request(url)
1003 self.report_download_webpage(video_id)
1004 webpage = compat_urllib_request.urlopen(request).read()
1005 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1006 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1009 # Extract uploader and title from webpage
1010 self.report_extraction(video_id)
1011 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1013 self._downloader.report_error(u'unable to extract video title')
1015 video_title = mobj.group(1).decode('utf-8')
1017 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1019 self._downloader.report_error(u'unable to extract video uploader')
1021 video_uploader = mobj.group(1).decode('utf-8')
1023 # Extract video thumbnail
1024 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1026 self._downloader.report_error(u'unable to extract video thumbnail')
1028 video_thumbnail = mobj.group(1).decode('utf-8')
1030 # Extract video description
1031 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1033 self._downloader.report_error(u'unable to extract video description')
1035 video_description = mobj.group(1).decode('utf-8')
1036 if not video_description:
1037 video_description = 'No description available.'
1039 # Extract video height and width
1040 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1042 self._downloader.report_error(u'unable to extract video height')
1044 yv_video_height = mobj.group(1)
1046 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1048 self._downloader.report_error(u'unable to extract video width')
1050 yv_video_width = mobj.group(1)
1052 # Retrieve video playlist to extract media URL
1053 # I'm not completely sure what all these options are, but we
1054 # seem to need most of them, otherwise the server sends a 401.
1055 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1056 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1057 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1058 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1059 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1061 self.report_download_webpage(video_id)
1062 webpage = compat_urllib_request.urlopen(request).read()
1063 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1064 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1067 # Extract media URL from playlist XML
1068 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1070 self._downloader.report_error(u'Unable to extract media URL')
1072 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1073 video_url = unescapeHTML(video_url)
1076 'id': video_id.decode('utf-8'),
1078 'uploader': video_uploader,
1079 'upload_date': None,
1080 'title': video_title,
1081 'ext': video_extension.decode('utf-8'),
1082 'thumbnail': video_thumbnail.decode('utf-8'),
1083 'description': video_description,
1087 class VimeoIE(InfoExtractor):
1088 """Information extractor for vimeo.com."""
# NOTE(review): several structural lines ("if mobj is None:", "try:",
# "return [...]") are elided from this listing — gaps in the fused line
# numbering. Comments describe only the visible logic.
1090 # _VALID_URL matches Vimeo URLs
1091 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1094 def __init__(self, downloader=None):
1095 InfoExtractor.__init__(self, downloader)
1097 def report_download_webpage(self, video_id):
1098 """Report webpage download."""
1099 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1101 def report_extraction(self, video_id):
1102 """Report information extraction."""
1103 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1105 def _real_extract(self, url, new_video=True):
# Extract video info from a Vimeo page by parsing the inline player config
# JSON, then pick the best (quality, codec) combination for the media URL.
1106 # Extract ID from URL
1107 mobj = re.match(self._VALID_URL, url)
1109 self._downloader.report_error(u'Invalid URL: %s' % url)
1112 video_id = mobj.group('id')
# Normalize the URL: force https, and map player redirect links back to
# the canonical video page.
1113 if not mobj.group('proto'):
1114 url = 'https://' + url
1115 if mobj.group('direct_link'):
1116 url = 'https://vimeo.com/' + video_id
1118 # Retrieve video webpage to extract further information
1119 request = compat_urllib_request.Request(url, None, std_headers)
1121 self.report_download_webpage(video_id)
1122 webpage_bytes = compat_urllib_request.urlopen(request).read()
1123 webpage = webpage_bytes.decode('utf-8')
1124 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1125 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1128 # Now we begin extracting as much information as we can from what we
1129 # retrieved. First we extract the information common to all extractors,
1130 # and latter we extract those that are Vimeo specific.
1131 self.report_extraction(video_id)
1133 # Extract the config JSON
# The config object is located by slicing the raw page text between the
# ' = {config:' and ',assets:' markers — brittle, but avoids a full JS parse.
1135 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1136 config = json.loads(config)
1138 self._downloader.report_error(u'unable to extract info section')
1142 video_title = config["video"]["title"]
1144 # Extract uploader and uploader_id
1145 video_uploader = config["video"]["owner"]["name"]
1146 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1148 # Extract video thumbnail
1149 video_thumbnail = config["video"]["thumbnail"]
1151 # Extract video description
1152 video_description = get_element_by_attribute("itemprop", "description", webpage)
1153 if video_description: video_description = clean_html(video_description)
1154 else: video_description = u''
1156 # Extract upload date
1157 video_upload_date = None
1158 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1159 if mobj is not None:
# Recombine YYYY-MM-DD groups into the YYYYMMDD format expected upstream.
1160 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1162 # Vimeo specific: extract request signature and timestamp
1163 sig = config['request']['signature']
1164 timestamp = config['request']['timestamp']
1166 # Vimeo specific: extract video codec and quality information
1167 # First consider quality, then codecs, then take everything
1168 # TODO bind to format param
1169 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
# Bucket each available codec by quality tier; later the first non-empty
# tier (hd > sd > other) wins.
1170 files = { 'hd': [], 'sd': [], 'other': []}
1171 for codec_name, codec_extension in codecs:
1172 if codec_name in config["video"]["files"]:
1173 if 'hd' in config["video"]["files"][codec_name]:
1174 files['hd'].append((codec_name, codec_extension, 'hd'))
1175 elif 'sd' in config["video"]["files"][codec_name]:
1176 files['sd'].append((codec_name, codec_extension, 'sd'))
1178 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1180 for quality in ('hd', 'sd', 'other'):
1181 if len(files[quality]) > 0:
1182 video_quality = files[quality][0][2]
1183 video_codec = files[quality][0][0]
1184 video_extension = files[quality][0][1]
1185 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1188 self._downloader.report_error(u'no known codec found')
# The play_redirect endpoint resolves to the actual media file using the
# signature/timestamp pair extracted from the config above.
1191 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1192 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1197 'uploader': video_uploader,
1198 'uploader_id': video_uploader_id,
1199 'upload_date': video_upload_date,
1200 'title': video_title,
1201 'ext': video_extension,
1202 'thumbnail': video_thumbnail,
1203 'description': video_description,
1207 class ArteTvIE(InfoExtractor):
1208 """arte.tv information extractor."""
# NOTE(review): this listing elides some lines (return statements, "try:"
# openers, the info-dict initialization in grep_webpage) — gaps in the fused
# numbering. Comments describe only the visible logic.
1210 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1211 _LIVE_URL = r'index-[0-9]+\.html$'
1213 IE_NAME = u'arte.tv'
1215 def __init__(self, downloader=None):
1216 InfoExtractor.__init__(self, downloader)
1218 def report_download_webpage(self, video_id):
1219 """Report webpage download."""
1220 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1222 def report_extraction(self, video_id):
1223 """Report information extraction."""
1224 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1226 def fetch_webpage(self, url):
# Download `url` and return the raw page body; errors are routed through
# the downloader's report_error.
1227 request = compat_urllib_request.Request(url)
1229 self.report_download_webpage(url)
1230 webpage = compat_urllib_request.urlopen(request).read()
1231 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1232 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1234 except ValueError as err:
1235 self._downloader.report_error(u'Invalid URL: %s' % url)
1239 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex` with `regexFlags`, and build a dict mapping
# each (group-index, key, error) tuple in `matchTuples` to the matched
# group text; missing groups trigger the tuple's error message.
1240 page = self.fetch_webpage(url)
1241 mobj = re.search(regex, page, regexFlags)
1245 self._downloader.report_error(u'Invalid URL: %s' % url)
1248 for (i, key, err) in matchTuples:
1249 if mobj.group(i) is None:
1250 self._downloader.trouble(err)
1253 info[key] = mobj.group(i)
1257 def extractLiveStream(self, url):
# Resolve a live-stream page: locate the videothek JS, then pull the
# rtmp-style path/player/url triple from it for the detected language.
1258 video_lang = url.split('/')[-4]
1259 info = self.grep_webpage(
1261 r'src="(.*?/videothek_js.*?\.js)',
1264 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1267 http_host = url.split('/')[2]
1268 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1269 info = self.grep_webpage(
1271 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1272 '(http://.*?\.swf).*?' +
1276 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1277 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1278 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1281 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1283 def extractPlus7Stream(self, url):
# Resolve an arte+7 (catch-up) page: follow videorefFileUrl, then the
# language-specific <video ref=...>, then read id/name/date and the HD url
# from the final XML document.
1284 video_lang = url.split('/')[-3]
1285 info = self.grep_webpage(
1287 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1290 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1293 next_url = compat_urllib_parse.unquote(info.get('url'))
1294 info = self.grep_webpage(
1296 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1299 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1302 next_url = compat_urllib_parse.unquote(info.get('url'))
1304 info = self.grep_webpage(
1306 r'<video id="(.*?)".*?>.*?' +
1307 '<name>(.*?)</name>.*?' +
1308 '<dateVideo>(.*?)</dateVideo>.*?' +
1309 '<url quality="hd">(.*?)</url>',
1312 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1313 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1314 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1315 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1320 'id': info.get('id'),
1321 'url': compat_urllib_parse.unquote(info.get('url')),
1322 'uploader': u'arte.tv',
1323 'upload_date': info.get('date'),
1324 'title': info.get('title').decode('utf-8'),
1330 def _real_extract(self, url):
# Dispatch: live URLs (matching _LIVE_URL) go to extractLiveStream,
# everything else is treated as an arte+7 catch-up page.
1331 video_id = url.split('/')[-1]
1332 self.report_extraction(video_id)
1334 if re.search(self._LIVE_URL, video_id) is not None:
1335 self.extractLiveStream(url)
1338 info = self.extractPlus7Stream(url)
1343 class GenericIE(InfoExtractor):
1344 """Generic last-resort information extractor."""
# NOTE(review): several lines are elided from this listing (method bodies of
# the HEAD request helpers, "try:" openers, return statements) — gaps in the
# fused numbering. Comments describe only the visible logic.
1347 IE_NAME = u'generic'
1349 def __init__(self, downloader=None):
1350 InfoExtractor.__init__(self, downloader)
1352 def report_download_webpage(self, video_id):
1353 """Report webpage download."""
1354 if not self._downloader.params.get('test', False):
1355 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1356 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1358 def report_extraction(self, video_id):
1359 """Report information extraction."""
1360 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1362 def report_following_redirect(self, new_url):
1363 """Report information extraction."""
1364 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1366 def _test_redirect(self, url):
1367 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Issue a HEAD request and follow redirects manually so shortener links
# resolve to their target without downloading the body.
1368 class HeadRequest(compat_urllib_request.Request):
1369 def get_method(self):
1372 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1374 Subclass the HTTPRedirectHandler to make it use our
1375 HeadRequest also on the redirected URL
1377 def redirect_request(self, req, fp, code, msg, headers, newurl):
1378 if code in (301, 302, 303, 307):
1379 newurl = newurl.replace(' ', '%20')
# Strip body-related headers; a HEAD request has no body to describe.
1380 newheaders = dict((k,v) for k,v in req.headers.items()
1381 if k.lower() not in ("content-length", "content-type"))
1382 return HeadRequest(newurl,
1384 origin_req_host=req.get_origin_req_host(),
1387 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1389 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1391 Fallback to GET if HEAD is not allowed (405 HTTP error)
1393 def http_error_405(self, req, fp, code, msg, headers):
1397 newheaders = dict((k,v) for k,v in req.headers.items()
1398 if k.lower() not in ("content-length", "content-type"))
1399 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1401 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener from scratch so only the handlers listed here
# participate (no default redirect handling that would bypass ours).
1405 opener = compat_urllib_request.OpenerDirector()
1406 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1407 HTTPMethodFallback, HEADRedirectHandler,
1408 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1409 opener.add_handler(handler())
1411 response = opener.open(HeadRequest(url))
1412 new_url = response.geturl()
1417 self.report_following_redirect(new_url)
1420 def _real_extract(self, url):
# Last-resort extraction: resolve redirects, then scan the raw HTML for a
# direct media URL using progressively broader regexes.
1421 new_url = self._test_redirect(url)
1422 if new_url: return [self.url_result(new_url)]
1424 video_id = url.split('/')[-1]
1426 webpage = self._download_webpage(url, video_id)
1427 except ValueError as err:
1428 # since this is the last-resort InfoExtractor, if
1429 # this error is thrown, it'll be thrown here
1430 self._downloader.report_error(u'Invalid URL: %s' % url)
1433 self.report_extraction(video_id)
1434 # Start with something easy: JW Player in SWFObject
1435 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1437 # Broaden the search a little bit
1438 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1440 # Broaden the search a little bit: JWPlayer JS loader
1441 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1443 self._downloader.report_error(u'Invalid URL: %s' % url)
1446 # It's possible that one of the regexes
1447 # matched, but returned an empty group:
1448 if mobj.group(1) is None:
1449 self._downloader.report_error(u'Invalid URL: %s' % url)
1452 video_url = compat_urllib_parse.unquote(mobj.group(1))
1453 video_id = os.path.basename(video_url)
1455 # here's a fun little line of code for you:
# Derive extension and id from the media file's basename.
1456 video_extension = os.path.splitext(video_id)[1][1:]
1457 video_id = os.path.splitext(video_id)[0]
1459 # it's tempting to parse this further, but you would
1460 # have to take into account all the variations like
1461 # Video Title - Site Name
1462 # Site Name | Video Title
1463 # Video Title - Tagline | Site Name
1464 # and so on and so forth; it's just not practical
1465 mobj = re.search(r'<title>(.*)</title>', webpage)
1467 self._downloader.report_error(u'unable to extract title')
1469 video_title = mobj.group(1)
1471 # video uploader is domain name
1472 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1474 self._downloader.report_error(u'unable to extract title')
1476 video_uploader = mobj.group(1)
1481 'uploader': video_uploader,
1482 'upload_date': None,
1483 'title': video_title,
1484 'ext': video_extension,
1488 class YoutubeSearchIE(InfoExtractor):
1489 """Information Extractor for YouTube search queries."""
# NOTE(review): this listing elides some lines ("if mobj is None:", "try:",
# "if prefix == '':", loop-state init in _download_n_results) — gaps in the
# fused numbering. Comments describe only the visible logic.
1490 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1491 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1492 _max_youtube_results = 1000
1493 IE_NAME = u'youtube:search'
1495 def __init__(self, downloader=None):
1496 InfoExtractor.__init__(self, downloader)
1498 def report_download_page(self, query, pagenum):
1499 """Report attempt to download search page with given number."""
1500 query = query.decode(preferredencoding())
1501 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1503 def _real_extract(self, query):
# Parse the "ytsearch<N|all>:" prefix and delegate to _download_n_results
# with the requested result count (1 by default, capped at the maximum).
1504 mobj = re.match(self._VALID_URL, query)
1506 self._downloader.report_error(u'invalid search query "%s"' % query)
1509 prefix, query = query.split(':')
1511 query = query.encode('utf-8')
1513 self._download_n_results(query, 1)
1515 elif prefix == 'all':
1516 self._download_n_results(query, self._max_youtube_results)
1522 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1524 elif n > self._max_youtube_results:
1525 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1526 n = self._max_youtube_results
1527 self._download_n_results(query, n)
1529 except ValueError: # parsing prefix as integer fails
1530 self._download_n_results(query, 1)
1533 def _download_n_results(self, query, n):
1534 """Downloads a specified number of results for a query"""
# Page through the GData JSON-C API (50 results per page) until `n` ids
# are collected or the API reports no more items, then enqueue each watch
# URL on the downloader.
1540 while (50 * pagenum) < limit:
1541 self.report_download_page(query, pagenum+1)
1542 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1543 request = compat_urllib_request.Request(result_url)
1545 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1546 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1547 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1549 api_response = json.loads(data)['data']
1551 if not 'items' in api_response:
1552 self._downloader.trouble(u'[youtube] No video results')
1555 new_ids = list(video['id'] for video in api_response['items'])
1556 video_ids += new_ids
# Tighten the limit to the API's reported total so paging stops early
# when fewer than `n` results exist.
1558 limit = min(n, api_response['totalItems'])
1561 if len(video_ids) > n:
1562 video_ids = video_ids[:n]
1563 for id in video_ids:
1564 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1568 class GoogleSearchIE(InfoExtractor):
1569 """Information Extractor for Google Video search queries."""
# NOTE(review): structurally parallel to YoutubeSearchIE; some lines are
# elided from this listing (guards, "try:" openers, loop-state init) —
# comments describe only the visible logic.
1570 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1571 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1572 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1573 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1574 _max_google_results = 1000
1575 IE_NAME = u'video.google:search'
1577 def __init__(self, downloader=None):
1578 InfoExtractor.__init__(self, downloader)
1580 def report_download_page(self, query, pagenum):
1581 """Report attempt to download playlist page with given number."""
1582 query = query.decode(preferredencoding())
1583 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1585 def _real_extract(self, query):
# Parse the "gvsearch<N|all>:" prefix and delegate to _download_n_results
# with the requested result count, capped at _max_google_results.
1586 mobj = re.match(self._VALID_URL, query)
1588 self._downloader.report_error(u'invalid search query "%s"' % query)
1591 prefix, query = query.split(':')
1593 query = query.encode('utf-8')
1595 self._download_n_results(query, 1)
1597 elif prefix == 'all':
1598 self._download_n_results(query, self._max_google_results)
1604 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1606 elif n > self._max_google_results:
1607 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1608 n = self._max_google_results
1609 self._download_n_results(query, n)
1611 except ValueError: # parsing prefix as integer fails
1612 self._download_n_results(query, 1)
1615 def _download_n_results(self, query, n):
1616 """Downloads a specified number of results for a query"""
# Scrape HTML result pages (10 per page) for docid links, enqueueing each
# videoplay URL; paging stops when `n` ids are found or no "next" link
# (_MORE_PAGES_INDICATOR) remains.
1622 self.report_download_page(query, pagenum)
1623 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1624 request = compat_urllib_request.Request(result_url)
1626 page = compat_urllib_request.urlopen(request).read()
1627 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1628 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1631 # Extract video identifiers
1632 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1633 video_id = mobj.group(1)
1634 if video_id not in video_ids:
1635 video_ids.append(video_id)
1636 if len(video_ids) == n:
1637 # Specified n videos reached
1638 for id in video_ids:
1639 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1642 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1643 for id in video_ids:
1644 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1647 pagenum = pagenum + 1
1650 class YahooSearchIE(InfoExtractor):
1651 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): structurally parallel to GoogleSearchIE; some lines are
# elided from this listing (guards, "try:" openers, loop-state init) —
# comments describe only the visible logic.
1654 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1655 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1656 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1657 _MORE_PAGES_INDICATOR = r'\s*Next'
1658 _max_yahoo_results = 1000
1659 IE_NAME = u'video.yahoo:search'
1661 def __init__(self, downloader=None):
1662 InfoExtractor.__init__(self, downloader)
1664 def report_download_page(self, query, pagenum):
1665 """Report attempt to download playlist page with given number."""
1666 query = query.decode(preferredencoding())
1667 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1669 def _real_extract(self, query):
# Parse the "yvsearch<N|all>:" prefix and delegate to _download_n_results
# with the requested result count, capped at _max_yahoo_results.
1670 mobj = re.match(self._VALID_URL, query)
1672 self._downloader.report_error(u'invalid search query "%s"' % query)
1675 prefix, query = query.split(':')
1677 query = query.encode('utf-8')
1679 self._download_n_results(query, 1)
1681 elif prefix == 'all':
1682 self._download_n_results(query, self._max_yahoo_results)
1688 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1690 elif n > self._max_yahoo_results:
1691 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1692 n = self._max_yahoo_results
1693 self._download_n_results(query, n)
1695 except ValueError: # parsing prefix as integer fails
1696 self._download_n_results(query, 1)
1699 def _download_n_results(self, query, n):
1700 """Downloads a specified number of results for a query"""
# Scrape HTML result pages for /watch/ links; `already_seen` deduplicates
# ids across pages. Each collected id is enqueued as a watch URL; paging
# stops at `n` ids or when no "Next" link remains.
1703 already_seen = set()
1707 self.report_download_page(query, pagenum)
1708 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1709 request = compat_urllib_request.Request(result_url)
1711 page = compat_urllib_request.urlopen(request).read()
1712 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1713 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1716 # Extract video identifiers
1717 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1718 video_id = mobj.group(1)
1719 if video_id not in already_seen:
1720 video_ids.append(video_id)
1721 already_seen.add(video_id)
1722 if len(video_ids) == n:
1723 # Specified n videos reached
1724 for id in video_ids:
1725 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1728 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1729 for id in video_ids:
1730 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1733 pagenum = pagenum + 1
1736 class YoutubePlaylistIE(InfoExtractor):
1737 """Information Extractor for YouTube playlists."""
# NOTE(review): the verbose _VALID_URL regex and some surrounding lines are
# partially elided from this listing (gaps in the fused numbering, e.g. the
# alternation structure between the two id groups and the loop header in
# _real_extract). Comments describe only the visible logic.
1739 _VALID_URL = r"""(?:
1744 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1745 \? (?:.*?&)*? (?:p|a|list)=
1748 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1751 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1753 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1755 IE_NAME = u'youtube:playlist'
1757 def __init__(self, downloader=None):
1758 InfoExtractor.__init__(self, downloader)
1761 def suitable(cls, url):
1762 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base suitable() because _VALID_URL is written with
# re.VERBOSE whitespace/comments.
1763 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1765 def report_download_page(self, playlist_id, pagenum):
1766 """Report attempt to download playlist page with given number."""
1767 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1769 def _real_extract(self, url):
# Fetch playlist entries from the GData JSON API page by page, collect
# (position, watch-url) pairs, and return a single playlist result of
# url_result entries sorted by playlist position.
1770 # Extract playlist id
1771 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1773 self._downloader.report_error(u'invalid url: %s' % url)
1776 # Download playlist videos from API
1777 playlist_id = mobj.group(1) or mobj.group(2)
1782 self.report_download_page(playlist_id, page_num)
1784 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1786 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1787 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1788 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1792 response = json.loads(page)
1793 except ValueError as err:
1794 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1797 if 'feed' not in response:
1798 self._downloader.report_error(u'Got a malformed response from YouTube API')
1800 if 'entry' not in response['feed']:
1801 # Number of videos is a multiple of self._MAX_RESULTS
1804 playlist_title = response['feed']['title']['$t']
# Entries without a 'content' key (e.g. deleted/private videos) are
# skipped; position is kept so the final list can be sorted.
1806 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1807 for entry in response['feed']['entry']
1808 if 'content' in entry ]
1810 if len(response['feed']['entry']) < self._MAX_RESULTS:
1814 videos = [v[1] for v in sorted(videos)]
1816 url_results = [self.url_result(url, 'Youtube') for url in videos]
1817 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1820 class YoutubeChannelIE(InfoExtractor):
1821 """Information Extractor for YouTube channels."""
# NOTE(review): some lines are elided from this listing ("try:" openers,
# state init for video_ids/pagenum, loop headers) — gaps in the fused
# numbering. Comments describe only the visible logic.
1823 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1824 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1825 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1826 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1827 IE_NAME = u'youtube:channel'
1829 def report_download_page(self, channel_id, pagenum):
1830 """Report attempt to download channel page with given number."""
1831 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1833 def extract_videos_from_page(self, page):
# Collect unique watch?v= ids from a page of channel HTML, preserving
# first-seen order.
1835 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1836 if mobj.group(1) not in ids_in_page:
1837 ids_in_page.append(mobj.group(1))
1840 def _real_extract(self, url):
# Two-phase pagination: the first channel page is plain HTML; subsequent
# pages come from the channel_ajax JSON endpoint whose 'content_html'
# fragment is scanned with the same id extractor.
1841 # Extract channel id
1842 mobj = re.match(self._VALID_URL, url)
1844 self._downloader.report_error(u'invalid url: %s' % url)
1847 # Download channel page
1848 channel_id = mobj.group(1)
1852 self.report_download_page(channel_id, pagenum)
1853 url = self._TEMPLATE_URL % (channel_id, pagenum)
1854 request = compat_urllib_request.Request(url)
1856 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1857 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1858 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1861 # Extract video identifiers
1862 ids_in_page = self.extract_videos_from_page(page)
1863 video_ids.extend(ids_in_page)
1865 # Download any subsequent channel pages using the json-based channel_ajax query
1866 if self._MORE_PAGES_INDICATOR in page:
1868 pagenum = pagenum + 1
1870 self.report_download_page(channel_id, pagenum)
1871 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1872 request = compat_urllib_request.Request(url)
1874 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1875 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1876 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1879 page = json.loads(page)
1881 ids_in_page = self.extract_videos_from_page(page['content_html'])
1882 video_ids.extend(ids_in_page)
# 'load_more_widget_html' no longer containing the load-more marker means
# the AJAX paging is exhausted.
1884 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1887 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1889 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1890 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1891 return [self.playlist_result(url_entries, channel_id)]
1894 class YoutubeUserIE(InfoExtractor):
1895 """Information Extractor for YouTube users."""
# NOTE(review): some lines are elided from this listing ("if mobj is None:",
# "try:" openers, pagenum/video_ids init, the loop header and break) — gaps
# in the fused numbering. Comments describe only the visible logic.
1897 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1898 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1899 _GDATA_PAGE_SIZE = 50
1900 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1901 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1902 IE_NAME = u'youtube:user'
1904 def __init__(self, downloader=None):
1905 InfoExtractor.__init__(self, downloader)
1907 def report_download_page(self, username, start_index):
1908 """Report attempt to download user page."""
1909 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1910 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1912 def _real_extract(self, url):
# Page through the user's uploads feed via the GData API, scraping watch
# ids from the response, and return one playlist of url_result entries
# titled with the username.
1914 mobj = re.match(self._VALID_URL, url)
1916 self._downloader.report_error(u'invalid url: %s' % url)
1919 username = mobj.group(1)
1921 # Download video ids using YouTube Data API. Result size per
1922 # query is limited (currently to 50 videos) so we need to query
1923 # page by page until there are no video ids - it means we got
1930 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1931 self.report_download_page(username, start_index)
1933 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1936 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1937 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1938 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1941 # Extract video identifiers
1944 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1945 if mobj.group(1) not in ids_in_page:
1946 ids_in_page.append(mobj.group(1))
1948 video_ids.extend(ids_in_page)
1950 # A little optimization - if current page is not
1951 # "full", ie. does not contain PAGE_SIZE video ids then
1952 # we can assume that this page is the last one - there
1953 # are no more ids on further pages - no need to query
1956 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1961 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1962 url_results = [self.url_result(url, 'Youtube') for url in urls]
1963 return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to the numeric user id, then pages through the
    mobile episode-list endpoint collecting video links, returned as a
    single playlist result.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The numeric user id is embedded in the page markup.
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # FIX: use compat_str(err) instead of str(err) for
                # consistency with every other handler in this file
                # (unicode-safe on Python 2).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # FIX: decode the response — on Python 3 the raw bytes cannot
            # be searched with the text regexes below.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: raw string for the regex (avoids invalid-escape
                # deprecation for '\s').
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        # FIX: all values are already text after the decode above;
        # calling .decode('utf-8') on them raised AttributeError on
        # Python 3.
        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials, if any.

        Login failures are reported as warnings only; extraction is still
        attempted anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # FIX: decode the response — on Python 3 the bytes result
            # cannot be searched with the text regex below.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # FIX: spelling of "exceeded" in the user-facing warning.
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video parameters are passed to the SWF player between these
        # two JavaScript fragments, URL-encoded as JSON.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report direct download."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; resolve it and recurse on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # FIX: `url` is already text, so `title` is a str;
                # calling .decode('UTF-8') on it raised AttributeError
                # on Python 3.
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # FIX: dropped redundant u'ERROR: ' prefix — ExtractorError
            # messages are already rendered as errors.
            raise ExtractorError(u'unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # FIX: was `self._download.report_error(...)` — a typo for
            # `self._downloader` that raised AttributeError on every
            # invalid URL instead of reporting the error.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server path; the flv URL is
        # derived from it below.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): reconstructed mappings — assumed all bitrates map to
    # mp4; verify against the original table.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # The pattern above is written with re.VERBOSE, so the default
        # InfoExtractor.suitable would not match it.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortname forms (":tds", ":colbert", ...) redirect to the
        # newest full episode of the corresponding show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Following the redirect gives the concrete episode URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset announced in Content-Type,
            # defaulting to utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The player URL carries the config URL in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # F4M manifest: media url attribute + document id form the
            # final fragment URL.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)

        info['url'] = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to track metadata via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            # NOTE(review): 'created_at' is not in YYYYMMDD form —
            # confirm downstream handling.
            'upload_date': info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): same IE_NAME as SoundcloudIE — presumably should be
    # distinct (e.g. u'soundcloud:set'); left unchanged to avoid breaking
    # extractor selection.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # FIX: replaced deprecated self._downloader.trouble(u'ERROR: ...')
            # calls with report_error(...) throughout, consistent with
            # SoundcloudIE and the rest of this file.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real media path is base64-encoded in the
        # page source.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            # Fall back to the highest available bitrate when the request is
            # absent, 'best', or not offered for this format.
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Dump every available format/bitrate pair to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format and keep the first one with a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                       note='Downloading course info page',
                                       errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            # Recursively extract every referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]
            results = []
            # Recursively extract every referenced course page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name and performer come from <meta> tags on the page.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUGFIX: message previously read 'unable to mtvn_uri' (missing verb).
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic shuffle of the alphabet driven by a linear
        # congruential generator seeded by the server-provided value.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Translate the '*'-separated index list into characters of the
        # shuffled alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Regexes used against the downloaded page.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report extracted entry date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report extracted uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report extracted title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes on the video page.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First regex group, HTML-unescaped, or `default` on no match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date', so the date was
            # never picked up as the standard 'upload_date' field.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # On error the API returns a dict with an 'error' key instead.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Whole channel: archives endpoint is paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we've reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player-page heading; fall back to the <title> tag.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.trouble(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Override the base implementation because _VALID_URL is verbose.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # Zip the three parallel scans; they appear in the same order on the page.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return videos
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The CDN URL is derived directly from the recording id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy) videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container from the URL suffix.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # BUGFIX: fallback title previously read 'World Start Hip Hop'.
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON blob in an inline script.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of `formats` matching `req_format`, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Age gate: the site requires this cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Video.mp4?nvb=...
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Video.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: previously tested the stale `result` variable here, so a
            # non-existent requested format was never reported.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: this failure previously reported 'unable to extract
            # video title', which is the wrong field.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the 'if mobj is None:' guard before this report (and
        # the guards before each 'raise' below) appear elided in this view —
        # confirm against the full file.
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page; the real media URL only appears there.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # Prefer the numeric id from the embed URL over the slug in the page URL.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The file URL is handed to the SWF player via so.addVariable("file", ...).
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        # (info dict assembly partially elided in this view)
        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of audio tracks)."""

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an 'if mobj is None:' guard appears elided before this
        # raise — confirm against the full file.
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment: PAGE.mix = {...};
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # Random session id required by the 8tracks playback API.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): 'mix_id' is used here and below but its assignment
        # (presumably mix_id = data['id']) is elided in this view.
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # Walk the API one track at a time until 'at_last_track' is set.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # (entry-dict assembly partially elided in this view)
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
                # NOTE(review): the loop-exit statement (break) is elided here.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Media and thumbnail URLs are derived directly from the video id —
        # no page parsing needed for the download URL itself.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        # Title from the Open Graph meta tag.
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        # NOTE(review): no None-check here — m.group raises AttributeError if
        # the og:title tag is missing; confirm whether the full file guards this.
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader'))
        # (info dict assembly partially elided in this view)
                'thumbnail': thumbnail,
                'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    # Verbose regex: matches either a playlist URL or a single talk URL.
    # NOTE(review): the opening/alternation lines of this verbose pattern
    # appear elided in this view — confirm against the full file.
    _VALID_URL=r'''http://www.ted.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        # NOTE(review): the @classmethod decorator appears elided above this def.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # Single talk page.
            return [self._talk_info(url)]
        # Otherwise the URL names a playlist.
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the assignment opening this verbose regex (video_RE=r''')
        # appears elided in this view.
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        # Playlist title sits in the page's headline <span>.
        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Delegate each talk back to this same extractor via url_result.
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # Talk metadata (numeric id and media slug) sits in an inline
        # talkDetails script block; parsed with a verbose regex.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # (info dict assembly partially elided in this view)
                'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata served as XML)."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # NOTE(review): the guard ('if not video_id:') before this fallback
        # appears elided in this view — confirm against the full file.
        _, video_id = os.path.split(url_parent_path)

        # Fetch and parse the per-video XML metadata document.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            # NOTE(review): a 'return' presumably follows the report in the full file.
            self._downloader.report_error(u'unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): the fallback body of this branch is elided in this view.
        format = format_id_el.text
        # Optional fields: description and preview image.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # (info dict assembly partially elided in this view)
                'thumbnail': thumbnail,
                'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        # NOTE(review): an 'if m is None:' guard appears elided before this raise.
        raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # Media metadata lives in a per-video XML document on the flash CDN.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # NOTE(review): picking the last child — presumably the best-quality
        # variant is listed last; confirm against the XML schema.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # (info dict assembly partially elided in this view)
            'title': video_title,
            'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard elided in this view. Also
        # note this class mixes the older trouble() call with report_error()
        # used by sibling extractors — worth unifying once the full file is
        # in view; left unchanged here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Media URL is embedded in a JS player config: file: "...",
        m = re.search(r'file: "(.*?)",', webpage)
        self._downloader.report_error(u'unable to find video url')
        video_url = m.group(1)

        # Title from the Open Graph meta tag, with the site prefix stripped.
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Optional description and uploader (guards appear elided in this view).
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1))

        # (info dict assembly partially elided in this view)
            'description': desc,
            'uploader': uploader
class ARDIE(InfoExtractor):
    """Information extractor for ARD Mediathek / Das Erste Mediathek."""

    # Matches both ardmediathek.de and mediathek.daserste.de URLs.
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # Streams are registered in page JS as:
    # mediaCollection.addMediaStream(media_type, quality, rtmp_url, video_url, ...)
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        # NOTE(review): the if/else choosing between the documentId query
        # parameter and the path segment appears elided in this view.
        video_id = numid.group(1)
        video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # NOTE(review): an 'if not streams:' guard presumably wraps this pair —
        # no streams plus an "fsk" marker means age-restricted content that is
        # only served at night.
        assert '"fsk"' in html
        self._downloader.report_error(u'this video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
            # NOTE(review): the 'else:' introducing the plain-HTTP branch
            # appears elided before the next two lines.
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the 'return [' opening and most of the extractor list are
    # elided in this view — only a few representative entries are visible.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``.

    The class is resolved from this module's global namespace, so the
    extractor must be defined (or imported) in this file; e.g. passing
    'Youtube' yields the YoutubeIE class.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]