2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): at least one original statement between the docstring
        # and this call is not visible in this excerpt.
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Takes `cls`, so presumably decorated with @classmethod in the full
        # source -- the decorator line is not visible in this excerpt.
        return re.match(cls._VALID_URL, url) is not None

        # NOTE(review): the enclosing `def` lines for the two fragments below
        # are not visible in this excerpt; code kept as-is.
        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a line between the docstring and this return is not
        # visible in this excerpt.
        return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): fragment -- the enclosing definition for the return
        # below (apparently an IE name helper that strips the trailing "IE")
        # is not visible in this excerpt.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): the `if note is None:`, `try:` and `if errnote is
        # None:` headers appear to be elided from this excerpt; code kept
        # as-is.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, keeping the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        # NOTE(review): several branch headers (`if m:`, `try:`, the
        # charset fallback) are elided from this excerpt; code kept as-is.
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of e.g. "text/html; charset=utf-8".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a plain string instead of a Request object.
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump binary-safe on any terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
    # Methods for following #608.
    # They set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # (return statement not visible in this excerpt)

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # (remaining dict entries and return not visible in this excerpt)

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # (dict continuation not visible in this excerpt)
        # The id/title keys are only meaningful when the caller supplied them;
        # presumably each assignment is guarded by an elided `if ... is not None:`.
        video_info['id'] = playlist_id
        video_info['title'] = playlist_title
        # (return statement not visible in this excerpt)
    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        # NOTE(review): several branch/loop headers are elided from this
        # excerpt; code kept as-is.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            # list-of-patterns case (its loop header is elided):
            mobj = re.search(p, string, flags)

        # Colourise the field name on terminals that support ANSI escapes.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name

            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
        raise ExtractorError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        # (a None-guard between these two lines is not visible in this excerpt)
        return clean_html(res).strip()
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    def _make_valid_url(cls):
        # Takes `cls`: presumably @classmethod in the full source (the
        # decorator line is not visible in this excerpt).
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def suitable(cls, url):
        # Takes `cls`: presumably @classmethod in the full source.
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        """Dispatch a "<key>N:terms" / "<key>all:terms" search query."""
        # NOTE(review): several branch headers (`if mobj is None:`, the empty
        # prefix branch, int conversion of n) are elided from this excerpt.
        mobj = re.match(self._make_valid_url(), query)
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix -> a single result.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): the lines below are the body of a verbose (_VALID_URL)
    # regexp; its opening `_VALID_URL = r'''` line is not visible in this
    # excerpt.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (most entries elided in this excerpt).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display resolution (entries elided in this excerpt).
    _video_dimensions = {
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match the video regexp, so defer them to the
        # playlist IE first.  Takes `cls`: presumably @classmethod in the
        # full source (decorator line not visible here).
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        # (original docstring wrongly said "download video info webpage")
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download request for one subtitle language/format."""
        # (original docstring wrongly said "download video info webpage")
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (original docstring wrongly said "Report extracted video URL")
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle language list for video_id.

        On error returns a (message, None) tuple; on success it appears to
        return the {lang_code: lang_name} dict -- the final return line is
        not visible in this excerpt.
        """
        # NOTE(review): the `try:` header around the urlopen call is elided.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Map lang_code -> human-readable name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Fetch one subtitle track; returns a tuple
        (error_message, sub_lang, sub)
        """
        # NOTE(review): the urlencode dict entries, `try:` header and the
        # guard before the "Did not fetch" return are elided in this excerpt.
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        # NOTE(review): guards, try/except headers and the urlencode dict
        # entries are elided from this excerpt; code kept as-is.
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The watch page embeds the player config JSON, which carries the
        # caption endpoint ("ttsurl").
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
        return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        params = compat_urllib_parse.urlencode({
        subtitles_url = caption_url + '&' + params
        sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
        return [(None, sub_lang, sub)]
        return [(err_msg, None, None)]
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        # NOTE(review): some branch bodies and the final return are elided
        # from this excerpt; code kept as-is.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick the language: user choice, then English, then the first listed.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
        sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track.

        Appears to return a list of (error, lang, data) tuples; the list
        initialisation and final return are not visible in this excerpt.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """Print each format with its container extension and resolution."""
        print('Available formats:')
        # (the loop header, presumably `for x in formats:`, is not visible
        # in this excerpt)
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set the interface language, log in, and confirm age.

        NOTE(review): a large number of original lines (try/except headers,
        guards, form-dict entries) are elided from this excerpt; code kept
        as-is.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language (best effort: only a warning on failure)
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        # Fetch the login page to scrape the GALX / dsh hidden form fields.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Login form fields (many entries elided in this excerpt).
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age (form dict opening elided in this excerpt).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the video id from a YouTube URL."""
        # NOTE(review): the `if mobj is None:` guard and the final return of
        # video_id are not visible in this excerpt.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract video metadata and download URLs for a YouTube video.

        NOTE(review): many original lines (if/try/else headers, assignments,
        the results-list construction) are elided from this excerpt; code
        kept as-is.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page.
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the backslash-escaped URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' variants of get_video_info until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (warning branch header elided)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # Normalise separators before parsing the date.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

        # description (fallback chain: page element, meta tag, empty)
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        video_description = unescapeHTML(fd_mobj.group(1))
        video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            (sub_error, sub_lang, sub) = video_subtitles[0]
            # We try with the automatic captions
            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
            (sub_error_auto, sub_lang, sub) = video_subtitles[0]
            # We report the original error
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> signed URL map from the stream map.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

            # Result dict entries (the dict opening and append are elided).
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form.

        NOTE(review): the try headers and the filter form dict opening are
        elided from this excerpt; code kept as-is.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (form dict entries partly elided).
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a Metacafe video.

        NOTE(review): guard headers (`if mobj is None:` etc.) and the result
        list construction are elided from this excerpt.  The .decode('utf-8')
        calls below are Python 2 idioms (str has no .decode on Python 3) --
        presumably this file targets the 2/3 compat layer; verify before
        changing.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        raise ExtractorError(u'Unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            raise ExtractorError(u'Unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
        raise ExtractorError(u'Unable to extract media URL')
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict entries (dict opening and return elided).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best-quality video URL, title and uploader.

        NOTE(review): guard headers and several statements (max_quality
        selection, result dict opening/return) are elided from this excerpt;
        code kept as-is.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present in the flashvars blob.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # DD-MM-YYYY -> YYYYMMDD
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Result dict entries (dict opening and return elided).
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract a Photobucket video: JSON blob first, webpage fallback.

        NOTE(review): guard headers and the result dict openings/returns are
        elided from this excerpt.  The .decode('utf-8') calls below are
        Python 2 idioms (str has no .decode on Python 3) -- verify against
        the compat layer before changing.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        info = json.loads(mobj.group('json'))
        # Result dict entries for the JSON path (dict opening/return elided).
            'url': info[u'downloadUrl'],
            'uploader': info[u'username'],
            'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
            'title': info[u'title'],
            'ext': video_extension,
            'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        # Result dict entries for the fallback path (opening/return elided).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
1021 class YahooIE(InfoExtractor):
1022 """Information extractor for screen.yahoo.com."""
1023 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1025 def _real_extract(self, url):
1026 mobj = re.match(self._VALID_URL, url)
# NOTE(review): numbering gap here (1026->1028) — the `if mobj is None:`
# guard for the raise below is not visible in this excerpt.
1028 raise ExtractorError(u'Invalid URL: %s' % url)
1029 video_id = mobj.group('id')
1030 webpage = self._download_webpage(url, video_id)
# Two extraction strategies: if no YUI CONTENT_ID is found we query the
# bcst.yahoo.com REST/mrss endpoints with the URL id; otherwise (else
# branch below) we use the YQL JSON API with the long id.
1031 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1034 # TODO: Check which url parameters are required
1035 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1036 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose-mode regex pulling title/description/date/thumbnail out of the
# mrss XML response.
1037 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1038 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1039 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1040 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1042 self.report_extraction(video_id)
1043 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1045 raise ExtractorError(u'Unable to extract video info')
1046 video_title = m_info.group('title')
1047 video_description = m_info.group('description')
1048 video_thumb = m_info.group('thumb')
1049 video_date = m_info.group('date')
# Normalize MM/DD/YYYY into the YYYYMMDD form expected for upload_date.
1050 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1052 # TODO: Find a way to get mp4 videos
1053 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1054 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1055 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1056 video_url = m_rest.group('url')
1057 video_path = m_rest.group('path')
1059 raise ExtractorError(u'Unable to extract video url')
1061 else: # We have to use a different method if another id is defined
1062 long_id = m_id.group('new_id')
# YQL public API: SELECT the media streams for this id; the response is a
# JSONP callback whose payload is parsed below.
1063 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1064 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1065 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1066 info = json.loads(json_str)
1067 res = info[u'query'][u'results'][u'mediaObj'][0]
# First stream is used; `host` + `path` correspond to an rtmp-style
# url/play_path pair consumed by the downloader.
1068 stream = res[u'streams'][0]
1069 video_path = stream[u'path']
1070 video_url = stream[u'host']
1072 video_title = meta[u'title']
1073 video_description = meta[u'description']
1074 video_thumb = meta[u'thumbnail']
1075 video_date = None # I can't find it
1080 'play_path': video_path,
1081 'title':video_title,
1082 'description': video_description,
1083 'thumbnail': video_thumb,
1084 'upload_date': video_date,
1089 class VimeoIE(InfoExtractor):
1090 """Information extractor for vimeo.com."""
1092 # _VALID_URL matches Vimeo URLs
1093 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1096 def _real_extract(self, url, new_video=True):
1097 # Extract ID from URL
1098 mobj = re.match(self._VALID_URL, url)
# NOTE(review): numbering gaps in this excerpt (1098->1100, 1120 vs 1122)
# indicate missing guard/try lines around the raises below.
1100 raise ExtractorError(u'Invalid URL: %s' % url)
1102 video_id = mobj.group('id')
# Canonicalize the URL: force https, and for direct/pro links rewrite to
# the plain vimeo.com/<id> page where the config JSON lives.
1103 if not mobj.group('proto'):
1104 url = 'https://' + url
1105 if mobj.group('direct_link') or mobj.group('pro'):
1106 url = 'https://vimeo.com/' + video_id
1108 # Retrieve video webpage to extract further information
1109 request = compat_urllib_request.Request(url, None, std_headers)
1110 webpage = self._download_webpage(request, video_id)
1112 # Now we begin extracting as much information as we can from what we
1113 # retrieved. First we extract the information common to all extractors,
1114 # and later we extract those that are Vimeo specific.
1115 self.report_extraction(video_id)
1117 # Extract the config JSON
# Fragile string-slicing of the page's inline JS config object; breaks if
# Vimeo changes the surrounding markup.
1119 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1120 config = json.loads(config)
1122 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1123 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1125 raise ExtractorError(u'Unable to extract info section')
1128 video_title = config["video"]["title"]
1130 # Extract uploader and uploader_id
1131 video_uploader = config["video"]["owner"]["name"]
1132 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1134 # Extract video thumbnail
1135 video_thumbnail = config["video"]["thumbnail"]
1137 # Extract video description
1138 video_description = get_element_by_attribute("itemprop", "description", webpage)
1139 if video_description: video_description = clean_html(video_description)
1140 else: video_description = u''
1142 # Extract upload date
1143 video_upload_date = None
1144 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1145 if mobj is not None:
1146 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1148 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters of the play_redirect URL
# built at the end of this method.
1149 sig = config['request']['signature']
1150 timestamp = config['request']['timestamp']
1152 # Vimeo specific: extract video codec and quality information
1153 # First consider quality, then codecs, then take everything
1154 # TODO bind to format param
1155 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1156 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec into hd/sd/other, preserving the codec
# preference order defined above.
1157 for codec_name, codec_extension in codecs:
1158 if codec_name in config["video"]["files"]:
1159 if 'hd' in config["video"]["files"][codec_name]:
1160 files['hd'].append((codec_name, codec_extension, 'hd'))
1161 elif 'sd' in config["video"]["files"][codec_name]:
1162 files['sd'].append((codec_name, codec_extension, 'sd'))
1164 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first non-empty quality bucket (hd preferred, then sd).
1166 for quality in ('hd', 'sd', 'other'):
1167 if len(files[quality]) > 0:
1168 video_quality = files[quality][0][2]
1169 video_codec = files[quality][0][0]
1170 video_extension = files[quality][0][1]
1171 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1174 raise ExtractorError(u'No known codec found')
1176 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1177 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1182 'uploader': video_uploader,
1183 'uploader_id': video_uploader_id,
1184 'upload_date': video_upload_date,
1185 'title': video_title,
1186 'ext': video_extension,
1187 'thumbnail': video_thumbnail,
1188 'description': video_description,
1192 class ArteTvIE(InfoExtractor):
1193 """arte.tv information extractor."""
1195 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1196 _LIVE_URL = r'index-[0-9]+\.html$'
1198 IE_NAME = u'arte.tv'
1200 def fetch_webpage(self, url):
# Download a page's raw bytes, wrapping network errors in ExtractorError.
# NOTE(review): the `try:` header for the except clauses below is not
# visible in this excerpt (numbering gap 1201->1203).
1201 request = compat_urllib_request.Request(url)
1203 self.report_download_webpage(url)
1204 webpage = compat_urllib_request.urlopen(request).read()
1205 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1206 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1207 except ValueError as err:
1208 raise ExtractorError(u'Invalid URL: %s' % url)
1211 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex`, and return a dict built from matchTuples,
# each (group_index, key, error_message); raises if a group is missing.
1212 page = self.fetch_webpage(url)
1213 mobj = re.search(regex, page, regexFlags)
1217 raise ExtractorError(u'Invalid URL: %s' % url)
1219 for (i, key, err) in matchTuples:
1220 if mobj.group(i) is None:
1221 raise ExtractorError(err)
1223 info[key] = mobj.group(i)
1227 def extractLiveStream(self, url):
# Live-stream path: locate the videothek JS, then grep the geo-specific
# rtmp path/player/url triple out of it. Language comes from the URL.
1228 video_lang = url.split('/')[-4]
1229 info = self.grep_webpage(
1231 r'src="(.*?/videothek_js.*?\.js)',
1234 (1, 'url', u'Invalid URL: %s' % url)
1237 http_host = url.split('/')[2]
1238 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1239 info = self.grep_webpage(
1241 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1242 '(http://.*?\.swf).*?' +
1246 (1, 'path', u'could not extract video path: %s' % url),
1247 (2, 'player', u'could not extract video player: %s' % url),
1248 (3, 'url', u'could not extract video url: %s' % url)
# NOTE(review): video_url is built but this excerpt shows no return — the
# surrounding lines are missing from the view.
1251 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1253 def extractPlus7Stream(self, url):
# Plus7 (catch-up) path: follow videorefFileUrl -> per-language <video>
# ref -> final XML carrying id/name/date and the hd quality URL.
1254 video_lang = url.split('/')[-3]
1255 info = self.grep_webpage(
1257 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1260 (1, 'url', u'Invalid URL: %s' % url)
1263 next_url = compat_urllib_parse.unquote(info.get('url'))
1264 info = self.grep_webpage(
1266 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1269 (1, 'url', u'Could not find <video> tag: %s' % url)
1272 next_url = compat_urllib_parse.unquote(info.get('url'))
1274 info = self.grep_webpage(
1276 r'<video id="(.*?)".*?>.*?' +
1277 '<name>(.*?)</name>.*?' +
1278 '<dateVideo>(.*?)</dateVideo>.*?' +
1279 '<url quality="hd">(.*?)</url>',
1282 (1, 'id', u'could not extract video id: %s' % url),
1283 (2, 'title', u'could not extract video title: %s' % url),
1284 (3, 'date', u'could not extract video date: %s' % url),
1285 (4, 'url', u'could not extract video url: %s' % url)
1290 'id': info.get('id'),
1291 'url': compat_urllib_parse.unquote(info.get('url')),
1292 'uploader': u'arte.tv',
1293 'upload_date': unified_strdate(info.get('date')),
1294 'title': info.get('title').decode('utf-8'),
1300 def _real_extract(self, url):
1301 video_id = url.split('/')[-1]
1302 self.report_extraction(video_id)
# Dispatch: live pages (index-N.html) vs regular Plus7 video pages.
1304 if re.search(self._LIVE_URL, video_id) is not None:
1305 self.extractLiveStream(url)
1308 info = self.extractPlus7Stream(url)
1313 class GenericIE(InfoExtractor):
1314 """Generic last-resort information extractor."""
1317 IE_NAME = u'generic'
1319 def report_download_webpage(self, video_id):
1320 """Report webpage download."""
# Warn the user that no specialized extractor matched (skipped in tests).
1321 if not self._downloader.params.get('test', False):
1322 self._downloader.report_warning(u'Falling back on generic information extractor.')
1323 super(GenericIE, self).report_download_webpage(video_id)
1325 def report_following_redirect(self, new_url):
1326 """Report information extraction."""
1327 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1329 def _test_redirect(self, url):
1330 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Issue a HEAD request (falling back to GET on 405) and follow redirects
# manually so shortener targets can be re-dispatched to the right IE.
1331 class HeadRequest(compat_urllib_request.Request):
1332 def get_method(self):
# NOTE(review): the method body (presumably `return "HEAD"`) is not
# visible in this excerpt — numbering gap 1332->1335.
1335 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1337 Subclass the HTTPRedirectHandler to make it use our
1338 HeadRequest also on the redirected URL
1340 def redirect_request(self, req, fp, code, msg, headers, newurl):
1341 if code in (301, 302, 303, 307):
1342 newurl = newurl.replace(' ', '%20')
# Strip body-related headers before re-issuing the redirected request.
1343 newheaders = dict((k,v) for k,v in req.headers.items()
1344 if k.lower() not in ("content-length", "content-type"))
1345 return HeadRequest(newurl,
1347 origin_req_host=req.get_origin_req_host(),
1350 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1352 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1354 Fallback to GET if HEAD is not allowed (405 HTTP error)
1356 def http_error_405(self, req, fp, code, msg, headers):
1360 newheaders = dict((k,v) for k,v in req.headers.items()
1361 if k.lower() not in ("content-length", "content-type"))
1362 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1364 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with the custom handlers; handler order matters
# for urllib's dispatch.
1368 opener = compat_urllib_request.OpenerDirector()
1369 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1370 HTTPMethodFallback, HEADRedirectHandler,
1371 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1372 opener.add_handler(handler())
1374 response = opener.open(HeadRequest(url))
1375 if response is None:
1376 raise ExtractorError(u'Invalid URL protocol')
1377 new_url = response.geturl()
1382 self.report_following_redirect(new_url)
1385 def _real_extract(self, url):
1386 new_url = self._test_redirect(url)
1387 if new_url: return [self.url_result(new_url)]
1389 video_id = url.split('/')[-1]
1391 webpage = self._download_webpage(url, video_id)
1392 except ValueError as err:
1393 # since this is the last-resort InfoExtractor, if
1394 # this error is thrown, it'll be thrown here
1395 raise ExtractorError(u'Invalid URL: %s' % url)
1397 self.report_extraction(video_id)
1398 # Start with something easy: JW Player in SWFObject
# Progressively broader heuristics; each later search presumably runs only
# when the previous one found nothing (guard lines not visible here).
1399 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1401 # Broaden the search a little bit
1402 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1404 # Broaden the search a little bit: JWPlayer JS loader
1405 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1407 # Try to find twitter cards info
1408 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1410 raise ExtractorError(u'Invalid URL: %s' % url)
1412 # It's possible that one of the regexes
1413 # matched, but returned an empty group:
1414 if mobj.group(1) is None:
1415 raise ExtractorError(u'Invalid URL: %s' % url)
1417 video_url = compat_urllib_parse.unquote(mobj.group(1))
1418 video_id = os.path.basename(video_url)
1420 # here's a fun little line of code for you:
1421 video_extension = os.path.splitext(video_id)[1][1:]
1422 video_id = os.path.splitext(video_id)[0]
1424 # it's tempting to parse this further, but you would
1425 # have to take into account all the variations like
1426 #   Video Title - Site Name
1427 #   Site Name | Video Title
1428 #   Video Title - Tagline | Site Name
1429 # and so on and so forth; it's just not practical
1430 video_title = self._html_search_regex(r'<title>(.*)</title>',
1431 webpage, u'video title')
1433 # video uploader is domain name
1434 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1435 url, u'video uploader')
1440 'uploader': video_uploader,
1441 'upload_date': None,
1442 'title': video_title,
1443 'ext': video_extension,
1447 class YoutubeSearchIE(SearchInfoExtractor):
1448 """Information Extractor for YouTube search queries."""
# Uses the (long-deprecated) GData v2 JSON-C search API, 50 results/page.
1449 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1451 IE_NAME = u'youtube:search'
1452 _SEARCH_KEY = 'ytsearch'
1454 def report_download_page(self, query, pagenum):
1455 """Report attempt to download search page with given number."""
1456 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1458 def _get_n_results(self, query, n):
1459 """Get a specified number of results for a query"""
# NOTE(review): initializers for video_ids / pagenum / limit are not
# visible in this excerpt (numbering gap 1459->1465).
1465 while (50 * pagenum) < limit:
1466 self.report_download_page(query, pagenum+1)
# GData start-index is 1-based, hence the +1.
1467 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1468 request = compat_urllib_request.Request(result_url)
1470 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1471 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1472 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1473 api_response = json.loads(data)['data']
1475 if not 'items' in api_response:
1476 raise ExtractorError(u'[youtube] No video results')
1478 new_ids = list(video['id'] for video in api_response['items'])
1479 video_ids += new_ids
# Cap the page loop by whichever is smaller: requested n or total hits.
1481 limit = min(n, api_response['totalItems'])
1484 if len(video_ids) > n:
1485 video_ids = video_ids[:n]
1486 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1487 return self.playlist_result(videos, query)
1490 class GoogleSearchIE(SearchInfoExtractor):
1491 """Information Extractor for Google Video search queries."""
# Presence of the "next" pagination link signals more result pages.
1492 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1494 IE_NAME = u'video.google:search'
1495 _SEARCH_KEY = 'gvsearch'
1497 def _get_n_results(self, query, n):
1498 """Get a specified number of results for a query"""
# NOTE(review): the `res = {` opener and some playlist fields are not
# visible in this excerpt (numbering gaps around 1501/1514).
1501 '_type': 'playlist',
1506 for pagenum in itertools.count(1):
1507 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1508 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1509 note='Downloading result page ' + str(pagenum))
# Each result is an <h3 class="r"> link; collect them as url entries.
1511 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1514 'url': mobj.group(1)
1516 res['entries'].append(e)
# Stop once n results are covered or no further pages exist.
1518 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1521 class YahooSearchIE(SearchInfoExtractor):
1522 """Information Extractor for Yahoo! Video search queries."""
1525 IE_NAME = u'screen.yahoo:search'
1526 _SEARCH_KEY = 'yvsearch'
1528 def _get_n_results(self, query, n):
1529 """Get a specified number of results for a query"""
# NOTE(review): the `res = {` opener and the assignment of `m` (used in
# the termination test below) are not visible in this excerpt.
1532 '_type': 'playlist',
1536 for pagenum in itertools.count(0):
# 30 results per page; `b` is the 1-based offset of the first result.
1537 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1538 webpage = self._download_webpage(result_url, query,
1539 note='Downloading results page '+str(pagenum+1))
# The endpoint returns JSON (o=js) rather than HTML.
1540 info = json.loads(webpage)
1542 results = info[u'results']
1544 for (i, r) in enumerate(results):
1545 if (pagenum * 30) +i >= n:
1547 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1548 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1549 res['entries'].append(e)
# Stop when n results are collected or the API reports the last page.
1550 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1556 class YoutubePlaylistIE(InfoExtractor):
1557 """Information Extractor for YouTube playlists."""
# Verbose regex (compiled with re.VERBOSE in suitable/_real_extract);
# matches playlist, course, artist and UU/EC/PL-prefixed list ids.
1559 _VALID_URL = r"""(?:
1564 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1565 \? (?:.*?&)*? (?:p|a|list)=
1568 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1571 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1573 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1575 IE_NAME = u'youtube:playlist'
1578 def suitable(cls, url):
1579 """Receives a URL and returns True if suitable for this IE."""
# Overridden (vs the base class) to pass re.VERBOSE for the regex above.
1580 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1582 def _real_extract(self, url):
1583 # Extract playlist id
1584 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1586 raise ExtractorError(u'Invalid URL: %s' % url)
1588 # Download playlist videos from API
1589 playlist_id = mobj.group(1) or mobj.group(2)
# Page through the GData playlist feed; start-index is 1-based.
1594 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1595 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1598 response = json.loads(page)
1599 except ValueError as err:
1600 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1602 if 'feed' not in response:
1603 raise ExtractorError(u'Got a malformed response from YouTube API')
1604 playlist_title = response['feed']['title']['$t']
1605 if 'entry' not in response['feed']:
1606 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, url) pairs; entries without 'content' (e.g.
# deleted/private videos) are skipped.
1609 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1610 for entry in response['feed']['entry']
1611 if 'content' in entry ]
1613 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Restore playlist order by position, then drop the position field.
1617 videos = [v[1] for v in sorted(videos)]
1619 url_results = [self.url_result(url, 'Youtube') for url in videos]
1620 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1623 class YoutubeChannelIE(InfoExtractor):
1624 """Information Extractor for YouTube channels."""
1626 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1627 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Marker string whose presence means a "load more" button / further pages.
1628 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1629 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1630 IE_NAME = u'youtube:channel'
1632 def extract_videos_from_page(self, page):
# Scrape unique /watch?v= ids from a channel page's HTML, in page order.
# NOTE(review): the `ids_in_page = []` initializer and the return line are
# not visible in this excerpt.
1634 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1635 if mobj.group(1) not in ids_in_page:
1636 ids_in_page.append(mobj.group(1))
1639 def _real_extract(self, url):
1640 # Extract channel id
1641 mobj = re.match(self._VALID_URL, url)
1643 raise ExtractorError(u'Invalid URL: %s' % url)
1645 # Download channel page
1646 channel_id = mobj.group(1)
1650 url = self._TEMPLATE_URL % (channel_id, pagenum)
1651 page = self._download_webpage(url, channel_id,
1652 u'Downloading page #%s' % pagenum)
1654 # Extract video identifiers
1655 ids_in_page = self.extract_videos_from_page(page)
1656 video_ids.extend(ids_in_page)
1658 # Download any subsequent channel pages using the json-based channel_ajax query
1659 if self._MORE_PAGES_INDICATOR in page:
1661 pagenum = pagenum + 1
1663 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1664 page = self._download_webpage(url, channel_id,
1665 u'Downloading page #%s' % pagenum)
# The ajax endpoint returns JSON whose 'content_html' holds the markup
# and 'load_more_widget_html' signals whether more pages remain.
1667 page = json.loads(page)
1669 ids_in_page = self.extract_videos_from_page(page['content_html'])
1670 video_ids.extend(ids_in_page)
1672 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1675 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1677 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1678 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1679 return [self.playlist_result(url_entries, channel_id)]
1682 class YoutubeUserIE(InfoExtractor):
1683 """Information Extractor for YouTube users."""
1685 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1686 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1687 _GDATA_PAGE_SIZE = 50
1688 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1689 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1690 IE_NAME = u'youtube:user'
1692 def _real_extract(self, url):
1694 mobj = re.match(self._VALID_URL, url)
1696 raise ExtractorError(u'Invalid URL: %s' % url)
1698 username = mobj.group(1)
1700 # Download video ids using YouTube Data API. Result size per
1701 # query is limited (currently to 50 videos) so we need to query
1702 # page by page until there are no video ids - it means we got
# NOTE(review): loop initializers (video_ids, pagenum) are not visible in
# this excerpt (numbering gap 1702->1709).
1709 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1711 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1712 page = self._download_webpage(gdata_url, username,
1713 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1715 # Extract video identifiers
# De-duplicate ids within a page while preserving order.
1718 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1719 if mobj.group(1) not in ids_in_page:
1720 ids_in_page.append(mobj.group(1))
1722 video_ids.extend(ids_in_page)
1724 # A little optimization - if current page is not
1725 # "full", ie. does not contain PAGE_SIZE video ids then
1726 # we can assume that this page is the last one - there
1727 # are no more ids on further pages - no need to query
1730 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1735 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1736 url_results = [self.url_result(url, 'Youtube') for url in urls]
1737 return [self.playlist_result(url_results, playlist_title = username)]
1740 class BlipTVUserIE(InfoExtractor):
1741 """Information Extractor for blip.tv users."""
1743 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1745 IE_NAME = u'blip.tv:user'
1747 def _real_extract(self, url):
1749 mobj = re.match(self._VALID_URL, url)
1751 raise ExtractorError(u'Invalid URL: %s' % url)
1753 username = mobj.group(1)
# Mobile episode-list endpoint; users_id is filled in after scraping the
# numeric id from the user's page below.
1755 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1757 page = self._download_webpage(url, username, u'Downloading user page')
# NOTE(review): no None-check on this re.search is visible here — an
# unknown user would raise AttributeError on the next line.
1758 mobj = re.search(r'data-users-id="([^"]+)"', page)
1759 page_base = page_base % mobj.group(1)
1762 # Download video ids using BlipTV Ajax calls. Result size per
1763 # query is limited (currently to 12 videos) so we need to query
1764 # page by page until there are no video ids - it means we got
# NOTE(review): loop initializers (video_ids, pagenum) are not visible in
# this excerpt (numbering gap 1764->1771).
1771 url = page_base + "&page=" + str(pagenum)
1772 page = self._download_webpage(url, username,
1773 u'Downloading video ids from page %d' % pagenum)
1775 # Extract video identifiers
# De-duplicate hrefs within a page while preserving order.
1778 for mobj in re.finditer(r'href="/([^"]+)"', page):
1779 if mobj.group(1) not in ids_in_page:
1780 ids_in_page.append(unescapeHTML(mobj.group(1)))
1782 video_ids.extend(ids_in_page)
1784 # A little optimization - if current page is not
1785 # "full", ie. does not contain PAGE_SIZE video ids then
1786 # we can assume that this page is the last one - there
1787 # are no more ids on further pages - no need to query
1790 if len(ids_in_page) < self._PAGE_SIZE:
1795 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1796 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1797 return [self.playlist_result(url_entries, playlist_title = username)]
1800 class DepositFilesIE(InfoExtractor):
1801 """Information extractor for depositfiles.com"""
1803 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1805 def _real_extract(self, url):
1806 file_id = url.split('/')[-1]
1807 # Rebuild url in english locale
1808 url = 'http://depositfiles.com/en/files/' + file_id
1810 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
1811 free_download_indication = { 'gateway_result' : '1' }
1812 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# NOTE(review): the `try:` header for the except below is not visible in
# this excerpt (numbering gap 1812->1814).
1814 self.report_download_webpage(file_id)
1815 webpage = compat_urllib_request.urlopen(request).read()
1816 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1817 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1819 # Search for the real file URL
1820 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1821 if (mobj is None) or (mobj.group(1) is None):
1822 # Try to figure out reason of the error.
# Surface the site's own restriction message (e.g. download limits) when
# the download form is absent.
1823 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1824 if (mobj is not None) and (mobj.group(1) is not None):
1825 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1826 raise ExtractorError(u'%s' % restriction_message)
1828 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1830 file_url = mobj.group(1)
1831 file_extension = os.path.splitext(file_url)[1][1:]
1833 # Search for file title
1834 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') below is Python-2-only; str has no
# .decode on Python 3.
1837 'id': file_id.decode('utf-8'),
1838 'url': file_url.decode('utf-8'),
1840 'upload_date': None,
1841 'title': file_title,
1842 'ext': file_extension.decode('utf-8'),
1846 class FacebookIE(InfoExtractor):
1847 """Information Extractor for Facebook"""
1849 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1850 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1851 _NETRC_MACHINE = 'facebook'
1852 IE_NAME = u'facebook'
1854 def report_login(self):
1855 """Report attempt to log in."""
1856 self.to_screen(u'Logging in')
1858 def _real_initialize(self):
# Optional login using --username/--password or a .netrc entry; failures
# only warn (public videos still work without authentication).
1859 if self._downloader is None:
1864 downloader_params = self._downloader.params
1866 # Attempt to use provided username and password or .netrc data
1867 if downloader_params.get('username', None) is not None:
1868 useremail = downloader_params['username']
1869 password = downloader_params['password']
1870 elif downloader_params.get('usenetrc', False):
1872 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1873 if info is not None:
1877 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1878 except (IOError, netrc.NetrcParseError) as err:
1879 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1882 if useremail is None:
1891 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1894 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means login failed.
1895 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1896 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1898 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1899 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1902 def _real_extract(self, url):
1903 mobj = re.match(self._VALID_URL, url)
1905 raise ExtractorError(u'Invalid URL: %s' % url)
1906 video_id = mobj.group('ID')
1908 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1909 webpage = self._download_webpage(url, video_id)
1911 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1912 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
# Slice the JSON array of SWF variables out of the page's inline JS.
1913 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1915 raise ExtractorError(u'Cannot parse data')
1916 data = dict(json.loads(m.group(1)))
1917 params_raw = compat_urllib_parse.unquote(data['params'])
1918 params = json.loads(params_raw)
1919 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD when absent.
1920 video_url = video_data.get('hd_src')
1922 video_url = video_data['sd_src']
1924 raise ExtractorError(u'Cannot find video URL')
1925 video_duration = int(video_data['video_duration'])
1926 thumbnail = video_data['thumbnail_src']
1928 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1933 'title': video_title,
1936 'duration': video_duration,
1937 'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles regular page URLs, ``/play/`` embed URLs (resolved via their
    redirect fragment) and ``api.swf#<id>`` references.  Metadata comes
    from blip.tv's JSON API, which is only served to an iTunes user agent.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Extracts the filename extension from the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        # NOTE(review): this copy appears to have lines elided (missing
        # ``if mobj is None:`` / ``try:`` headers and the info-dict
        # openings); the dangling statements below belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # api.swf#<id> references are rewritten to /play/ URLs first.
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # real file reference; resolve it and recurse on the canonical
            # per-video URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves the JSON skin only to an iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                    'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']

                # API timestamps look like '08-15-12 11:15AM'; normalize to
                # the YYYYMMDD form expected for 'upload_date'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Supports two page layouts: a simple one where the flv URL is directly
    in a <source> tag, and an encrypted one where an RC4-encrypted XML
    blob must be fetched and decrypted (key derived from a double-base64
    blob plus the video id) to recover the RTMP/HTTP stream parameters.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # Standard RC4: key-scheduling pass over a 256-entry box, then a
        # keystream XOR over the payload.
        # NOTE(review): initializations (x/y/out) and the payload loop
        # header appear elided in this copy — confirm against upstream.
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        # MD5 hex digest as bytes; used to build the RC4 key.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        # NOTE(review): numerous guard/try/dict-literal lines appear to be
        # elided in this copy; dangling statements below belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

            # Double-base64-encoded seed for the RC4 decryption key.
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'

        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Simple case: flv URL directly present in a <source> tag.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

                'upload_date': None,
                'title': video_title,

        # Encrypted case: collect flashvars and fetch the encrypted XML.
        mobj = re.search('var flashvars={(.+?)}', webpage)
            raise ExtractorError(u'Unable to extract video')

        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            # The MTV player variant is not handled; force the D player.
            self._downloader.report_warning(u'avoiding MTV player')
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'

        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
            # RC4 key = md5(double-b64-decoded GK + video id).
            base64.b64decode(base64.b64decode(GK)) +
            str(video_id).encode('utf-8')
        dec_data = self.__rc4crypt(enc_data_b, sk)

        self.report_extraction(video_id)

        # rtmp streams
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        video_url = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_url:
            # rtmpe handshake fails for these; rtmpt works.
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_url = video_url.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            raise ExtractorError(u'unable to extract url')
        video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # rtmp play path is '<ext>:<path>' (prefix/path swapped).
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
            # f4m manifests map 1:1 onto m3u8 HLS playlists.
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",

            'tc_url': video_url,
            'upload_date': None,
            'title': video_title,
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Accepts shortname URLs (:tds, :colbert, ...), full-episode URLs and
    clip URLs.  Resolves the mtvnservices media URI from the page, walks
    the MRSS index feed (one <item> per act/part) and, per part, picks a
    rendition from the mediaGen config, transforming the RTMP URL into a
    direct HTTP one.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing triple quote of this verbose regex is not
    # visible in this copy — confirm against upstream.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
        |(https?://)?(www\.)?
        (?P<showname>thedailyshow|colbertnation)\.com/
        (full-episodes/(?P<episode>.*)|
        (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
        |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, lowest to highest; the last entry is the default.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Maps format id -> file extension; contents elided in this copy.
    _video_extensions = {
    # Maps format id -> display dimensions; contents elided in this copy.
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        # NOTE(review): several guard/else lines appear elided in this
        # copy; dangling statements below belong to them.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand :tds / :colbert shortcuts to the full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
            # The "newest episode" page redirects; re-match on the final
            # URL to learn the concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # MRSS index: one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # mediaGen config lists the available renditions per part.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                    format, rtmp_video_url = f, v

            # RTMP URLs can be rewritten to direct HTTP downloads.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
                'upload_date': officialDate,
                'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads show/episode metadata from the page's <meta> tags, then follows
    the 'config=' parameter of the og:video player URL to a JavaScript
    config object that carries the playlist with the real media URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard appears elided in
        # this copy; the bare raise below belongs to it.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # Page titles look like 'Show : Episode'; keep the episode part.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'player url').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual episode.
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for the video id, then the Adobe
    HDS (f4m) manifest it points at, and assembles the final fragment URL
    from the manifest's media node.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        # NOTE(review): guard/try lines and the info-dict opening appear
        # elided in this copy; dangling statements belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

            'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore parameter is required for the HDS manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
            # f4m elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # Build the direct fragment URL from the manifest host + media id.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Everything needed (flv URL, title, thumbnail) is scraped directly
    from the watch page source.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard and the info-dict
        # opening appear elided in this copy.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flv URL is URL-encoded inside the page's flashvars.
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Page titles end in ' - XVIDEOS...'; capture only the title part.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard appears elided in
        # this copy; the bare raise below belongs to it.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps the page URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Stream definitions list the downloadable transcodings per track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard appears elided in
        # this copy; the bare raise below belongs to it.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps the set URL to its API record,
        # including the full track list.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        # Resolve the stream URL for every track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The real media id is base64-encoded in the page's 'jsclassref'
    variable; decoding it yields the path appended to the rtmpe base URL.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard appears elided in
        # this copy; the bare raise below belongs to it.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Base64-encoded, URL-quoted media reference.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the media filename itself.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Queries the site's JSON cloudcast API, then probes the listed
    'audio_formats' URLs until one responds, honouring the user's
    requested format when given.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): the ``try:`` header appears elided in this copy;
        # the except TypeError below belongs to it.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
                # Probe the URL; only a successful open counts.
                compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        # NOTE(review): guard/try/loop-control lines appear elided in this
        # copy; dangling statements below belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Try every advertised format until one has a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Three URL shapes are handled: a specific video (course+video), a
    course page (returns references to each VideoPage), and the site root
    (returns references to each CoursePage).  References are re-dispatched
    through self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # NOTE(review): guard/try lines and several dict/list openings
        # appear elided in this copy; dangling statements belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,
                'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                    note='Downloading course info page',
                                    errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect one reference entry per video page on the course.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect one reference entry per course page on the site root.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes metadata from the page's mtv_* meta tags, then fetches the
    mediaGen XML to pick a rendition (currently always the last/highest
    quality listed).
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        # NOTE(review): guard/try lines and the info-dict opening appear
        # elided in this copy; dangling statements belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            # Scheme is optional in _VALID_URL; normalize to http.
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

            # rendition 'type' looks like 'video/mp4'; keep the subtype.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
            raise ExtractorError('Invalid rendition field.')

            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku splits each video into segments.  The playlist JSON supplies an
    obfuscated fileid plus per-segment keys; _get_file_id() de-obfuscates
    the fileid with a seeded shuffle, and a per-segment download URL is
    built by substituting the segment number into fileid positions 8-9.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

        # NOTE(review): the ``def _gen_sid(self):`` header appears elided
        # in this copy.  Session id = millisecond timestamp + two random
        # numbers, concatenated as digits.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic shuffle of the alphabet driven by a 16-bit LCG
        # (seed*211+30031 mod 65536); yields the substitution table used
        # by _get_file_id.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the mix string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        # NOTE(review): guard lines and format-selection branches appear
        # elided in this copy; dangling statements belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)

        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
3045 class XNXXIE(InfoExtractor):
3046 """Information extractor for xnxx.com"""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3048 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping regexes: flv URL, page title, and big-thumbnail URL.
3050 VIDEO_URL_RE = r'flv_url=(.*?)&'
3051 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3052 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3054 def _real_extract(self, url):
3055 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3057 raise ExtractorError(u'Invalid URL: %s' % url)
3058 video_id = mobj.group(1)
3060 # Get webpage content
3061 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded in the page, hence the unquote below.
3063 video_url = self._search_regex(self.VIDEO_URL_RE,
3064 webpage, u'video URL')
3065 video_url = compat_urllib_parse.unquote(video_url)
3067 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3070 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3071 webpage, u'thumbnail', fatal=False)
# (elided: start of the returned info dict)
3077 'upload_date': None,
3078 'title': video_title,
3080 'thumbnail': video_thumbnail,
3081 'description': None,
3085 class GooglePlusIE(InfoExtractor):
3086 """Information extractor for plus.google.com."""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3088 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3089 IE_NAME = u'plus.google'
3091 def _real_extract(self, url):
3092 # Extract id from URL
3093 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3095 raise ExtractorError(u'Invalid URL: %s' % url)
3097 post_url = mobj.group(0)
3098 video_id = mobj.group(1)
3100 video_extension = 'flv'
3102 # Step 1, Retrieve post webpage to extract further information
3103 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3105 self.report_extraction(video_id)
3107 # Extract update date
3108 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3109 webpage, u'upload date', fatal=False)
3111 # Convert timestring to a format suitable for filename
# Reformats the scraped date into YYYYMMDD (the upload_date convention).
3112 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3113 upload_date = upload_date.strftime('%Y%m%d')
3116 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3117 webpage, u'uploader', fatal=False)
3120 # Get the first line for title
3121 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3122 webpage, 'title', default=u'NA')
3124 # Step 2, Stimulate clicking the image box to launch video
3125 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3126 webpage, u'video page URL')
3127 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3129 # Extract video links on video page
3130 """Extract video links of all sizes"""
3131 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3132 mobj = re.findall(pattern, webpage)
# (elided: the emptiness check guarding the raise below)
3134 raise ExtractorError(u'Unable to extract video links')
3136 # Sort in resolution
3137 links = sorted(mobj)
3139 # Choose the lowest of the sort, i.e. highest resolution
3140 video_url = links[-1]
3141 # Only get the url. The resolution part in the tuple has no use anymore
3142 video_url = video_url[-1]
3143 # Treat escaped \u0026 style hex
# Python 2 strings have .decode; on Python 3 an AttributeError is caught
# and the escape decoding is done via bytes instead.
3145 video_url = video_url.decode("unicode_escape")
3146 except AttributeError: # Python 3
3147 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3153 'uploader': uploader,
3154 'upload_date': upload_date,
3155 'title': video_title,
3156 'ext': video_extension,
3159 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages; the media URL is built
# directly from the path component of the page URL.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3160 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3163 def _real_extract(self, url):
3164 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3166 raise ExtractorError(u'Invalid URL: %s' % url)
3168 video_id = mobj.group(1)
3170 webpage = self._download_webpage(url, video_id)
3172 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3174 shortened_video_id = video_id.rpartition('/')[2]
3175 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3176 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3178 # It isn't there in the HTML it returns to us
3179 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3181 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3184 'id': shortened_video_id,
3188 # 'uploader_date': uploader_date,
3189 'description': description,
3193 class JustinTVIE(InfoExtractor):
3194 """Information extractor for justin.tv and twitch.tv"""
3195 # TODO: One broadcast may be split into multiple videos. The key
3196 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3197 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3199 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3201 (?P<channelid>[^/]+)|
3202 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3203 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3207 _JUSTIN_PAGE_LIMIT = 100
3208 IE_NAME = u'justin.tv'
3210 def report_download_page(self, channel, offset):
3211 """Report attempt to download a single page of videos."""
3212 self.to_screen(u'%s: Downloading video information from %d to %d' %
3213 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3215 # Return count of items, list of *valid* items
3216 def _parse_page(self, url, video_id):
# Downloads one page of the Justin.tv JSON API and converts each clip
# entry into an info dict; returns (raw item count, parsed items).
3217 webpage = self._download_webpage(url, video_id,
3218 u'Downloading video info JSON',
3219 u'unable to download video info JSON')
3221 response = json.loads(webpage)
# A non-list response is an API error object with an 'error' field.
3222 if type(response) != list:
3223 error_text = response.get('error', 'unknown error')
3224 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3226 for clip in response:
3227 video_url = clip['video_file_url']
3229 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-style; strip the dashes from its date part to get
# the YYYYMMDD upload_date.
3230 video_date = re.sub('-', '', clip['start_time'][:10])
3231 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3232 video_id = clip['id']
3233 video_title = clip.get('title', video_id)
3237 'title': video_title,
3238 'uploader': clip.get('channel_name', video_uploader_id),
3239 'uploader_id': video_uploader_id,
3240 'upload_date': video_date,
3241 'ext': video_extension,
3243 return (len(response), info)
3245 def _real_extract(self, url):
3246 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3248 raise ExtractorError(u'invalid URL: %s' % url)
3250 api_base = 'http://api.justin.tv'
# Three URL shapes: whole channel archive, single broadcast, or chapter.
3252 if mobj.group('channelid'):
3254 video_id = mobj.group('channelid')
3255 api = api_base + '/channel/archives/%s.json' % video_id
3256 elif mobj.group('chapterid'):
3257 chapter_id = mobj.group('chapterid')
3259 webpage = self._download_webpage(url, chapter_id)
3260 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3262 raise ExtractorError(u'Cannot find archive of a chapter')
3263 archive_id = m.group(1)
3265 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3266 chapter_info_xml = self._download_webpage(api, chapter_id,
3267 note=u'Downloading chapter information',
3268 errnote=u'Chapter information download failed')
3269 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element whose id matches the page's archive_id.
3270 for a in doc.findall('.//archive'):
3271 if archive_id == a.find('./id').text:
3274 raise ExtractorError(u'Could not find chapter in chapter information')
3276 video_url = a.find('./video_file_url').text
3277 video_ext = video_url.rpartition('.')[2] or u'flv'
3279 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3280 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3281 note='Downloading chapter metadata',
3282 errnote='Download of chapter metadata failed')
3283 chapter_info = json.loads(chapter_info_json)
3285 bracket_start = int(doc.find('.//bracket_start').text)
3286 bracket_end = int(doc.find('.//bracket_end').text)
3288 # TODO determine start (and probably fix up file)
3289 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3290 #video_url += u'?start=' + TODO:start_timestamp
3291 # bracket_start is 13290, but we want 51670615
3292 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3293 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3296 'id': u'c' + chapter_id,
3299 'title': chapter_info['title'],
3300 'thumbnail': chapter_info['preview'],
3301 'description': chapter_info['description'],
3302 'uploader': chapter_info['channel']['display_name'],
3303 'uploader_id': chapter_info['channel']['name'],
3307 video_id = mobj.group('videoid')
3308 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3310 self.report_extraction(video_id)
# Paged fetch: keep requesting _JUSTIN_PAGE_LIMIT items until a short
# page (or non-paged mode) signals the end.
3314 limit = self._JUSTIN_PAGE_LIMIT
3317 self.report_download_page(video_id, offset)
3318 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3319 page_count, page_info = self._parse_page(page_url, video_id)
3320 info.extend(page_info)
3321 if not paged or page_count != limit:
3326 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3327 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3329 def _real_extract(self, url):
3330 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3332 raise ExtractorError(u'invalid URL: %s' % url)
3334 video_id = mobj.group('id')
3335 webpage = self._download_webpage(url, video_id)
3337 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3338 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are tried in order: player page heading, then <title>.
3340 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3341 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3343 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3344 webpage, u'description', fatal=False, flags=re.DOTALL)
3351 'description': video_description,
3355 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com video pages; returns a
# playlist of all movies found on a game's page.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3356 _VALID_URL = r"""http://store\.steampowered\.com/
3358 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3360 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3362 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3363 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3366 def suitable(cls, url):
3367 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode regex.
3368 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3370 def _real_extract(self, url):
3371 m = re.match(self._VALID_URL, url, re.VERBOSE)
3372 gameID = m.group('gameID')
3374 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3375 webpage = self._download_webpage(videourl, gameID)
# Age-gated pages are refetched through the agecheck URL (birth date 1970).
3377 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3378 videourl = self._AGECHECK_TEMPLATE % gameID
3379 self.report_age_confirmation()
3380 webpage = self._download_webpage(videourl, gameID)
3382 self.report_extraction(gameID)
3383 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3384 webpage, 'game title')
3386 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3387 mweb = re.finditer(urlRE, webpage)
3388 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3389 titles = re.finditer(namesRE, webpage)
3390 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3391 thumbs = re.finditer(thumbsRE, webpage)
# The three iterators are zipped positionally; entries are assumed to
# appear in the same order in the page -- TODO confirm.
3393 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3394 video_id = vid.group('videoID')
3395 title = vtitle.group('videoName')
3396 video_url = vid.group('videoURL')
3397 video_thumb = thumb.group('thumbnail')
3399 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3404 'title': unescapeHTML(title),
3405 'thumbnail': video_thumb
3408 return [self.playlist_result(videos, gameID, game_title)]
3410 class UstreamIE(InfoExtractor):
# Information extractor for recorded Ustream videos.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3411 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3412 IE_NAME = u'ustream'
3414 def _real_extract(self, url):
3415 m = re.match(self._VALID_URL, url)
3416 video_id = m.group('videoID')
# The media URL is derived directly from the video id on the CDN.
3418 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3419 webpage = self._download_webpage(url, video_id)
3421 self.report_extraction(video_id)
3423 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3426 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3427 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3429 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3430 webpage, u'thumbnail', fatal=False)
3436 'title': video_title,
3437 'uploader': uploader,
3438 'thumbnail': thumbnail,
3442 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3443 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3444 IE_NAME = u'WorldStarHipHop'
3446 def _real_extract(self, url):
3447 m = re.match(self._VALID_URL, url)
3448 video_id = m.group('id')
3450 webpage_src = self._download_webpage(url, video_id)
# The player URL is passed to the flash object via so.addVariable("file",...).
3452 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3453 webpage_src, u'video URL')
3455 if 'mp4' in video_url:
3460 video_title = self._html_search_regex(r"<title>(.*)</title>",
3461 webpage_src, u'title')
3463 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3464 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3465 webpage_src, u'thumbnail', fatal=False)
# Fallback title for "candy" pages, scraped from the candytitles span.
3468 _title = r"""candytitles.*>(.*)</span>"""
3469 mobj = re.search(_title, webpage_src)
3470 if mobj is not None:
3471 video_title = mobj.group(1)
3476 'title' : video_title,
3477 'thumbnail' : thumbnail,
3482 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows; metadata comes from an
# inline JSON blob assigned to window.gon.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3483 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3485 def _real_extract(self, url):
3486 m = re.match(self._VALID_URL, url)
3487 video_id = m.group('videoID')
3489 webpage = self._download_webpage(url, video_id)
3491 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3492 webpage, u'json data', flags=re.MULTILINE)
3495 data = json.loads(json_data)
3496 except ValueError as e:
3497 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' selects the 256 kbps stream on the Akamai URL.
3499 video_url = data['akamai_url'] + '&cbr=256'
3500 url_parts = compat_urllib_parse_urlparse(video_url)
3501 video_ext = url_parts.path.rpartition('.')[2]
3506 'title': data['title'],
3507 'description': data.get('teaser_text'),
3508 'location': data.get('country_of_origin'),
3509 'uploader': data.get('host', {}).get('name'),
3510 'uploader_id': data.get('host', {}).get('slug'),
3511 'thumbnail': data.get('image', {}).get('large_url_2x'),
3512 'duration': data.get('duration'),
3517 class YouPornIE(InfoExtractor):
3518 """Information extractor for youporn.com."""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3519 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3521 def _print_formats(self, formats):
3522 """Print all available formats"""
3523 print(u'Available formats:')
3524 print(u'ext\t\tformat')
3525 print(u'---------------------------------')
3526 for format in formats:
3527 print(u'%s\t\t%s' % (format['ext'], format['format']))
3529 def _specific(self, req_format, formats):
# Linear scan for the entry whose 'format' equals the requested one.
3531 if(x["format"]==req_format):
3535 def _real_extract(self, url):
3536 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3538 raise ExtractorError(u'Invalid URL: %s' % url)
3539 video_id = mobj.group('videoid')
# age_verified cookie bypasses the age gate for the page fetch.
3541 req = compat_urllib_request.Request(url)
3542 req.add_header('Cookie', 'age_verified=1')
3543 webpage = self._download_webpage(req, video_id)
3545 # Get JSON parameters
3546 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3548 params = json.loads(json_params)
3550 raise ExtractorError(u'Invalid JSON')
3552 self.report_extraction(video_id)
3554 video_title = params['title']
3555 upload_date = unified_strdate(params['release_date_f'])
3556 video_description = params['description']
3557 video_uploader = params['submitted_by']
3558 thumbnail = params['thumbnails'][0]['image']
3560 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3562 # Get all of the formats available
3563 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3564 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3565 webpage, u'download list').strip()
3567 # Get all of the links from the page
3568 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3569 links = re.findall(LINK_RE, download_list_html)
3570 if(len(links) == 0):
3571 raise ExtractorError(u'ERROR: no known formats available for video')
3573 self.to_screen(u'Links found: %d' % len(links))
3578 # A link looks like this:
3579 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3580 # A path looks like this:
3581 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# The format label (e.g. "480p-370k") is recovered from the URL path.
3582 video_url = unescapeHTML( link )
3583 path = compat_urllib_parse_urlparse( video_url ).path
3584 extension = os.path.splitext( path )[1][1:]
3585 format = path.split('/')[4].split('_')[:2]
3588 format = "-".join( format )
3589 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3594 'uploader': video_uploader,
3595 'upload_date': upload_date,
3596 'title': video_title,
3599 'thumbnail': thumbnail,
3600 'description': video_description
# Format selection mirrors the downloader's --format semantics:
# listformats, best/worst, all, or a specific named format.
3603 if self._downloader.params.get('listformats', None):
3604 self._print_formats(formats)
3607 req_format = self._downloader.params.get('format', None)
3608 self.to_screen(u'Format: %s' % req_format)
3610 if req_format is None or req_format == 'best':
3612 elif req_format == 'worst':
3613 return [formats[-1]]
3614 elif req_format in ('-1', 'all'):
3617 format = self._specific( req_format, formats )
3619 raise ExtractorError(u'Requested format not available')
3624 class PornotubeIE(InfoExtractor):
3625 """Information extractor for pornotube.com."""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3626 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3628 def _real_extract(self, url):
3629 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3631 raise ExtractorError(u'Invalid URL: %s' % url)
3633 video_id = mobj.group('videoid')
3634 video_title = mobj.group('title')
3636 # Get webpage content
3637 webpage = self._download_webpage(url, video_id)
3640 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3641 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3642 video_url = compat_urllib_parse.unquote(video_url)
3644 #Get the uploaded date
3645 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3646 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# unified_strdate normalizes the scraped date to YYYYMMDD when present.
3647 if upload_date: upload_date = unified_strdate(upload_date)
3649 info = {'id': video_id,
3652 'upload_date': upload_date,
3653 'title': video_title,
3659 class YouJizzIE(InfoExtractor):
3660 """Information extractor for youjizz.com."""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3661 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3663 def _real_extract(self, url):
3664 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3666 raise ExtractorError(u'Invalid URL: %s' % url)
3668 video_id = mobj.group('videoid')
3670 # Get webpage content
3671 webpage = self._download_webpage(url, video_id)
3673 # Get the video title
3674 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3675 webpage, u'title').strip()
3677 # Get the embed page
3678 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3680 raise ExtractorError(u'ERROR: unable to extract embed page')
# The actual media URL lives on the embed page, so refetch using it; note
# video_id is rebound to the embed page's id.
3682 embed_page_url = result.group(0).strip()
3683 video_id = result.group('videoid')
3685 webpage = self._download_webpage(embed_page_url, video_id)
3688 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3689 webpage, u'video URL')
3691 info = {'id': video_id,
3693 'title': video_title,
3696 'player_url': embed_page_url}
3700 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes; iterates the play API until
# the last track is reported.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below (including the binding of `mix_id`).
3702 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3704 def _real_extract(self, url):
3705 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3707 raise ExtractorError(u'Invalid URL: %s' % url)
3708 playlist_id = mobj.group('id')
3710 webpage = self._download_webpage(url, playlist_id)
3712 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3713 data = json.loads(json_like)
# A random session id is required by the play API.
3715 session = str(random.randint(0, 1000000000))
3717 track_count = data['tracks_count']
3718 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3719 next_url = first_url
3721 for i in itertools.count():
3722 api_json = self._download_webpage(next_url, playlist_id,
3723 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3724 errnote=u'Failed to download song information')
3725 api_data = json.loads(api_json)
3726 track_data = api_data[u'set']['track']
3728 'id': track_data['id'],
3729 'url': track_data['track_file_stream_url'],
3730 'title': track_data['performer'] + u' - ' + track_data['name'],
3731 'raw_title': track_data['name'],
3732 'uploader_id': data['user']['login'],
# The loop ends when the API flags the last track of the set.
3736 if api_data['set']['at_last_track']:
3738 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3741 class KeekIE(InfoExtractor):
# Information extractor for keek.com; media and thumbnail URLs are built
# directly from the video id on the CDN.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3742 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3745 def _real_extract(self, url):
3746 m = re.match(self._VALID_URL, url)
3747 video_id = m.group('videoID')
3749 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3750 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3751 webpage = self._download_webpage(url, video_id)
3753 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3756 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3757 webpage, u'uploader', fatal=False)
3763 'title': video_title,
3764 'thumbnail': thumbnail,
3765 'uploader': uploader
3769 class TEDIE(InfoExtractor):
# Information extractor for www.ted.com; handles both single talks and
# playlists (the latter expand into TED url_results).
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3770 _VALID_URL=r'''http://www\.ted\.com/
3772 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3774 ((?P<type_talk>talks)) # We have a simple talk
3776 (/lang/(.*?))? # The url may contain the language
3777 /(?P<name>\w+) # Here goes the name and then ".html"
3781 def suitable(cls, url):
3782 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode regex.
3783 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3785 def _real_extract(self, url):
3786 m=re.match(self._VALID_URL, url, re.VERBOSE)
3787 if m.group('type_talk'):
3788 return [self._talk_info(url)]
3790 playlist_id=m.group('playlist_id')
3791 name=m.group('name')
3792 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3793 return [self._playlist_videos_info(url,name,playlist_id)]
3795 def _talk_video_link(self,mediaSlug):
3796 '''Returns the video link for that mediaSlug'''
3797 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3799 def _playlist_videos_info(self,url,name,playlist_id=0):
3800 '''Returns the videos of the playlist'''
3802 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3803 ([.\s]*?)data-playlist_item_id="(\d+)"
3804 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3806 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3807 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3808 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3809 m_names=re.finditer(video_name_RE,webpage)
3811 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3812 m_playlist = re.search(playlist_RE, webpage)
3813 playlist_title = m_playlist.group('playlist_title')
# Each talk found on the playlist page becomes a deferred url_result
# handled by this same extractor ('TED').
3815 playlist_entries = []
3816 for m_video, m_name in zip(m_videos,m_names):
3817 video_id=m_video.group('video_id')
3818 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3819 playlist_entries.append(self.url_result(talk_url, 'TED'))
3820 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3822 def _talk_info(self, url, video_id=0):
3823 """Return the video for the talk in the url"""
3824 m=re.match(self._VALID_URL, url,re.VERBOSE)
3825 videoName=m.group('name')
3826 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3827 # If the url includes the language we get the title translated
3828 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3829 title=re.search(title_RE, webpage).group('title')
3830 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3831 "id":(?P<videoID>[\d]+).*?
3832 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3833 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3834 thumb_match=re.search(thumb_RE,webpage)
3835 info_match=re.search(info_RE,webpage,re.VERBOSE)
3836 video_id=info_match.group('videoID')
3837 mediaSlug=info_match.group('mediaSlug')
3838 video_url=self._talk_video_link(mediaSlug)
3844 'thumbnail': thumb_match.group('thumbnail')
3848 class MySpassIE(InfoExtractor):
# Information extractor for myspass.de; metadata comes from an XML API.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3849 _VALID_URL = r'http://www.myspass.de/.*'
3851 def _real_extract(self, url):
3852 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3854 # video id is the last path element of the URL
3855 # usually there is a trailing slash, so also try the second but last
3856 url_path = compat_urllib_parse_urlparse(url).path
3857 url_parent_path, video_id = os.path.split(url_path)
3859 _, video_id = os.path.split(url_parent_path)
3862 metadata_url = META_DATA_URL_TEMPLATE % video_id
3863 metadata_text = self._download_webpage(metadata_url, video_id)
3864 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3866 # extract values from metadata
# url_flv and title are mandatory; format, description and thumbnail are
# optional, each guarded by a None check on its element.
3867 url_flv_el = metadata.find('url_flv')
3868 if url_flv_el is None:
3869 raise ExtractorError(u'Unable to extract download url')
3870 video_url = url_flv_el.text
3871 extension = os.path.splitext(video_url)[1][1:]
3872 title_el = metadata.find('title')
3873 if title_el is None:
3874 raise ExtractorError(u'Unable to extract title')
3875 title = title_el.text
3876 format_id_el = metadata.find('format_id')
3877 if format_id_el is None:
3880 format = format_id_el.text
3881 description_el = metadata.find('description')
3882 if description_el is not None:
3883 description = description_el.text
3886 imagePreview_el = metadata.find('imagePreview')
3887 if imagePreview_el is not None:
3888 thumbnail = imagePreview_el.text
3897 'thumbnail': thumbnail,
3898 'description': description
3902 class SpiegelIE(InfoExtractor):
# Information extractor for spiegel.de videos; stream data comes from a
# per-video XML document.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3903 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3905 def _real_extract(self, url):
3906 m = re.match(self._VALID_URL, url)
3907 video_id = m.group('videoID')
3909 webpage = self._download_webpage(url, video_id)
3911 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3914 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3915 xml_code = self._download_webpage(xml_url, video_id,
3916 note=u'Downloading XML', errnote=u'Failed to download XML')
3918 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last type-element in the document is used; presumably the highest
# quality variant -- TODO confirm against the XML schema.
3919 last_type = idoc[-1]
3920 filename = last_type.findall('./filename')[0].text
3921 duration = float(last_type.findall('./duration')[0].text)
3923 video_url = 'http://video2.spiegel.de/flash/' + filename
3924 video_ext = filename.rpartition('.')[2]
3929 'title': video_title,
3930 'duration': duration,
3934 class LiveLeakIE(InfoExtractor):
# Information extractor for liveleak.com.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3936 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3937 IE_NAME = u'liveleak'
3939 def _real_extract(self, url):
3940 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3942 raise ExtractorError(u'Invalid URL: %s' % url)
3944 video_id = mobj.group('video_id')
3946 webpage = self._download_webpage(url, video_id)
3948 video_url = self._search_regex(r'file: "(.*?)",',
3949 webpage, u'video URL')
# The og:title carries a site prefix that is stripped for the clean title.
3951 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3952 webpage, u'title').replace('LiveLeak.com -', '').strip()
3954 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3955 webpage, u'description', fatal=False)
3957 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3958 webpage, u'uploader', fatal=False)
3964 'title': video_title,
3965 'description': video_description,
3966 'uploader': video_uploader
3971 class ARDIE(InfoExtractor):
# Information extractor for the ARD Mediathek; picks the highest-quality
# default-type stream and distinguishes RTMP from plain HTTP delivery.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3972 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3973 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3974 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3976 def _real_extract(self, url):
3977 # determine video id from url
3978 m = re.match(self._VALID_URL, url)
# Prefer a numeric documentId query parameter when present.
3980 numid = re.search(r'documentId=([0-9]+)', url)
3982 video_id = numid.group(1)
3984 video_id = m.group('video_id')
3986 # determine title and media streams from webpage
3987 html = self._download_webpage(url, video_id)
3988 title = re.search(self._TITLE, html).group('title')
3989 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# (elided: the emptiness check guarding the fsk assertion/raise below;
# "fsk" marks age-restricted content only available after 8 pm)
3991 assert '"fsk"' in html
3992 raise ExtractorError(u'This video is only available after 8:00 pm')
3994 # choose default media type and highest quality for now
3995 stream = max([s for s in streams if int(s["media_type"]) == 0],
3996 key=lambda s: int(s["quality"]))
3998 # there's two possibilities: RTMP stream or HTTP download
3999 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4000 if stream['rtmp_url']:
4001 self.to_screen(u'RTMP download detected')
4002 assert stream['video_url'].startswith('mp4:')
4003 info["url"] = stream["rtmp_url"]
4004 info["play_path"] = stream['video_url']
4006 assert stream["video_url"].endswith('.mp4')
4007 info["url"] = stream["video_url"]
4010 class ZDFIE(InfoExtractor):
# Information extractor for the ZDF Mediathek; resolves an intermediate
# "wstreaming" ASX page to an mms:// or rtsp:// media URL.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
4011 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4012 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4013 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4014 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4015 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4017 def _real_extract(self, url):
4018 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
4020 raise ExtractorError(u'Invalid URL: %s' % url)
4021 video_id = mobj.group('video_id')
4023 html = self._download_webpage(url, video_id)
4024 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4026 raise ExtractorError(u'No media url found.')
4028 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4029 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4030 # choose first/default media type and highest quality for now
4031 for s in streams: #find 300 - dsl1000mbit
4032 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4035 for s in streams: #find veryhigh - dsl2000mbit
4036 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4040 raise ExtractorError(u'No stream found.')
# The chosen stream URL points at a small descriptor page that contains
# the real mms:// (or rtsp://) URL.
4042 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4044 self.report_extraction(video_id)
4045 mobj = re.search(self._TITLE, html)
4047 raise ExtractorError(u'Cannot extract title')
4048 title = unescapeHTML(mobj.group('title'))
4050 mobj = re.search(self._MMS_STREAM, media_link)
4052 mobj = re.search(self._RTSP_STREAM, media_link)
4054 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4055 mms_url = mobj.group('video_url')
4057 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4059 raise ExtractorError(u'Cannot extract extention')
4060 ext = mobj.group('ext')
4062 return [{'id': video_id,
4068 class TumblrIE(InfoExtractor):
# Extractor for video posts hosted on *.tumblr.com.
4069 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4071 def _real_extract(self, url):
4072 m_url = re.match(self._VALID_URL, url)
4073 video_id = m_url.group('id')
4074 blog = m_url.group('blog_name')
# Normalize to the canonical /post/<id>/ URL before downloading the page.
4076 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4077 webpage = self._download_webpage(url, video_id)
# \\x22 is an escaped double quote inside the page's embedded JavaScript.
4079 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4080 video = re.search(re_video, webpage)
# NOTE(review): the `if video is None:` guard for this raise is not visible in this excerpt.
4082 raise ExtractorError(u'Unable to extract video')
4083 video_url = video.group('video_url')
4084 ext = video.group('ext')
4086 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4087 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Strip the JS escaping backslashes from the thumbnail URL.
4088 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4090 # The only place where you can get a title, it's not complete,
4091 # but searching in other places doesn't work for all videos
4092 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4093 webpage, u'title', flags=re.DOTALL)
4095 return [{'id': video_id,
4097 'title': video_title,
4098 'thumbnail': video_thumbnail,
4102 class BandcampIE(InfoExtractor):
4103 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4105 def _real_extract(self, url):
4106 mobj = re.match(self._VALID_URL, url)
4107 title = mobj.group('title')
4108 webpage = self._download_webpage(url, title)
4109 # We get the link to the free download page
4110 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4111 if m_download is None:
4112 raise ExtractorError(u'No free songs found')
4114 download_link = m_download.group(1)
4115 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4116 webpage, re.MULTILINE|re.DOTALL).group('id')
4118 download_webpage = self._download_webpage(download_link, id,
4119 'Downloading free downloads page')
4120 # We get the dictionary of the track from some javascrip code
4121 info = re.search(r'items: (.*?),$',
4122 download_webpage, re.MULTILINE).group(1)
4123 info = json.loads(info)[0]
4124 # We pick mp3-320 for now, until format selection can be easily implemented.
4125 mp3_info = info[u'downloads'][u'mp3-320']
4126 # If we try to use this url it says the link has expired
4127 initial_url = mp3_info[u'url']
4128 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4129 m_url = re.match(re_url, initial_url)
4130 #We build the url we will use to get the final track url
4131 # This url is build in Bandcamp in the script download_bunde_*.js
4132 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4133 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4134 # If we could correctly generate the .rand field the url would be
4135 #in the "download_url" key
4136 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4138 track_info = {'id':id,
4139 'title' : info[u'title'],
4142 'thumbnail' : info[u'thumb_url'],
4143 'uploader' : info[u'artist']
4148 class RedTubeIE(InfoExtractor):
4149 """Information Extractor for redtube"""
4150 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4152 def _real_extract(self,url):
4153 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard for this raise is not visible in this excerpt.
4155 raise ExtractorError(u'Invalid URL: %s' % url)
4157 video_id = mobj.group('id')
# Site serves MP4 via a <source> tag, so the extension is fixed.
4158 video_extension = 'mp4'
4159 webpage = self._download_webpage(url, video_id)
4161 self.report_extraction(video_id)
4163 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4164 webpage, u'video URL')
4166 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4172 'ext': video_extension,
4173 'title': video_title,
4176 class InaIE(InfoExtractor):
4177 """Information Extractor for Ina.fr"""
4178 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4180 def _real_extract(self,url):
4181 mobj = re.match(self._VALID_URL, url)
4183 video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed rather than the HTML page.
4184 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4185 video_extension = 'mp4'
4186 webpage = self._download_webpage(mrss_url, video_id)
4188 self.report_extraction(video_id)
# NOTE(review): the dots in `mp4.ina.fr` are unescaped metacharacters; they match
# the intended URL but also any character — consider `mp4\.ina\.fr`.
4190 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4191 webpage, u'video URL')
4193 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4199 'ext': video_extension,
4200 'title': video_title,
4203 class HowcastIE(InfoExtractor):
4204 """Information Extractor for Howcast.com"""
4205 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4207 def _real_extract(self, url):
4208 mobj = re.match(self._VALID_URL, url)
4210 video_id = mobj.group('id')
# Re-canonicalize to the plain /videos/<id> URL before fetching.
4211 webpage_url = 'http://www.howcast.com/videos/' + video_id
4212 webpage = self._download_webpage(webpage_url, video_id)
4214 self.report_extraction(video_id)
# The MP4 URL sits in embedded player config (`file: "..."`), not in a video tag.
4216 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4217 webpage, u'video URL')
# Meta tags may use either double or single quotes, hence the alternation.
4219 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4222 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4223 webpage, u'description', fatal=False)
4225 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4226 webpage, u'thumbnail', fatal=False)
4232 'title': video_title,
4233 'description': video_description,
4234 'thumbnail': thumbnail,
4237 class VineIE(InfoExtractor):
4238 """Information Extractor for Vine.co"""
4239 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4241 def _real_extract(self, url):
4242 mobj = re.match(self._VALID_URL, url)
4244 video_id = mobj.group('id')
# Canonical HTTPS permalink; the stream URL is in a twitter:player meta tag.
4245 webpage_url = 'https://vine.co/v/' + video_id
4246 webpage = self._download_webpage(webpage_url, video_id)
4248 self.report_extraction(video_id)
4250 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4251 webpage, u'video URL')
4253 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The optional (\?.*?)? group drops any query string from the thumbnail URL.
4256 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4257 webpage, u'thumbnail', fatal=False)
# DOTALL because the uploader <h2> may span multiple lines inside the user div.
4259 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4260 webpage, u'uploader', fatal=False, flags=re.DOTALL)
4266 'title': video_title,
4267 'thumbnail': thumbnail,
4268 'uploader': uploader,
4271 class FlickrIE(InfoExtractor):
4272 """Information Extractor for Flickr videos"""
4273 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4275 def _real_extract(self, url):
4276 mobj = re.match(self._VALID_URL, url)
4278 video_id = mobj.group('id')
4279 video_uploader_id = mobj.group('uploader_id')
4280 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4281 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo "secret" token is required by the video XML endpoints below.
4283 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
# Two-step resolution: first XML yields a node_id, second XML yields the stream.
4285 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4286 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4288 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4289 first_xml, u'node_id')
4291 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4292 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4294 self.report_extraction(video_id)
4296 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# NOTE(review): the `if mobj is None:` guard for this raise is not visible in this excerpt.
4298 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH from the playlist XML.
4299 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4301 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4302 webpage, u'video title')
4304 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4305 webpage, u'description', fatal=False)
4307 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4308 webpage, u'thumbnail', fatal=False)
4314 'title': video_title,
4315 'description': video_description,
4316 'thumbnail': thumbnail,
4317 'uploader_id': video_uploader_id,
4320 class TeamcocoIE(InfoExtractor):
# Extractor for teamcoco.com; the page's numeric data-id keys a CVP XML feed.
4321 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4323 def _real_extract(self, url):
4324 mobj = re.match(self._VALID_URL, url)
4326 raise ExtractorError(u'Invalid URL: %s' % url)
4327 url_title = mobj.group('url_title')
4328 webpage = self._download_webpage(url, url_title)
4330 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4331 webpage, u'video id')
4333 self.report_extraction(video_id)
4335 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4338 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4339 webpage, u'thumbnail', fatal=False)
4341 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4342 webpage, u'description', fatal=False)
# The real media URL lives in the high-quality <file> entry of the CVP XML.
4344 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4345 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4347 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
4354 'title': video_title,
4355 'thumbnail': thumbnail,
4356 'description': video_description,
4359 class XHamsterIE(InfoExtractor):
4360 """Information Extractor for xHamster"""
4361 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4363 def _real_extract(self,url):
4364 mobj = re.match(self._VALID_URL, url)
4366 video_id = mobj.group('id')
4367 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4368 webpage = self._download_webpage(mrss_url, video_id)
4370 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4372 raise ExtractorError(u'Unable to extract media URL')
4373 if len(mobj.group('server')) == 0:
4374 video_url = compat_urllib_parse.unquote(mobj.group('file'))
4376 video_url = mobj.group('server')+'/key='+mobj.group('file')
4377 video_extension = video_url.split('.')[-1]
4379 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4382 # Can't see the description anywhere in the UI
4383 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4384 # webpage, u'description', fatal=False)
4385 # if video_description: video_description = unescapeHTML(video_description)
4387 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4389 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4391 video_upload_date = None
4392 self._downloader.report_warning(u'Unable to extract upload date')
4394 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4395 webpage, u'uploader id', default=u'anonymous')
4397 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4398 webpage, u'thumbnail', fatal=False)
4403 'ext': video_extension,
4404 'title': video_title,
4405 # 'description': video_description,
4406 'upload_date': video_upload_date,
4407 'uploader_id': video_uploader_id,
4408 'thumbnail': video_thumbnail
4411 class HypemIE(InfoExtractor):
4412 """Information Extractor for hypem"""
4413 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4415 def _real_extract(self, url):
4416 mobj = re.match(self._VALID_URL, url)
4418 raise ExtractorError(u'Invalid URL: %s' % url)
4419 track_id = mobj.group(1)
# The ax/ts query parameters mimic the site's AJAX request signature.
4421 data = { 'ax': 1, 'ts': time.time() }
4422 data_encoded = compat_urllib_parse.urlencode(data)
4423 complete_url = url + "?" + data_encoded
4424 request = compat_urllib_request.Request(complete_url)
4425 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The session cookie must be replayed on the /serve/source request below.
4426 cookie = urlh.headers.get('Set-Cookie', '')
4428 self.report_extraction(track_id)
4430 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4431 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4433 track_list = json.loads(html_tracks)
4434 track = track_list[u'tracks'][0]
4436 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4439 track_id = track[u"id"]
4440 artist = track[u"artist"]
4441 title = track[u"song"]
# NOTE(review): `key` is assigned on a line not visible in this excerpt
# (presumably track[u"key"]) — confirm against the full source.
4443 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4444 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4445 request.add_header('cookie', cookie)
4446 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4448 song_data = json.loads(song_data_json)
4450 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4451 final_url = song_data[u"url"]
4461 class Vbox7IE(InfoExtractor):
4462 """Information Extractor for Vbox7"""
4463 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4465 def _real_extract(self,url):
4466 mobj = re.match(self._VALID_URL, url)
4468 raise ExtractorError(u'Invalid URL: %s' % url)
4469 video_id = mobj.group(1)
# The play: URL serves a JS redirect; follow it manually by scraping
# window.location and appending it to the URL we actually landed on.
4471 redirect_page, urlh = self._download_webpage_handle(url, video_id)
4472 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4473 redirect_url = urlh.geturl() + new_location
4474 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4476 title = self._html_search_regex(r'<title>(.*)</title>',
4477 webpage, u'title').split('/')[0].strip()
# POST to the magare.do endpoint returns '<url>=...&<thumb>=...' style pairs.
4480 info_url = "http://vbox7.com/play/magare.do"
4481 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4482 info_request = compat_urllib_request.Request(info_url, data)
4483 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4484 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4485 if info_response is None:
4486 raise ExtractorError(u'Unable to extract the media url')
# Each '&'-separated field is 'name=value'; keep only the values.
4487 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4494 'thumbnail': thumbnail_url,
4497 class GametrailersIE(InfoExtractor):
4498 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4500 def _real_extract(self, url):
4501 mobj = re.match(self._VALID_URL, url)
4503 raise ExtractorError(u'Invalid URL: %s' % url)
4504 video_id = mobj.group('id')
4505 video_type = mobj.group('type')
4506 webpage = self._download_webpage(url, video_id)
4507 if video_type == 'full-episodes':
4508 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4510 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4511 mgid = self._search_regex(mgid_re, webpage, u'mgid')
4512 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4514 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4515 video_id, u'Downloading video info')
4516 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4517 video_id, u'Downloading video urls info')
4519 self.report_extraction(video_id)
4520 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4521 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4523 <url>(?P<thumb>.*?)</url>.*
4526 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4528 raise ExtractorError(u'Unable to extract video info')
4529 video_title = m_info.group('title')
4530 video_description = m_info.group('description')
4531 video_thumb = m_info.group('thumb')
4533 m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
4534 if m_urls is None or len(m_urls) == 0:
4535 raise ExtractError(u'Unable to extrat video url')
4536 # They are sorted from worst to best quality
4537 video_url = m_urls[-1].group('url')
4539 return {'url': video_url,
4541 'title': video_title,
4542 # Videos are actually flv not mp4
4544 'thumbnail': video_thumb,
4545 'description': video_description,
4548 def gen_extractors():
4549 """ Return a list of an instance of every supported extractor.
4550 The order does matter; the first extractor matched is the one handling the URL.
# The full extractor list is elided in this excerpt; only a few entries are visible.
4553 YoutubePlaylistIE(),
4578 StanfordOpenClassroomIE(),
4588 WorldStarHipHopIE(),
4617 def get_info_extractor(ie_name):
4618 """Returns the info extractor class with the given ie_name"""
4619 return globals()[ie_name+'IE']