2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res = self._search_regex(pattern, string, name, default, fatal, flags)
231 return clean_html(res).strip()
235 class SearchInfoExtractor(InfoExtractor):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
250 def _real_extract(self, query):
251 mobj = re.match(self._make_valid_url(), query)
253 raise ExtractorError(u'Invalid search query "%s"' % query)
255 prefix = mobj.group('prefix')
256 query = mobj.group('query')
258 return self._get_n_results(query, 1)
259 elif prefix == 'all':
260 return self._get_n_results(query, self._MAX_RESULTS)
264 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
265 elif n > self._MAX_RESULTS:
266 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
267 n = self._MAX_RESULTS
268 return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
317 _video_dimensions = {
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
357 def report_video_subtitles_download(self, video_id):
358 """Report attempt to download video info webpage."""
359 self.to_screen(u'%s: Checking available subtitles' % video_id)
361 def report_video_subtitles_request(self, video_id, sub_lang, format):
362 """Report attempt to download video info webpage."""
363 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
374 def report_unavailable_format(self, video_id, format):
375 """Report extracted video URL."""
376 self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
382 def _get_available_subtitles(self, video_id):
383 self.report_video_subtitles_download(video_id)
384 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
386 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 return (u'unable to download video subtitles: %s' % compat_str(err), None)
389 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391 if not sub_lang_list:
392 return (u'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
399 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
402 (error_message, sub_lang, sub)
404 self.report_video_subtitles_request(video_id, sub_lang, format)
405 params = compat_urllib_parse.urlencode({
411 url = 'http://www.youtube.com/api/timedtext?' + params
413 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
417 return (u'Did not fetch video subtitles', None, None)
418 return (None, sub_lang, sub)
420 def _request_automatic_caption(self, video_id, webpage):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang = self._downloader.params.get('subtitleslang') or 'en'
424 sub_format = self._downloader.params.get('subtitlesformat')
425 self.to_screen(u'%s: Looking for automatic captions' % video_id)
426 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
429 return [(err_msg, None, None)]
430 player_config = json.loads(mobj.group(1))
432 args = player_config[u'args']
433 caption_url = args[u'ttsurl']
434 timestamp = args[u'timestamp']
435 params = compat_urllib_parse.urlencode({
442 subtitles_url = caption_url + '&' + params
443 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444 return [(None, sub_lang, sub)]
446 return [(err_msg, None, None)]
448 def _extract_subtitle(self, video_id):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list = self._get_available_subtitles(video_id)
454 sub_format = self._downloader.params.get('subtitlesformat')
455 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list[0], None, None)]
457 if self._downloader.params.get('subtitleslang', False):
458 sub_lang = self._downloader.params.get('subtitleslang')
459 elif 'en' in sub_lang_list:
462 sub_lang = list(sub_lang_list.keys())[0]
463 if not sub_lang in sub_lang_list:
464 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
469 def _extract_all_subtitles(self, video_id):
470 sub_lang_list = self._get_available_subtitles(video_id)
471 sub_format = self._downloader.params.get('subtitlesformat')
472 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list[0], None, None)]
475 for sub_lang in sub_lang_list:
476 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477 subtitles.append(subtitle)
480 def _print_formats(self, formats):
481 print('Available formats:')
483 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
485 def _real_initialize(self):
486 if self._downloader is None:
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError) as err:
506 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
510 request = compat_urllib_request.Request(self._LANG_URL)
513 compat_urllib_request.urlopen(request).read()
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
518 # No authentication to be performed
522 request = compat_urllib_request.Request(self._LOGIN_URL)
524 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
531 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
533 galx = match.group(1)
535 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
541 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u'PersistentCookie': u'yes',
547 u'bgresponse': u'js_disabled',
548 u'checkConnection': u'',
549 u'checkedDomains': u'youtube',
555 u'signIn': u'Sign in',
557 u'service': u'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
563 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
564 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
565 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
568 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
569 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username or password')
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
579 'action_confirm': 'Confirm',
581 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
583 self.report_age_confirmation()
584 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
585 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
586 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
588 def _extract_id(self, url):
589 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
591 raise ExtractorError(u'Invalid URL: %s' % url)
592 video_id = mobj.group(2)
595 def _real_extract(self, url):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj = re.search(self._NEXT_URL_RE, url)
599 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600 video_id = self._extract_id(url)
603 self.report_video_webpage_download(video_id)
604 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request = compat_urllib_request.Request(url)
607 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
611 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
616 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
621 self.report_video_info_webpage_download(video_id)
622 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id, el_type))
625 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 errnote='unable to download video info webpage')
628 video_info = compat_parse_qs(video_info_webpage)
629 if 'token' in video_info:
631 if 'token' not in video_info:
632 if 'reason' in video_info:
633 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
635 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639 raise ExtractorError(u'"rental" videos not supported')
641 # Start extracting information
642 self.report_information_extraction(video_id)
645 if 'author' not in video_info:
646 raise ExtractorError(u'Unable to extract uploader name')
647 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
650 video_uploader_id = None
651 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
653 video_uploader_id = mobj.group(1)
655 self._downloader.report_warning(u'unable to extract uploader nickname')
658 if 'title' not in video_info:
659 raise ExtractorError(u'Unable to extract video title')
660 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
663 if 'thumbnail_url' not in video_info:
664 self._downloader.report_warning(u'unable to extract video thumbnail')
666 else: # don't panic if we can't find it
667 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
671 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
673 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674 upload_date = unified_strdate(upload_date)
677 video_description = get_element_by_id("eow-description", video_webpage)
678 if video_description:
679 video_description = clean_html(video_description)
681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
683 video_description = unescapeHTML(fd_mobj.group(1))
685 video_description = u''
688 video_subtitles = None
690 if self._downloader.params.get('writesubtitles', False):
691 video_subtitles = self._extract_subtitle(video_id)
693 (sub_error, sub_lang, sub) = video_subtitles[0]
695 # We try with the automatic captions
696 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
701 # We report the original error
702 self._downloader.report_warning(sub_error)
704 if self._downloader.params.get('allsubtitles', False):
705 video_subtitles = self._extract_all_subtitles(video_id)
706 for video_subtitle in video_subtitles:
707 (sub_error, sub_lang, sub) = video_subtitle
709 self._downloader.report_warning(sub_error)
711 if self._downloader.params.get('listsubtitles', False):
712 sub_lang_list = self._list_available_subtitles(video_id)
715 if 'length_seconds' not in video_info:
716 self._downloader.report_warning(u'unable to extract video duration')
719 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
722 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
724 # Decide which formats to download
725 req_format = self._downloader.params.get('format', None)
727 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
728 self.report_rtmp_download()
729 video_url_list = [(None, video_info['conn'][0])]
730 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
732 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
733 url_data = compat_parse_qs(url_data_str)
734 if 'itag' in url_data and 'url' in url_data:
735 url = url_data['url'][0]
736 if 'sig' in url_data:
737 url += '&signature=' + url_data['sig'][0]
738 if 'ratebypass' not in url:
739 url += '&ratebypass=yes'
740 url_map[url_data['itag'][0]] = url
742 format_limit = self._downloader.params.get('format_limit', None)
743 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
744 if format_limit is not None and format_limit in available_formats:
745 format_list = available_formats[available_formats.index(format_limit):]
747 format_list = available_formats
748 existing_formats = [x for x in format_list if x in url_map]
749 if len(existing_formats) == 0:
750 raise ExtractorError(u'no known formats available for video')
751 if self._downloader.params.get('listformats', None):
752 self._print_formats(existing_formats)
754 if req_format is None or req_format == 'best':
755 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
756 elif req_format == 'worst':
757 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
758 elif req_format in ('-1', 'all'):
759 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
761 # Specific formats. We pick the first in a slash-delimited sequence.
762 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
763 req_formats = req_format.split('/')
764 video_url_list = None
765 for rf in req_formats:
767 video_url_list = [(rf, url_map[rf])]
769 if video_url_list is None:
770 raise ExtractorError(u'requested format not available')
772 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
775 for format_param, video_real_url in video_url_list:
777 video_extension = self._video_extensions.get(format_param, 'flv')
779 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
780 self._video_dimensions.get(format_param, '???'))
784 'url': video_real_url,
785 'uploader': video_uploader,
786 'uploader_id': video_uploader_id,
787 'upload_date': upload_date,
788 'title': video_title,
789 'ext': video_extension,
790 'format': video_format,
791 'thumbnail': video_thumbnail,
792 'description': video_description,
793 'player_url': player_url,
794 'subtitles': video_subtitles,
795 'duration': video_duration
800 class MetacafeIE(InfoExtractor):
801 """Information Extractor for metacafe.com."""
803 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
804 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
805 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
806 IE_NAME = u'metacafe'
808 def report_disclaimer(self):
809 """Report disclaimer retrieval."""
810 self.to_screen(u'Retrieving disclaimer')
812 def _real_initialize(self):
813 # Retrieve disclaimer
814 request = compat_urllib_request.Request(self._DISCLAIMER)
816 self.report_disclaimer()
817 disclaimer = compat_urllib_request.urlopen(request).read()
818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
819 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
824 'submit': "Continue - I'm over 18",
826 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
828 self.report_age_confirmation()
829 disclaimer = compat_urllib_request.urlopen(request).read()
830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
831 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
833 def _real_extract(self, url):
834 # Extract id and simplified title from URL
835 mobj = re.match(self._VALID_URL, url)
837 raise ExtractorError(u'Invalid URL: %s' % url)
839 video_id = mobj.group(1)
841 # Check if video comes from YouTube
842 mobj2 = re.match(r'^yt-(.*)$', video_id)
843 if mobj2 is not None:
844 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
846 # Retrieve video webpage to extract further information
847 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
849 # Extract URL, uploader and title from webpage
850 self.report_extraction(video_id)
851 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
853 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
854 video_extension = mediaURL[-3:]
856 # Extract gdaKey if available
857 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
861 gdaKey = mobj.group(1)
862 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
864 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
866 raise ExtractorError(u'Unable to extract media URL')
867 vardict = compat_parse_qs(mobj.group(1))
868 if 'mediaData' not in vardict:
869 raise ExtractorError(u'Unable to extract media URL')
870 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
872 raise ExtractorError(u'Unable to extract media URL')
873 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
874 video_extension = mediaURL[-3:]
875 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
877 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
879 raise ExtractorError(u'Unable to extract title')
880 video_title = mobj.group(1).decode('utf-8')
882 mobj = re.search(r'submitter=(.*?);', webpage)
884 raise ExtractorError(u'Unable to extract uploader nickname')
885 video_uploader = mobj.group(1)
888 'id': video_id.decode('utf-8'),
889 'url': video_url.decode('utf-8'),
890 'uploader': video_uploader.decode('utf-8'),
892 'title': video_title,
893 'ext': video_extension.decode('utf-8'),
896 class DailymotionIE(InfoExtractor):
897 """Information Extractor for Dailymotion"""
899 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
900 IE_NAME = u'dailymotion'
902 def _real_extract(self, url):
903 # Extract id and simplified title from URL
904 mobj = re.match(self._VALID_URL, url)
906 raise ExtractorError(u'Invalid URL: %s' % url)
908 video_id = mobj.group(1).split('_')[0].split('?')[0]
910 video_extension = 'mp4'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
914 request.add_header('Cookie', 'family_filter=off')
915 webpage = self._download_webpage(request, video_id)
917 # Extract URL, uploader and title from webpage
918 self.report_extraction(video_id)
919 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
921 raise ExtractorError(u'Unable to extract media URL')
922 flashvars = compat_urllib_parse.unquote(mobj.group(1))
924 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
927 self.to_screen(u'Using %s' % key)
930 raise ExtractorError(u'Unable to extract video URL')
932 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
934 raise ExtractorError(u'Unable to extract video URL')
936 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
938 # TODO: support choosing qualities
940 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
942 raise ExtractorError(u'Unable to extract title')
943 video_title = unescapeHTML(mobj.group('title'))
945 video_uploader = None
946 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
947 # Looking for official user
948 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
949 webpage, 'video uploader')
951 video_upload_date = None
952 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
954 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
959 'uploader': video_uploader,
960 'upload_date': video_upload_date,
961 'title': video_title,
962 'ext': video_extension,
966 class PhotobucketIE(InfoExtractor):
967 """Information extractor for photobucket.com."""
969 # TODO: the original _VALID_URL was:
970 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
971 # Check if it's necessary to keep the old extraction process
972 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
973 IE_NAME = u'photobucket'
975 def _real_extract(self, url):
976 # Extract id from URL
977 mobj = re.match(self._VALID_URL, url)
979 raise ExtractorError(u'Invalid URL: %s' % url)
981 video_id = mobj.group('id')
983 video_extension = mobj.group('ext')
985 # Retrieve video webpage to extract further information
986 webpage = self._download_webpage(url, video_id)
988 # Extract URL, uploader, and title from webpage
989 self.report_extraction(video_id)
990 # We try first by looking the javascript code:
991 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
993 info = json.loads(mobj.group('json'))
996 'url': info[u'downloadUrl'],
997 'uploader': info[u'username'],
998 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
999 'title': info[u'title'],
1000 'ext': video_extension,
1001 'thumbnail': info[u'thumbUrl'],
1004 # We try looking in other parts of the webpage
1005 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1006 webpage, u'video URL')
1008 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1010 raise ExtractorError(u'Unable to extract title')
1011 video_title = mobj.group(1).decode('utf-8')
1012 video_uploader = mobj.group(2).decode('utf-8')
1015 'id': video_id.decode('utf-8'),
1016 'url': video_url.decode('utf-8'),
1017 'uploader': video_uploader,
1018 'upload_date': None,
1019 'title': video_title,
1020 'ext': video_extension.decode('utf-8'),
1024 class YahooIE(InfoExtractor):
1025 """Information extractor for screen.yahoo.com."""
# Two extraction paths, selected on whether the page defines a YUI Media
# CONTENT_ID: without one, the numeric id from the URL is resolved through
# the cosmos.bcst.yahoo.com MRSS endpoints (regex-parsed XML); with one, the
# long id goes through the public YQL endpoint and host/path are read from
# the first stream of the first mediaObj in the JSON reply.
# NOTE(review): elided chunk — the `if` guards before the raises and the
# `if m_id is None:` branch header are among the missing numbered lines.
1026 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1028 def _real_extract(self, url):
1029 mobj = re.match(self._VALID_URL, url)
1031 raise ExtractorError(u'Invalid URL: %s' % url)
1032 video_id = mobj.group('id')
1033 webpage = self._download_webpage(url, video_id)
1034 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1037 # TODO: Check which url parameters are required
1038 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1039 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose regex over the MRSS XML; escaped spaces because of re.VERBOSE.
1040 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1041 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1042 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1043 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1045 self.report_extraction(video_id)
1046 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1048 raise ExtractorError(u'Unable to extract video info')
1049 video_title = m_info.group('title')
1050 video_description = m_info.group('description')
1051 video_thumb = m_info.group('thumb')
1052 video_date = m_info.group('date')
# Date arrives as MM/DD/YYYY; normalized to the YYYYMMDD upload_date format.
1053 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1055 # TODO: Find a way to get mp4 videos
1056 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1057 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1058 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1059 video_url = m_rest.group('url')
1060 video_path = m_rest.group('path')
1062 raise ExtractorError(u'Unable to extract video url')
1064 else: # We have to use a different method if another id is defined
1065 long_id = m_id.group('new_id')
1066 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1067 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing as JSON.
1068 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1069 info = json.loads(json_str)
1070 res = info[u'query'][u'results'][u'mediaObj'][0]
1071 stream = res[u'streams'][0]
1072 video_path = stream[u'path']
1073 video_url = stream[u'host']
# NOTE(review): `meta` is assigned on an elided line (presumably from `res`)
# — confirm against the full source.
1075 video_title = meta[u'title']
1076 video_description = meta[u'description']
1077 video_thumb = meta[u'thumbnail']
1078 video_date = None # I can't find it
1083 'play_path': video_path,
1084 'title':video_title,
1085 'description': video_description,
1086 'thumbnail': video_thumb,
1087 'upload_date': video_date,
1092 class VimeoIE(InfoExtractor):
1093 """Information extractor for vimeo.com."""
# Parses the JSON config embedded in the player page (isolated by splitting
# on ' = {config:' and ',assets:'), then selects a file by quality
# ('hd' > 'sd' > 'other') across the known codecs and builds a signed
# play_redirect URL from the config's request signature and timestamp.
# NOTE(review): elided chunk — guard lines (`if mobj is None:`), the
# try/except around the config parse, the `else:` of the codec loop and the
# final `return` are among the missing numbered lines.
1095 # _VALID_URL matches Vimeo URLs
1096 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1099 def _real_extract(self, url, new_video=True):
1100 # Extract ID from URL
1101 mobj = re.match(self._VALID_URL, url)
1103 raise ExtractorError(u'Invalid URL: %s' % url)
1105 video_id = mobj.group('id')
# Normalize the URL: force https, and send direct-link/vimeopro URLs to the
# canonical vimeo.com video page.
1106 if not mobj.group('proto'):
1107 url = 'https://' + url
1108 if mobj.group('direct_link') or mobj.group('pro'):
1109 url = 'https://vimeo.com/' + video_id
1111 # Retrieve video webpage to extract further information
1112 request = compat_urllib_request.Request(url, None, std_headers)
1113 webpage = self._download_webpage(request, video_id)
1115 # Now we begin extracting as much information as we can from what we
1116 # retrieved. First we extract the information common to all extractors,
1117 # and latter we extract those that are Vimeo specific.
1118 self.report_extraction(video_id)
1120 # Extract the config JSON
1122 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1123 config = json.loads(config)
# If the config cannot be found, a domain-embed restriction is the most
# useful error to surface; otherwise report a generic parse failure.
1125 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1126 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1128 raise ExtractorError(u'Unable to extract info section')
1131 video_title = config["video"]["title"]
1133 # Extract uploader and uploader_id
1134 video_uploader = config["video"]["owner"]["name"]
1135 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1137 # Extract video thumbnail
1138 video_thumbnail = config["video"]["thumbnail"]
1140 # Extract video description
1141 video_description = get_element_by_attribute("itemprop", "description", webpage)
1142 if video_description: video_description = clean_html(video_description)
1143 else: video_description = u''
1145 # Extract upload date
1146 video_upload_date = None
1147 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1148 if mobj is not None:
1149 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1151 # Vimeo specific: extract request signature and timestamp
1152 sig = config['request']['signature']
1153 timestamp = config['request']['timestamp']
1155 # Vimeo specific: extract video codec and quality information
1156 # First consider quality, then codecs, then take everything
1157 # TODO bind to format param
1158 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1159 files = { 'hd': [], 'sd': [], 'other': []}
1160 for codec_name, codec_extension in codecs:
1161 if codec_name in config["video"]["files"]:
1162 if 'hd' in config["video"]["files"][codec_name]:
1163 files['hd'].append((codec_name, codec_extension, 'hd'))
1164 elif 'sd' in config["video"]["files"][codec_name]:
1165 files['sd'].append((codec_name, codec_extension, 'sd'))
1167 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# First non-empty quality bucket wins; codec order above fixes the
# tie-break within a bucket.
1169 for quality in ('hd', 'sd', 'other'):
1170 if len(files[quality]) > 0:
1171 video_quality = files[quality][0][2]
1172 video_codec = files[quality][0][0]
1173 video_extension = files[quality][0][1]
1174 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1177 raise ExtractorError(u'No known codec found')
1179 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1180 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1185 'uploader': video_uploader,
1186 'uploader_id': video_uploader_id,
1187 'upload_date': video_upload_date,
1188 'title': video_title,
1189 'ext': video_extension,
1190 'thumbnail': video_thumbnail,
1191 'description': video_description,
1195 class ArteTvIE(InfoExtractor):
1196 """arte.tv information extractor."""
# Dispatches on the URL shape: live-stream pages (matching _LIVE_URL) go
# through extractLiveStream(); everything else through extractPlus7Stream().
# Both helpers chain grep_webpage() calls, each of which fetches a page,
# applies one regex, and maps the listed groups into an info dict, raising
# the per-group error message when a group is empty.
# NOTE(review): elided chunk — `try:` headers, `return` statements, the
# flags/matchTuples arguments of several grep_webpage calls and some guard
# lines are among the missing numbered lines.
1198 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1199 _LIVE_URL = r'index-[0-9]+\.html$'
1201 IE_NAME = u'arte.tv'
1203 def fetch_webpage(self, url):
# Raw page fetch; network errors are wrapped into ExtractorError.
1204 request = compat_urllib_request.Request(url)
1206 self.report_download_webpage(url)
1207 webpage = compat_urllib_request.urlopen(request).read()
1208 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1209 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1210 except ValueError as err:
1211 raise ExtractorError(u'Invalid URL: %s' % url)
1214 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch url, search regex, and collect matchTuples (group-index, key,
# error-message) entries into a dict keyed by `key`.
1215 page = self.fetch_webpage(url)
1216 mobj = re.search(regex, page, regexFlags)
1220 raise ExtractorError(u'Invalid URL: %s' % url)
1222 for (i, key, err) in matchTuples:
1223 if mobj.group(i) is None:
1224 raise ExtractorError(err)
1226 info[key] = mobj.group(i)
1230 def extractLiveStream(self, url):
# Language code is positional in the URL path for live pages.
1231 video_lang = url.split('/')[-4]
1232 info = self.grep_webpage(
1234 r'src="(.*?/videothek_js.*?\.js)',
1237 (1, 'url', u'Invalid URL: %s' % url)
1240 http_host = url.split('/')[2]
1241 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1242 info = self.grep_webpage(
1244 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1245 '(http://.*?\.swf).*?' +
1249 (1, 'path', u'could not extract video path: %s' % url),
1250 (2, 'player', u'could not extract video player: %s' % url),
1251 (3, 'url', u'could not extract video url: %s' % url)
1254 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1256 def extractPlus7Stream(self, url):
# Plus7 pages use a different path layout for the language code.
1257 video_lang = url.split('/')[-3]
1258 info = self.grep_webpage(
1260 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1263 (1, 'url', u'Invalid URL: %s' % url)
1266 next_url = compat_urllib_parse.unquote(info.get('url'))
1267 info = self.grep_webpage(
1269 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1272 (1, 'url', u'Could not find <video> tag: %s' % url)
1275 next_url = compat_urllib_parse.unquote(info.get('url'))
1277 info = self.grep_webpage(
1279 r'<video id="(.*?)".*?>.*?' +
1280 '<name>(.*?)</name>.*?' +
1281 '<dateVideo>(.*?)</dateVideo>.*?' +
1282 '<url quality="hd">(.*?)</url>',
1285 (1, 'id', u'could not extract video id: %s' % url),
1286 (2, 'title', u'could not extract video title: %s' % url),
1287 (3, 'date', u'could not extract video date: %s' % url),
1288 (4, 'url', u'could not extract video url: %s' % url)
1293 'id': info.get('id'),
1294 'url': compat_urllib_parse.unquote(info.get('url')),
1295 'uploader': u'arte.tv',
1296 'upload_date': unified_strdate(info.get('date')),
1297 'title': info.get('title').decode('utf-8'),
1303 def _real_extract(self, url):
1304 video_id = url.split('/')[-1]
1305 self.report_extraction(video_id)
1307 if re.search(self._LIVE_URL, video_id) is not None:
1308 self.extractLiveStream(url)
1311 info = self.extractPlus7Stream(url)
1316 class GenericIE(InfoExtractor):
1317 """Generic last-resort information extractor."""
# Strategy: first resolve URL-shortener style redirects with a HEAD-only
# opener (falling back to GET on 405); then scan the final page with a
# cascade of progressively broader regexes (JW Player flashvars, generic
# file=/source= parameters, JWPlayer JS loader, twitter:player:stream,
# Open Graph og:video when og:video:type says it is a real video).
# NOTE(review): elided chunk — several `if mobj is None:` fall-through
# lines between the regex attempts, the `class HeadRequest` body's return,
# and some handler plumbing lines are missing from view.
1320 IE_NAME = u'generic'
1322 def report_download_webpage(self, video_id):
1323 """Report webpage download."""
# Warn that we fell back to the generic extractor (suppressed in tests).
1324 if not self._downloader.params.get('test', False):
1325 self._downloader.report_warning(u'Falling back on generic information extractor.')
1326 super(GenericIE, self).report_download_webpage(video_id)
1328 def report_following_redirect(self, new_url):
1329 """Report information extraction."""
1330 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1332 def _test_redirect(self, url):
1333 """Check if it is a redirect, like url shorteners, in case return the new url."""
1334 class HeadRequest(compat_urllib_request.Request):
# Force the HTTP method to HEAD so only headers are fetched.
1335 def get_method(self):
1338 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1340 Subclass the HTTPRedirectHandler to make it use our
1341 HeadRequest also on the redirected URL
1343 def redirect_request(self, req, fp, code, msg, headers, newurl):
1344 if code in (301, 302, 303, 307):
1345 newurl = newurl.replace(' ', '%20')
# Per RFC redirect semantics: drop body-related headers on redirect.
1346 newheaders = dict((k,v) for k,v in req.headers.items()
1347 if k.lower() not in ("content-length", "content-type"))
1348 return HeadRequest(newurl,
1350 origin_req_host=req.get_origin_req_host(),
1353 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1355 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1357 Fallback to GET if HEAD is not allowed (405 HTTP error)
1359 def http_error_405(self, req, fp, code, msg, headers):
1363 newheaders = dict((k,v) for k,v in req.headers.items()
1364 if k.lower() not in ("content-length", "content-type"))
1365 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1367 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with only the handlers needed for the HEAD probe.
1371 opener = compat_urllib_request.OpenerDirector()
1372 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1373 HTTPMethodFallback, HEADRedirectHandler,
1374 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1375 opener.add_handler(handler())
1377 response = opener.open(HeadRequest(url))
1378 if response is None:
1379 raise ExtractorError(u'Invalid URL protocol')
1380 new_url = response.geturl()
1385 self.report_following_redirect(new_url)
1388 def _real_extract(self, url):
1389 new_url = self._test_redirect(url)
1390 if new_url: return [self.url_result(new_url)]
1392 video_id = url.split('/')[-1]
1394 webpage = self._download_webpage(url, video_id)
1395 except ValueError as err:
1396 # since this is the last-resort InfoExtractor, if
1397 # this error is thrown, it'll be thrown here
1398 raise ExtractorError(u'Invalid URL: %s' % url)
1400 self.report_extraction(video_id)
1401 # Start with something easy: JW Player in SWFObject
1402 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1404 # Broaden the search a little bit
1405 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1407 # Broaden the search a little bit: JWPlayer JS loader
1408 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1410 # Try to find twitter cards info
1411 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1413 # We look for Open Graph info:
1414 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1415 m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1416 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1417 if m_video_type is not None:
1418 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
1420 raise ExtractorError(u'Invalid URL: %s' % url)
1422 # It's possible that one of the regexes
1423 # matched, but returned an empty group:
1424 if mobj.group(1) is None:
1425 raise ExtractorError(u'Invalid URL: %s' % url)
1427 video_url = compat_urllib_parse.unquote(mobj.group(1))
1428 video_id = os.path.basename(video_url)
1430 # here's a fun little line of code for you:
1431 video_extension = os.path.splitext(video_id)[1][1:]
1432 video_id = os.path.splitext(video_id)[0]
1434 # it's tempting to parse this further, but you would
1435 # have to take into account all the variations like
1436 # Video Title - Site Name
1437 # Site Name | Video Title
1438 # Video Title - Tagline | Site Name
1439 # and so on and so forth; it's just not practical
1440 video_title = self._html_search_regex(r'<title>(.*)</title>',
1441 webpage, u'video title')
1443 # video uploader is domain name
1444 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1445 url, u'video uploader')
1450 'uploader': video_uploader,
1451 'upload_date': None,
1452 'title': video_title,
1453 'ext': video_extension,
1457 class YoutubeSearchIE(SearchInfoExtractor):
1458 """Information Extractor for YouTube search queries."""
# Pages through the gdata v2 JSON-C search API, 50 results per request,
# accumulating video ids until n results (or the API's totalItems cap) are
# collected, then returns a playlist of youtube.com watch URLs.
# NOTE(review): elided chunk — initialisation of the paging state
# (video_ids, pagenum, limit) before the while loop is missing from view.
1459 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1461 IE_NAME = u'youtube:search'
1462 _SEARCH_KEY = 'ytsearch'
1464 def report_download_page(self, query, pagenum):
1465 """Report attempt to download search page with given number."""
1466 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1468 def _get_n_results(self, query, n):
1469 """Get a specified number of results for a query"""
1475 while (50 * pagenum) < limit:
1476 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the gdata API, hence the +1.
1477 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1478 request = compat_urllib_request.Request(result_url)
1480 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1481 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1482 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1483 api_response = json.loads(data)['data']
1485 if not 'items' in api_response:
1486 raise ExtractorError(u'[youtube] No video results')
1488 new_ids = list(video['id'] for video in api_response['items'])
1489 video_ids += new_ids
# Clamp the target to what the API says actually exists.
1491 limit = min(n, api_response['totalItems'])
1494 if len(video_ids) > n:
1495 video_ids = video_ids[:n]
1496 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1497 return self.playlist_result(videos, query)
1500 class GoogleSearchIE(SearchInfoExtractor):
1501 """Information Extractor for Google Video search queries."""
# Scrapes google.com video-search result pages (10 hits per page), yielding
# each result link as a playlist entry until n entries are gathered or the
# "next page" marker disappears.
# NOTE(review): elided chunk — parts of the `res` dict literal and the
# per-entry dict are missing from view.
1502 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1504 IE_NAME = u'video.google:search'
1505 _SEARCH_KEY = 'gvsearch'
1507 def _get_n_results(self, query, n):
1508 """Get a specified number of results for a query"""
1511 '_type': 'playlist',
1516 for pagenum in itertools.count(1):
1517 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1518 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1519 note='Downloading result page ' + str(pagenum))
1521 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1524 'url': mobj.group(1)
1526 res['entries'].append(e)
# Stop on result-count reached or when no further pages are advertised.
1528 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1531 class YahooSearchIE(SearchInfoExtractor):
1532 """Information Extractor for Yahoo! Video search queries."""
# Queries the video.search.yahoo.com JSON endpoint 30 results at a time and
# turns each result's screen.yahoo.com link into a playlist entry.
# NOTE(review): elided chunk — the `res` dict literal is partly missing, and
# `m` at the final loop-exit test is assigned on an elided line (presumably
# pagination metadata from `info`) — confirm against the full source.
1535 IE_NAME = u'screen.yahoo:search'
1536 _SEARCH_KEY = 'yvsearch'
1538 def _get_n_results(self, query, n):
1539 """Get a specified number of results for a query"""
1542 '_type': 'playlist',
1546 for pagenum in itertools.count(0):
1547 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1548 webpage = self._download_webpage(result_url, query,
1549 note='Downloading results page '+str(pagenum+1))
1550 info = json.loads(webpage)
1552 results = info[u'results']
1554 for (i, r) in enumerate(results):
# Stop once n entries have been collected across pages.
1555 if (pagenum * 30) +i >= n:
1557 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1558 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1559 res['entries'].append(e)
1560 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1566 class YoutubePlaylistIE(InfoExtractor):
"""Information Extractor for YouTube playlists."""
# Pages through the gdata v2 playlist feed (_TEMPLATE_URL), collecting
# (position, watch-URL) pairs so entries can be re-sorted into playlist
# order before being returned as url_results.
# NOTE(review): elided chunk — the verbose-regex middle of _VALID_URL, the
# paging loop header and the videos/page_num initialisation are missing
# from view.
1569 _VALID_URL = r"""(?:
1574 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1575 \? (?:.*?&)*? (?:p|a|list)=
1578 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1581 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1583 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1585 IE_NAME = u'youtube:playlist'
1588 def suitable(cls, url):
1589 """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required because _VALID_URL is written as a verbose regex.
1590 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1592 def _real_extract(self, url):
1593 # Extract playlist id
1594 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1596 raise ExtractorError(u'Invalid URL: %s' % url)
1598 # Download playlist videos from API
1599 playlist_id = mobj.group(1) or mobj.group(2)
1604 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1605 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1608 response = json.loads(page)
1609 except ValueError as err:
1610 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1612 if 'feed' not in response:
1613 raise ExtractorError(u'Got a malformed response from YouTube API')
1614 playlist_title = response['feed']['title']['$t']
1615 if 'entry' not in response['feed']:
1616 # Number of videos is a multiple of self._MAX_RESULTS
1619 for entry in response['feed']['entry']:
1620 index = entry['yt$position']['$t']
1621 if 'media$group' in entry and 'media$player' in entry['media$group']:
1622 videos.append((index, entry['media$group']['media$player']['url']))
# A short page means this was the last one.
1624 if len(response['feed']['entry']) < self._MAX_RESULTS:
1628 videos = [v[1] for v in sorted(videos)]
1630 url_results = [self.url_result(url, 'Youtube') for url in videos]
1631 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1634 class YoutubeChannelIE(InfoExtractor):
1635 """Information Extractor for YouTube channels."""
# Downloads the channel's first HTML listing page, then keeps fetching the
# json-based channel_ajax endpoint while the "load more" marker is present,
# scraping watch?v= ids from each chunk of HTML.
# NOTE(review): elided chunk — `ids_in_page = []` / `return ids_in_page` in
# extract_videos_from_page, the paging-state initialisation, and the
# while-loop header of the ajax paging are missing from view.
1637 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1638 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1639 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1640 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1641 IE_NAME = u'youtube:channel'
1643 def extract_videos_from_page(self, page):
# Collect unique video ids, preserving first-seen order.
1645 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1646 if mobj.group(1) not in ids_in_page:
1647 ids_in_page.append(mobj.group(1))
1650 def _real_extract(self, url):
1651 # Extract channel id
1652 mobj = re.match(self._VALID_URL, url)
1654 raise ExtractorError(u'Invalid URL: %s' % url)
1656 # Download channel page
1657 channel_id = mobj.group(1)
1661 url = self._TEMPLATE_URL % (channel_id, pagenum)
1662 page = self._download_webpage(url, channel_id,
1663 u'Downloading page #%s' % pagenum)
1665 # Extract video identifiers
1666 ids_in_page = self.extract_videos_from_page(page)
1667 video_ids.extend(ids_in_page)
1669 # Download any subsequent channel pages using the json-based channel_ajax query
1670 if self._MORE_PAGES_INDICATOR in page:
1672 pagenum = pagenum + 1
1674 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1675 page = self._download_webpage(url, channel_id,
1676 u'Downloading page #%s' % pagenum)
1678 page = json.loads(page)
# The ajax reply wraps the listing HTML under 'content_html'.
1680 ids_in_page = self.extract_videos_from_page(page['content_html'])
1681 video_ids.extend(ids_in_page)
1683 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1686 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1688 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1689 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1690 return [self.playlist_result(url_entries, channel_id)]
1693 class YoutubeUserIE(InfoExtractor):
1694 """Information Extractor for YouTube users."""
# Pages through the gdata uploads feed _GDATA_PAGE_SIZE videos at a time,
# scraping ids with _VIDEO_INDICATOR, until a short page signals the end.
# NOTE(review): elided chunk — the paging loop header and the
# video_ids/pagenum/ids_in_page initialisations are missing from view.
1696 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1697 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1698 _GDATA_PAGE_SIZE = 50
1699 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1700 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1701 IE_NAME = u'youtube:user'
1703 def _real_extract(self, url):
1705 mobj = re.match(self._VALID_URL, url)
1707 raise ExtractorError(u'Invalid URL: %s' % url)
1709 username = mobj.group(1)
1711 # Download video ids using YouTube Data API. Result size per
1712 # query is limited (currently to 50 videos) so we need to query
1713 # page by page until there are no video ids - it means we got
# gdata start-index is 1-based.
1720 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1722 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1723 page = self._download_webpage(gdata_url, username,
1724 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1726 # Extract video identifiers
1729 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1730 if mobj.group(1) not in ids_in_page:
1731 ids_in_page.append(mobj.group(1))
1733 video_ids.extend(ids_in_page)
1735 # A little optimization - if current page is not
1736 # "full", ie. does not contain PAGE_SIZE video ids then
1737 # we can assume that this page is the last one - there
1738 # are no more ids on further pages - no need to query
1741 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1746 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1747 url_results = [self.url_result(url, 'Youtube') for url in urls]
1748 return [self.playlist_result(url_results, playlist_title = username)]
1751 class BlipTVUserIE(InfoExtractor):
1752 """Information Extractor for blip.tv users."""
# Resolves the numeric users_id from the user's mobile page, then pages
# through the show_get_full_episode_list ajax endpoint until a short page
# (fewer than _PAGE_SIZE ids) signals the end.
# NOTE(review): elided chunk — the paging loop header, `_PAGE_SIZE`, and
# the video_ids/pagenum/ids_in_page initialisations are missing from view.
1754 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1756 IE_NAME = u'blip.tv:user'
1758 def _real_extract(self, url):
1760 mobj = re.match(self._VALID_URL, url)
1762 raise ExtractorError(u'Invalid URL: %s' % url)
1764 username = mobj.group(1)
1766 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1768 page = self._download_webpage(url, username, u'Downloading user page')
1769 mobj = re.search(r'data-users-id="([^"]+)"', page)
1770 page_base = page_base % mobj.group(1)
1773 # Download video ids using BlipTV Ajax calls. Result size per
1774 # query is limited (currently to 12 videos) so we need to query
1775 # page by page until there are no video ids - it means we got
1782 url = page_base + "&page=" + str(pagenum)
1783 page = self._download_webpage(url, username,
1784 u'Downloading video ids from page %d' % pagenum)
1786 # Extract video identifiers
1789 for mobj in re.finditer(r'href="/([^"]+)"', page):
1790 if mobj.group(1) not in ids_in_page:
1791 ids_in_page.append(unescapeHTML(mobj.group(1)))
1793 video_ids.extend(ids_in_page)
1795 # A little optimization - if current page is not
1796 # "full", ie. does not contain PAGE_SIZE video ids then
1797 # we can assume that this page is the last one - there
1798 # are no more ids on further pages - no need to query
1801 if len(ids_in_page) < self._PAGE_SIZE:
1806 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1807 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1808 return [self.playlist_result(url_entries, playlist_title = username)]
1811 class DepositFilesIE(InfoExtractor):
1812 """Information extractor for depositfiles.com"""
# POSTs the "Free download" form to the English-locale page, then scrapes
# the real fileshare URL; on failure it tries to surface the site's own
# restriction message before giving a generic error.
# NOTE(review): elided chunk — the `try:` header before the urlopen and the
# result-dict wrapper lines are missing from view; the .decode('utf-8')
# calls imply Python 2-era byte strings.
1814 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1816 def _real_extract(self, url):
1817 file_id = url.split('/')[-1]
1818 # Rebuild url in english locale
1819 url = 'http://depositfiles.com/en/files/' + file_id
1821 # Retrieve file webpage with 'Free download' button pressed
1822 free_download_indication = { 'gateway_result' : '1' }
1823 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1825 self.report_download_webpage(file_id)
1826 webpage = compat_urllib_request.urlopen(request).read()
1827 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1828 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1830 # Search for the real file URL
1831 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1832 if (mobj is None) or (mobj.group(1) is None):
1833 # Try to figure out reason of the error.
1834 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1835 if (mobj is not None) and (mobj.group(1) is not None):
1836 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1837 raise ExtractorError(u'%s' % restriction_message)
1839 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1841 file_url = mobj.group(1)
1842 file_extension = os.path.splitext(file_url)[1][1:]
1844 # Search for file title
1845 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1848 'id': file_id.decode('utf-8'),
1849 'url': file_url.decode('utf-8'),
1851 'upload_date': None,
1852 'title': file_title,
1853 'ext': file_extension.decode('utf-8'),
1857 class FacebookIE(InfoExtractor):
1858 """Information Extractor for Facebook"""
# Logs in (credentials from --username/--password or .netrc) during
# _real_initialize, then extracts the video by locating the swf parameter
# blob between the BEFORE/AFTER markers, URL-unquoting its 'params' entry,
# and reading hd_src (falling back to sd_src) from the first video_data.
# NOTE(review): elided chunk — the login_form construction, several early
# returns, and the result-dict wrapper lines are missing from view.
1860 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1861 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1862 _NETRC_MACHINE = 'facebook'
1863 IE_NAME = u'facebook'
1865 def report_login(self):
1866 """Report attempt to log in."""
1867 self.to_screen(u'Logging in')
1869 def _real_initialize(self):
1870 if self._downloader is None:
1875 downloader_params = self._downloader.params
1877 # Attempt to use provided username and password or .netrc data
1878 if downloader_params.get('username', None) is not None:
1879 useremail = downloader_params['username']
1880 password = downloader_params['password']
1881 elif downloader_params.get('usenetrc', False):
1883 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1884 if info is not None:
1888 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1889 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are non-fatal: warn and proceed without credentials.
1890 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1893 if useremail is None:
1902 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1905 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> in the response means authentication did not succeed.
1906 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1907 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1909 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1910 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1913 def _real_extract(self, url):
1914 mobj = re.match(self._VALID_URL, url)
1916 raise ExtractorError(u'Invalid URL: %s' % url)
1917 video_id = mobj.group('ID')
1919 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1920 webpage = self._download_webpage(url, video_id)
# The swf variable blob sits between these two literal JS fragments.
1922 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1923 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1924 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1926 raise ExtractorError(u'Cannot parse data')
1927 data = dict(json.loads(m.group(1)))
1928 params_raw = compat_urllib_parse.unquote(data['params'])
1929 params = json.loads(params_raw)
1930 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD.
1931 video_url = video_data.get('hd_src')
1933 video_url = video_data['sd_src']
1935 raise ExtractorError(u'Cannot find video URL')
1936 video_duration = int(video_data['video_duration'])
1937 thumbnail = video_data['thumbnail_src']
1939 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1944 'title': video_title,
1947 'duration': video_duration,
1948 'thumbnail': thumbnail,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
1953 class BlipTVIE(InfoExtractor):
1954 """Information extractor for blip.tv"""
1956 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Regex used to pull the filename extension out of a media URL.
1957 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1958 IE_NAME = u'blip.tv'
1960 def report_direct_download(self, title):
1961 """Report information extraction."""
1962 self.to_screen(u'%s: Direct download detected' % title)
1964 def _real_extract(self, url):
1965 mobj = re.match(self._VALID_URL, url)
1967 raise ExtractorError(u'Invalid URL: %s' % url)
# api.swf#ID URLs are rewritten to /play/ URLs, which then redirect.
1969 # See https://github.com/rg3/youtube-dl/issues/857
1970 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1971 if api_mobj is not None:
1972 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1973 urlp = compat_urllib_parse_urlparse(url)
1974 if urlp.path.startswith('/play/'):
# Follow the /play/ redirect and recover the file id from the URL fragment,
# then recurse with the canonical http://blip.tv/a/a-<id> URL.
1975 request = compat_urllib_request.Request(url)
1976 response = compat_urllib_request.urlopen(request)
1977 redirecturl = response.geturl()
1978 rurlp = compat_urllib_parse_urlparse(redirecturl)
1979 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1980 url = 'http://blip.tv/a/a-' + file_id
1981 return self._real_extract(url)
# Ask blip.tv's JSON API for the metadata (cchar assignment elided above).
1988 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1989 request = compat_urllib_request.Request(json_url)
# The API behaves differently for iTunes; spoof its user agent.
1990 request.add_header('User-Agent', 'iTunes/10.6.1')
1991 self.report_extraction(mobj.group(1))
1994 urlh = compat_urllib_request.urlopen(request)
1995 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1996 basename = url.split('/')[-1]
1997 title,ext = os.path.splitext(basename)
1998 title = title.decode('UTF-8')
1999 ext = ext.replace('.', '')
2000 self.report_direct_download(title)
2005 'upload_date': None,
2010 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2011 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2012 if info is None: # Regular URL
# Parse the JSON API response for the regular (non-direct-download) case.
2014 json_code_bytes = urlh.read()
2015 json_code = json_code_bytes.decode('utf-8')
2016 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2017 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2020 json_data = json.loads(json_code)
2021 if 'Post' in json_data:
2022 data = json_data['Post']
# blip.tv timestamps look like '%m-%d-%y %H:%M%p'; normalize to YYYYMMDD.
2026 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2027 video_url = data['media']['url']
2028 umobj = re.match(self._URL_EXT, video_url)
2030 raise ValueError('Can not determine filename extension')
2031 ext = umobj.group(1)
2034 'id': data['item_id'],
2036 'uploader': data['display_name'],
2037 'upload_date': upload_date,
2038 'title': data['title'],
2040 'format': data['media']['mimeType'],
2041 'thumbnail': data['thumbnailUrl'],
2042 'description': data['description'],
2043 'player_url': data['embedUrl'],
# Carry the spoofed UA forward so the download uses the same identity.
2044 'user_agent': 'iTunes/10.6.1',
2046 except (ValueError,KeyError) as err:
2047 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2052 class MyVideoIE(InfoExtractor):
2053 """Information Extractor for myvideo.de."""
2055 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2056 IE_NAME = u'myvideo'
2058 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2059 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2060 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher used to decrypt the player XML (KSA below, PRGA elided
# in part by the dump).
2061 def __rc4crypt(self,data, key):
2063 box = list(range(256))
2064 for i in list(range(256)):
2065 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2066 box[i], box[x] = box[x], box[i]
2072 y = (y + box[x]) % 256
2073 box[x], box[y] = box[y], box[x]
2074 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# MD5 helper (its def line is elided); returns the hex digest as bytes.
2078 return hashlib.md5(s).hexdigest().encode()
2080 def _real_extract(self,url):
2081 mobj = re.match(self._VALID_URL, url)
2083 raise ExtractorError(u'invalid URL: %s' % url)
2085 video_id = mobj.group(1)
# Obfuscated key material, double-base64-encoded (assignment header elided).
2088 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2089 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2090 b'TnpsbA0KTVRkbU1tSTRNdz09'
2094 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2095 webpage = self._download_webpage(webpage_url, video_id)
# Easy case first: a plain <source src=...> tag means a direct FLV URL.
2097 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2098 if mobj is not None:
2099 self.report_extraction(video_id)
2100 video_url = mobj.group(1) + '.flv'
2102 video_title = self._html_search_regex('<title>([^<]+)</title>',
2105 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2111 'upload_date': None,
2112 'title': video_title,
# Hard case: parameters live in a flashvars JS object.
2117 mobj = re.search('var flashvars={(.+?)}', webpage)
2119 raise ExtractorError(u'Unable to extract video')
2124 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2125 if not a == '_encxml':
2128 encxml = compat_urllib_parse.unquote(b)
2129 if not params.get('domain'):
2130 params['domain'] = 'www.myvideo.de'
2131 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2132 if 'flash_playertype=MTV' in xmldata_url:
# The MTV player variant is not supported; fall back to player type D.
2133 self._downloader.report_warning(u'avoiding MTV player')
2135 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2136 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The response is '...=<hex>'; take the hex payload and RC4-decrypt it.
2140 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2141 enc_data_b = binascii.unhexlify(enc_data)
2143 base64.b64decode(base64.b64decode(GK)) +
2145 str(video_id).encode('utf-8')
2148 dec_data = self.__rc4crypt(enc_data_b, sk)
2151 self.report_extraction(video_id)
# RTMP case: connectionurl carries the stream URL.
2154 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2156 video_url = compat_urllib_parse.unquote(mobj.group(1))
2157 if 'myvideo2flash' in video_url:
2158 self._downloader.report_warning(u'forcing RTMPT ...')
2159 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2162 # extract non rtmp videos
2163 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2165 raise ExtractorError(u'unable to extract url')
2166 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2168 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2169 video_file = compat_urllib_parse.unquote(video_file)
2171 if not video_file.endswith('f4m'):
# play_path is '<ext>:<path>' as expected by rtmpdump.
2172 ppath, prefix = video_file.split('.')
2173 video_playpath = '%s:%s' % (prefix, ppath)
2174 video_hls_playlist = ''
# .f4m manifests map to an .m3u8 HLS playlist on the same path.
2177 video_hls_playlist = (
2178 video_filepath + video_file
2179 ).replace('.f4m', '.m3u8')
2181 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2182 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2184 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2190 'tc_url': video_url,
2192 'upload_date': None,
2193 'title': video_title,
2195 'play_path': video_playpath,
2196 'video_file': video_file,
2197 'video_hls_playlist': video_hls_playlist,
2198 'player_url': video_swfobj,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2202 class ComedyCentralIE(InfoExtractor):
2203 """Information extractor for The Daily Show and Colbert Report """
2205 # urls can be abbreviations like :thedailyshow or :colbert
2206 # urls for episodes like:
2207 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2208 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2209 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose-mode regex: matched with re.VERBOSE (see suitable()/_real_extract).
2210 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2211 |(https?://)?(www\.)?
2212 (?P<showname>thedailyshow|colbertnation)\.com/
2213 (full-episodes/(?P<episode>.*)|
2215 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2216 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; the last entry is picked as "best" below.
2219 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2221 _video_extensions = {
2229 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the re.VERBOSE flag.
2239 def suitable(cls, url):
2240 """Receives a URL and returns True if suitable for this IE."""
2241 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2243 def _print_formats(self, formats):
2244 print('Available formats:')
2246 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2249 def _real_extract(self, url):
2250 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2252 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname forms (:tds, :colbert, ...) expand to the full-episodes page.
2254 if mobj.group('shortname'):
2255 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2256 url = u'http://www.thedailyshow.com/full-episodes/'
2258 url = u'http://www.colbertnation.com/full-episodes/'
2259 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2260 assert mobj is not None
2262 if mobj.group('clip'):
2263 if mobj.group('showname') == 'thedailyshow':
2264 epTitle = mobj.group('tdstitle')
2266 epTitle = mobj.group('cntitle')
# No episode in the URL means "download the newest"; handled via redirect.
2269 dlNewest = not mobj.group('episode')
2271 epTitle = mobj.group('showname')
2273 epTitle = mobj.group('episode')
2275 self.report_extraction(epTitle)
2276 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Re-match against the redirected URL to learn the concrete episode.
2278 url = htmlHandle.geturl()
2279 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2281 raise ExtractorError(u'Invalid redirected URL: ' + url)
2282 if mobj.group('episode') == '':
2283 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2284 epTitle = mobj.group('episode')
2286 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2288 if len(mMovieParams) == 0:
2289 # The Colbert Report embeds the information in a without
2290 # a URL prefix; so extract the alternate reference
2291 # and then add the URL prefix manually.
2293 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2294 if len(altMovieParams) == 0:
2295 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2297 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2299 uri = mMovieParams[0][1]
# One RSS index per show; each <item> is one part of the episode.
2300 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2301 indexXml = self._download_webpage(indexUrl, epTitle,
2302 u'Downloading show index',
2303 u'unable to download episode index')
2307 idoc = xml.etree.ElementTree.fromstring(indexXml)
2308 itemEls = idoc.findall('.//item')
2309 for partNum,itemEl in enumerate(itemEls):
2310 mediaId = itemEl.findall('./guid')[0].text
2311 shortMediaId = mediaId.split(':')[-1]
2312 showId = mediaId.split(':')[-2].replace('.com', '')
2313 officialTitle = itemEl.findall('./title')[0].text
2314 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2316 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2317 compat_urllib_parse.urlencode({'uri': mediaId}))
2318 configXml = self._download_webpage(configUrl, epTitle,
2319 u'Downloading configuration for %s' % shortMediaId)
2321 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) pairs; turls accumulation line elided.
2323 for rendition in cdoc.findall('.//rendition'):
2324 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2328 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2331 if self._downloader.params.get('listformats', None):
2332 self._print_formats([i[0] for i in turls])
2335 # For now, just pick the highest bitrate
2336 format,rtmp_video_url = turls[-1]
2338 # Get the format arg from the arg stream
2339 req_format = self._downloader.params.get('format', None)
2341 # Select format if we can find one
2344 format, rtmp_video_url = f, v
# Translate the RTMP URL into a plain HTTP mirror on llnwd.net.
2347 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2349 raise ExtractorError(u'Cannot transform RTMP url')
2350 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2351 video_url = base + m.group('finalid')
2353 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2358 'upload_date': officialDate,
2363 'description': officialTitle,
2365 results.append(info)
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2370 class EscapistIE(InfoExtractor):
2371 """Information extractor for The Escapist """
2373 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2374 IE_NAME = u'escapist'
2376 def _real_extract(self, url):
2377 mobj = re.match(self._VALID_URL, url)
2379 raise ExtractorError(u'Invalid URL: %s' % url)
2380 showName = mobj.group('showname')
2381 videoId = mobj.group('episode')
2383 self.report_extraction(videoId)
2384 webpage = self._download_webpage(url, videoId)
# Metadata comes from <meta> tags; description/thumbnail are optional.
2386 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2387 webpage, u'description', fatal=False)
2389 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2390 webpage, u'thumbnail', fatal=False)
2392 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2393 webpage, u'player url')
# NOTE(review): the error label here says u'player url' but this extracts
# the title — looks like a copy-paste slip in the original (message only).
2395 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2396 webpage, u'player url').split(' : ')[-1]
# The player URL carries a config=... query with the playlist location.
2398 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2399 configUrl = compat_urllib_parse.unquote(configUrl)
2401 configJSON = self._download_webpage(configUrl, videoId,
2402 u'Downloading configuration',
2403 u'unable to download configuration')
2405 # Technically, it's JavaScript, not JSON
# Single-quote to double-quote rewrite so json.loads accepts it; fragile if
# the payload ever contains embedded quotes.
2406 configJSON = configJSON.replace("'", '"')
2409 config = json.loads(configJSON)
2410 except (ValueError,) as err:
2411 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2413 playlist = config['playlist']
# Index 1 is assumed to be the actual video entry of the playlist.
2414 videoUrl = playlist[1]['url']
2419 'uploader': showName,
2420 'upload_date': None,
2423 'thumbnail': imgUrl,
2424 'description': videoDesc,
2425 'player_url': playerUrl,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2430 class CollegeHumorIE(InfoExtractor):
2431 """Information extractor for collegehumor.com"""
2434 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2435 IE_NAME = u'collegehumor'
2437 def report_manifest(self, video_id):
2438 """Report information extraction."""
2439 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2441 def _real_extract(self, url):
2442 mobj = re.match(self._VALID_URL, url)
2444 raise ExtractorError(u'Invalid URL: %s' % url)
2445 video_id = mobj.group('videoid')
# Partial info dict initialisation (opening lines elided in this dump).
2450 'upload_date': None,
2453 self.report_extraction(video_id)
# First request: metadata XML from the moogaloop endpoint.
2454 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2456 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2457 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2458 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2460 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2462 videoNode = mdoc.findall('./video')[0]
2463 info['description'] = videoNode.findall('./description')[0].text
2464 info['title'] = videoNode.findall('./caption')[0].text
2465 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2466 manifest_url = videoNode.findall('./file')[0].text
2468 raise ExtractorError(u'Invalid metadata XML file')
# Second request: the Adobe HDS (f4m) manifest referenced by the metadata.
2470 manifest_url += '?hdcore=2.10.3'
2471 self.report_manifest(video_id)
2473 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2475 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2477 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Elements live in the Adobe f4m XML namespace.
2479 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2480 node_id = media_node.attrib['url']
2481 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2482 except IndexError as err:
2483 raise ExtractorError(u'Invalid manifest file')
# Build the direct fragment URL from the manifest's location and ids.
2485 url_pr = compat_urllib_parse_urlparse(manifest_url)
2486 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2493 class XVideosIE(InfoExtractor):
2494 """Information extractor for xvideos.com"""
2496 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2497 IE_NAME = u'xvideos'
2499 def _real_extract(self, url):
2500 mobj = re.match(self._VALID_URL, url)
2502 raise ExtractorError(u'Invalid URL: %s' % url)
2503 video_id = mobj.group(1)
2505 webpage = self._download_webpage(url, video_id)
2507 self.report_extraction(video_id)
# The page embeds a URL-encoded flv_url flashvar; unquote it for the result.
2510 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2511 webpage, u'video URL'))
# Title is taken from <title>, dropping the trailing " - XVID..." suffix.
2514 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2517 # Extract video thumbnail
2518 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2519 webpage, u'thumbnail', fatal=False)
# Tail of the returned info dict (opening lines elided in this dump).
2525 'upload_date': None,
2526 'title': video_title,
2528 'thumbnail': video_thumbnail,
2529 'description': None,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2535 class SoundcloudIE(InfoExtractor):
2536 """Information extractor for soundcloud.com
2537 To access the media, the uid of the song and a stream token
2538 must be extracted from the page source and the script must make
2539 a request to media.soundcloud.com/crossdomain.xml. Then
2540 the media can be grabbed by requesting from an url composed
2541 of the stream token and uid
2544 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2545 IE_NAME = u'soundcloud'
2547 def report_resolve(self, video_id):
2548 """Report information extraction."""
2549 self.to_screen(u'%s: Resolving id' % video_id)
2551 def _real_extract(self, url):
2552 mobj = re.match(self._VALID_URL, url)
2554 raise ExtractorError(u'Invalid URL: %s' % url)
2556 # extract uploader (which is in the url)
2557 uploader = mobj.group(1)
2558 # extract simple title (uploader + slug of song title)
2559 slug_title = mobj.group(2)
2560 simple_title = uploader + u'-' + slug_title
2561 full_title = '%s/%s' % (uploader, slug_title)
2563 self.report_resolve(full_title)
# Step 1: resolve the page URL to track metadata via the public API.
# The client_id is a hard-coded API key baked into this extractor.
2565 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2566 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2567 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2569 info = json.loads(info_json)
2570 video_id = info['id']
2571 self.report_extraction(full_title)
# Step 2: fetch the stream definitions for the resolved track id.
2573 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2574 stream_json = self._download_webpage(streams_url, full_title,
2575 u'Downloading stream definitions',
2576 u'unable to download stream definitions')
2578 streams = json.loads(stream_json)
# Always picks the 128 kbit/s MP3 HTTP stream.
2579 mediaURL = streams['http_mp3_128_url']
2580 upload_date = unified_strdate(info['created_at'])
# Tail of the returned info dict (opening lines elided in this dump).
2585 'uploader': info['user']['username'],
2586 'upload_date': upload_date,
2587 'title': info['title'],
2589 'description': info['description'],
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2592 class SoundcloudSetIE(InfoExtractor):
2593 """Information extractor for soundcloud.com sets
2594 To access the media, the uid of the song and a stream token
2595 must be extracted from the page source and the script must make
2596 a request to media.soundcloud.com/crossdomain.xml. Then
2597 the media can be grabbed by requesting from an url composed
2598 of the stream token and uid
2601 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2602 IE_NAME = u'soundcloud:set'
2604 def report_resolve(self, video_id):
2605 """Report information extraction."""
2606 self.to_screen(u'%s: Resolving id' % video_id)
2608 def _real_extract(self, url):
2609 mobj = re.match(self._VALID_URL, url)
2611 raise ExtractorError(u'Invalid URL: %s' % url)
2613 # extract uploader (which is in the url)
2614 uploader = mobj.group(1)
2615 # extract simple title (uploader + slug of song title)
2616 slug_title = mobj.group(2)
2617 simple_title = uploader + u'-' + slug_title
2618 full_title = '%s/sets/%s' % (uploader, slug_title)
2620 self.report_resolve(full_title)
# Resolve the set URL to its metadata (same hard-coded client_id as
# SoundcloudIE).
2622 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2623 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2624 info_json = self._download_webpage(resolv_url, full_title)
2627 info = json.loads(info_json)
# Surface API-reported errors before attempting track extraction.
2628 if 'errors' in info:
2629 for err in info['errors']:
2630 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2633 self.report_extraction(full_title)
# One stream-definition request per track in the set.
2634 for track in info['tracks']:
2635 video_id = track['id']
2637 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2638 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2640 self.report_extraction(video_id)
2641 streams = json.loads(stream_json)
2642 mediaURL = streams['http_mp3_128_url']
# Per-track info dict (opening lines elided in this dump).
2647 'uploader': track['user']['username'],
2648 'upload_date': unified_strdate(track['created_at']),
2649 'title': track['title'],
2651 'description': track['description'],
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2656 class InfoQIE(InfoExtractor):
2657 """Information extractor for infoq.com"""
2658 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2660 def _real_extract(self, url):
2661 mobj = re.match(self._VALID_URL, url)
2663 raise ExtractorError(u'Invalid URL: %s' % url)
# No separate id in the URL pattern, so the URL itself serves as video_id.
2665 webpage = self._download_webpage(url, video_id=url)
2666 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref variable.
2669 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2671 raise ExtractorError(u'Unable to extract video url')
2672 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2673 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2676 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2679 # Extract description
2680 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2681 webpage, u'description', fatal=False)
# Derive id and extension from the media file name.
2683 video_filename = video_url.split('/')[-1]
2684 video_id, extension = video_filename.split('.')
2690 'upload_date': None,
2691 'title': video_title,
2692 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2694 'description': video_description,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2699 class MixcloudIE(InfoExtractor):
2700 """Information extractor for www.mixcloud.com"""
# Marked broken; tests are skipped for this extractor (see _WORKING docs).
2702 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2703 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2704 IE_NAME = u'mixcloud'
2706 def report_download_json(self, file_id):
2707 """Report JSON download."""
2708 self.to_screen(u'Downloading json')
2710 def get_urls(self, jsonData, fmt, bitrate='best'):
2711 """Get urls from 'audio_formats' section in json"""
2714 bitrate_list = jsonData[fmt]
2715 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2716 bitrate = max(bitrate_list) # select highest
2718 url_list = jsonData[fmt][bitrate]
# TypeError means jsonData[fmt] is a flat list with no per-bitrate keys.
2719 except TypeError: # we have no bitrate info.
2720 url_list = jsonData[fmt]
2723 def check_urls(self, url_list):
2724 """Returns 1st active url from list"""
2725 for url in url_list:
2727 compat_urllib_request.urlopen(url)
2729 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2734 def _print_formats(self, formats):
2735 print('Available formats:')
2736 for fmt in formats.keys():
2737 for b in formats[fmt]:
2739 ext = formats[fmt][b][0]
2740 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2741 except TypeError: # we have no bitrate info
2742 ext = formats[fmt][0]
2743 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2746 def _real_extract(self, url):
2747 mobj = re.match(self._VALID_URL, url)
2749 raise ExtractorError(u'Invalid URL: %s' % url)
2750 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on a regex group is Python-2-only; under
# Python 3 mobj.group(1) is already str and has no .decode.
2751 uploader = mobj.group(1).decode('utf-8')
2752 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2754 # construct API request
2755 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2756 # retrieve .json file with links to files
2757 request = compat_urllib_request.Request(file_url)
2759 self.report_download_json(file_url)
2760 jsonData = compat_urllib_request.urlopen(request).read()
2761 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2762 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2765 json_data = json.loads(jsonData)
2766 player_url = json_data['player_swf_url']
2767 formats = dict(json_data['audio_formats'])
2769 req_format = self._downloader.params.get('format', None)
2772 if self._downloader.params.get('listformats', None):
2773 self._print_formats(formats)
# 'best' (or unspecified): probe each format until a live URL is found.
2776 if req_format is None or req_format == 'best':
2777 for format_param in formats.keys():
2778 url_list = self.get_urls(formats, format_param)
2780 file_url = self.check_urls(url_list)
2781 if file_url is not None:
2784 if req_format not in formats:
2785 raise ExtractorError(u'Format is not available')
2787 url_list = self.get_urls(formats, req_format)
2788 file_url = self.check_urls(url_list)
2789 format_param = req_format
# Return dict (Python-2-only .decode calls, consistent with above).
2792 'id': file_id.decode('utf-8'),
2793 'url': file_url.decode('utf-8'),
2794 'uploader': uploader.decode('utf-8'),
2795 'upload_date': None,
2796 'title': json_data['name'],
2797 'ext': file_url.split('.')[-1].decode('utf-8'),
2798 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2799 'thumbnail': json_data['thumbnail_url'],
2800 'description': json_data['description'],
2801 'player_url': player_url.decode('utf-8'),
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2804 class StanfordOpenClassroomIE(InfoExtractor):
2805 """Information extractor for Stanford's Open ClassRoom"""
2807 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2808 IE_NAME = u'stanfordoc'
2810 def _real_extract(self, url):
2811 mobj = re.match(self._VALID_URL, url)
2813 raise ExtractorError(u'Invalid URL: %s' % url)
# Three URL shapes: a single video, a course page, and the site root.
2815 if mobj.group('course') and mobj.group('video'): # A specific video
2816 course = mobj.group('course')
2817 video = mobj.group('video')
2819 'id': course + '_' + video,
2821 'upload_date': None,
2824 self.report_extraction(info['id'])
2825 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2826 xmlUrl = baseUrl + video + '.xml'
2828 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2829 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2830 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2831 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2833 info['title'] = mdoc.findall('./title')[0].text
2834 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2836 raise ExtractorError(u'Invalid metadata XML file')
2837 info['ext'] = info['url'].rpartition('.')[2]
2839 elif mobj.group('course'): # A course page
2840 course = mobj.group('course')
2845 'upload_date': None,
2848 coursepage = self._download_webpage(url, info['id'],
2849 note='Downloading course info page',
2850 errnote='Unable to download course info page')
2852 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2854 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2855 coursepage, u'description', fatal=False)
# Each linked VideoPage becomes a reference entry resolved recursively.
2857 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2860 'type': 'reference',
2861 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2865 for entry in info['list']:
2866 assert entry['type'] == 'reference'
2867 results += self.extract(entry['url'])
# Root page: enumerate all courses and recurse into each CoursePage.
2871 'id': 'Stanford OpenClassroom',
2874 'upload_date': None,
2877 self.report_download_webpage(info['id'])
2878 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2880 rootpage = compat_urllib_request.urlopen(rootURL).read()
2881 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2882 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2884 info['title'] = info['id']
2886 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2889 'type': 'reference',
2890 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2895 for entry in info['list']:
2896 assert entry['type'] == 'reference'
2897 results += self.extract(entry['url'])
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2900 class MTVIE(InfoExtractor):
2901 """Information extractor for MTV.com"""
2903 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2906 def _real_extract(self, url):
2907 mobj = re.match(self._VALID_URL, url)
2909 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; default to http for the request.
2910 if not mobj.group('proto'):
2911 url = 'http://' + url
2912 video_id = mobj.group('videoid')
2914 webpage = self._download_webpage(url, video_id)
# Song/artist/uri metadata come from MTV-specific <meta> tags.
2916 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2917 webpage, u'song name', fatal=False)
2919 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2922 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2923 webpage, u'mtvn_uri', fatal=False)
2925 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2926 webpage, u'content id', fatal=False)
# NOTE(review): mtvn_uri/content_id are extracted with fatal=False and can
# be None, which would break this string concatenation — worth confirming
# against the complete original before relying on it.
2928 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2929 self.report_extraction(video_id)
2930 request = compat_urllib_request.Request(videogen_url)
2932 metadataXml = compat_urllib_request.urlopen(request).read()
2933 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2934 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2936 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2937 renditions = mdoc.findall('.//rendition')
2939 # For now, always pick the highest quality.
2940 rendition = renditions[-1]
# Format string: "<ext>-<width>x<height>_<bitrate>" from rendition attrs.
2943 _,_,ext = rendition.attrib['type'].partition('/')
2944 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2945 video_url = rendition.find('./src').text
2947 raise ExtractorError('Invalid rendition field.')
# Tail of the returned info dict (opening lines elided in this dump).
2952 'uploader': performer,
2953 'upload_date': None,
2954 'title': video_title,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2962 class YoukuIE(InfoExtractor):
2963 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id generator: millisecond timestamp plus two random components
# (the enclosing def line is elided in this dump).
2966 nowTime = int(time.time() * 1000)
2967 random1 = random.randint(1000,1998)
2968 random2 = random.randint(1000,9999)
2970 return "%d%d%d" %(nowTime,random1,random2)
2972 def _get_file_ID_mix_string(self, seed):
# Deterministic seeded shuffle of the alphabet used to decode file ids.
2974 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2976 for i in range(len(source)):
2977 seed = (seed * 211 + 30031 ) % 65536
2978 index = math.floor(seed / 65536 * len(source) )
2979 mixed.append(source[int(index)])
2980 source.remove(source[int(index)])
2981 #return ''.join(mixed)
2984 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index list into the real file id via the
# seed-mixed alphabet.
2985 mixed = self._get_file_ID_mix_string(seed)
2986 ids = fileId.split('*')
2990 realId.append(mixed[int(ch)])
2991 return ''.join(realId)
2993 def _real_extract(self, url):
2994 mobj = re.match(self._VALID_URL, url)
2996 raise ExtractorError(u'Invalid URL: %s' % url)
2997 video_id = mobj.group('ID')
2999 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3001 jsondata = self._download_webpage(info_url, video_id)
3003 self.report_extraction(video_id)
3005 config = json.loads(jsondata)
3007 video_title = config['data'][0]['title']
3008 seed = config['data'][0]['seed']
# Format selection among the stream ids advertised by the playlist.
3010 format = self._downloader.params.get('format', None)
3011 supported_format = list(config['data'][0]['streamfileids'].keys())
3013 if format is None or format == 'best':
3014 if 'hd2' in supported_format:
3019 elif format == 'worst':
3027 fileid = config['data'][0]['streamfileids'][format]
3028 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3029 except (UnicodeDecodeError, ValueError, KeyError):
3030 raise ExtractorError(u'Unable to extract info section')
3033 sid = self._gen_sid()
3034 fileid = self._get_file_id(fileid, seed)
3036 #column 8,9 of fileid represent the segment number
3037 #fileid[7:9] should be changed
# One download URL per segment key; segment index is hex-encoded into the
# file id at positions 8-9.
3038 for index, key in enumerate(keys):
3040 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3041 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3044 'id': '%s_part%02d' % (video_id, index),
3045 'url': download_url,
3047 'upload_date': None,
3048 'title': video_title,
3051 files_info.append(info)
3056 class XNXXIE(InfoExtractor):
3057 """Information extractor for xnxx.com"""
3059 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3061 VIDEO_URL_RE = r'flv_url=(.*?)&'
3062 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3063 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3065 def _real_extract(self, url):
3066 mobj = re.match(self._VALID_URL, url)
3068 raise ExtractorError(u'Invalid URL: %s' % url)
3069 video_id = mobj.group(1)
3071 # Get webpage content
3072 webpage = self._download_webpage(url, video_id)
3074 video_url = self._search_regex(self.VIDEO_URL_RE,
3075 webpage, u'video URL')
3076 video_url = compat_urllib_parse.unquote(video_url)
3078 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3081 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3082 webpage, u'thumbnail', fatal=False)
3088 'upload_date': None,
3089 'title': video_title,
3091 'thumbnail': video_thumbnail,
3092 'description': None,
# GooglePlusIE: information extractor for plus.google.com posts.
# NOTE(review): numbered listing with gaps — guards ('if mobj is None:'),
# the 'try:' before line 3156, and the result-dict opener are missing here.
3096 class GooglePlusIE(InfoExtractor):
3097 """Information extractor for plus.google.com."""
3099 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3100 IE_NAME = u'plus.google'
# _real_extract: fetches the post page for metadata, then the photo/video
# page, and picks the highest-resolution redirector.googlevideo.com link.
3102 def _real_extract(self, url):
3103 # Extract id from URL
3104 mobj = re.match(self._VALID_URL, url)
3106 raise ExtractorError(u'Invalid URL: %s' % url)
3108 post_url = mobj.group(0)
3109 video_id = mobj.group(1)
3111 video_extension = 'flv'
3113 # Step 1, Retrieve post webpage to extract further information
3114 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3116 self.report_extraction(video_id)
3118 # Extract update date
3119 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3120 webpage, u'upload date', fatal=False)
3122 # Convert timestring to a format suitable for filename
# NOTE(review): assumes the scraped timestamp is exactly '%Y-%m-%d';
# strptime raises ValueError otherwise — confirm against live pages.
3123 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3124 upload_date = upload_date.strftime('%Y%m%d')
3127 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3128 webpage, u'uploader', fatal=False)
3131 # Get the first line for title
3132 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3133 webpage, 'title', default=u'NA')
3135 # Step 2, Stimulate clicking the image box to launch video
3136 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3137 webpage, u'video page URL')
3138 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3140 # Extract video links on video page
3141 """Extract video links of all sizes"""
3142 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3143 mobj = re.findall(pattern, webpage)
3145 raise ExtractorError(u'Unable to extract video links')
3147 # Sort in resolution
3148 links = sorted(mobj)
3150 # Choose the lowest of the sort, i.e. highest resolution
3151 video_url = links[-1]
3152 # Only get the url. The resolution part in the tuple has no use anymore
3153 video_url = video_url[-1]
3154 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError path re-encodes
# to bytes and decodes with 'unicode-escape' instead.
3156 video_url = video_url.decode("unicode_escape")
3157 except AttributeError: # Python 3
3158 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3164 'uploader': uploader,
3165 'upload_date': upload_date,
3166 'title': video_title,
3167 'ext': video_extension,
# NBAIE: information extractor for nba.com video pages.
# NOTE(review): numbered listing with gaps — the 'if mobj is None:' guard
# and the returned info-dict opener/closer are missing from this view.
3170 class NBAIE(InfoExtractor):
3171 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
# _real_extract: the media URL is constructed directly from the URL path
# against Turner's CDN; the page itself is only scraped for title/description.
3174 def _real_extract(self, url):
3175 mobj = re.match(self._VALID_URL, url)
3177 raise ExtractorError(u'Invalid URL: %s' % url)
3179 video_id = mobj.group(1)
3181 webpage = self._download_webpage(url, video_id)
3183 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Use only the last path segment as the id shown to the user.
3185 shortened_video_id = video_id.rpartition('/')[2]
3186 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3187 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3189 # It isn't there in the HTML it returns to us
3190 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3192 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3195 'id': shortened_video_id,
3199 # 'uploader_date': uploader_date,
3200 'description': description,
# JustinTVIE: information extractor for justin.tv / twitch.tv archives,
# broadcasts (/b/) and chapters (/c/).
# NOTE(review): numbered listing with many missing interior lines (guards,
# dict openers, loop setup such as 'offset = 0' / 'paged = ...').
3204 class JustinTVIE(InfoExtractor):
3205 """Information extractor for justin.tv and twitch.tv"""
3206 # TODO: One broadcast may be split into multiple videos. The key
3207 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3208 # starts at 1 and increases. Can we treat all parts as one video?
3210 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3212 (?P<channelid>[^/]+)|
3213 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3214 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when walking a channel archive via the paged API.
3218 _JUSTIN_PAGE_LIMIT = 100
3219 IE_NAME = u'justin.tv'
3221 def report_download_page(self, channel, offset):
3222 """Report attempt to download a single page of videos."""
3223 self.to_screen(u'%s: Downloading video information from %d to %d' %
3224 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3226 # Return count of items, list of *valid* items
3227 def _parse_page(self, url, video_id):
3228 webpage = self._download_webpage(url, video_id,
3229 u'Downloading video info JSON',
3230 u'unable to download video info JSON')
# A non-list JSON response is an API error object with an 'error' field.
3232 response = json.loads(webpage)
3233 if type(response) != list:
3234 error_text = response.get('error', 'unknown error')
3235 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3237 for clip in response:
3238 video_url = clip['video_file_url']
3240 video_extension = os.path.splitext(video_url)[1][1:]
# Normalize 'YYYY-MM-DD...' start_time into 'YYYYMMDD'.
3241 video_date = re.sub('-', '', clip['start_time'][:10])
3242 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3243 video_id = clip['id']
3244 video_title = clip.get('title', video_id)
3248 'title': video_title,
3249 'uploader': clip.get('channel_name', video_uploader_id),
3250 'uploader_id': video_uploader_id,
3251 'upload_date': video_date,
3252 'ext': video_extension,
3254 return (len(response), info)
# _real_extract: dispatches on which named group matched — whole channel,
# single chapter, or single broadcast — then pages through the JSON API.
3256 def _real_extract(self, url):
3257 mobj = re.match(self._VALID_URL, url)
3259 raise ExtractorError(u'invalid URL: %s' % url)
3261 api_base = 'http://api.justin.tv'
3263 if mobj.group('channelid'):
3265 video_id = mobj.group('channelid')
3266 api = api_base + '/channel/archives/%s.json' % video_id
3267 elif mobj.group('chapterid'):
3268 chapter_id = mobj.group('chapterid')
3270 webpage = self._download_webpage(url, chapter_id)
3271 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3273 raise ExtractorError(u'Cannot find archive of a chapter')
3274 archive_id = m.group(1)
3276 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3277 chapter_info_xml = self._download_webpage(api, chapter_id,
3278 note=u'Downloading chapter information',
3279 errnote=u'Chapter information download failed')
# Locate the <archive> element whose <id> matches the page's archive_id.
3280 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3281 for a in doc.findall('.//archive'):
3282 if archive_id == a.find('./id').text:
3285 raise ExtractorError(u'Could not find chapter in chapter information')
3287 video_url = a.find('./video_file_url').text
3288 video_ext = video_url.rpartition('.')[2] or u'flv'
3290 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3291 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3292 note='Downloading chapter metadata',
3293 errnote='Download of chapter metadata failed')
3294 chapter_info = json.loads(chapter_info_json)
3296 bracket_start = int(doc.find('.//bracket_start').text)
3297 bracket_end = int(doc.find('.//bracket_end').text)
3299 # TODO determine start (and probably fix up file)
3300 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3301 #video_url += u'?start=' + TODO:start_timestamp
3302 # bracket_start is 13290, but we want 51670615
3303 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3304 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3307 'id': u'c' + chapter_id,
3310 'title': chapter_info['title'],
3311 'thumbnail': chapter_info['preview'],
3312 'description': chapter_info['description'],
3313 'uploader': chapter_info['channel']['display_name'],
3314 'uploader_id': chapter_info['channel']['name'],
# Broadcast (/b/) case: single archive lookup by id.
3318 video_id = mobj.group('videoid')
3319 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3321 self.report_extraction(video_id)
# Page through results until a short page (fewer than 'limit' items).
3325 limit = self._JUSTIN_PAGE_LIMIT
3328 self.report_download_page(video_id, offset)
3329 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3330 page_count, page_info = self._parse_page(page_url, video_id)
3331 info.extend(page_info)
3332 if not paged or page_count != limit:
# FunnyOrDieIE: information extractor for funnyordie.com.
# NOTE(review): numbered listing with gaps (guard and info-dict lines missing).
3337 class FunnyOrDieIE(InfoExtractor):
3338 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
# _real_extract: scrapes the second <source> inside the <video> tag for the
# media URL; title falls back from the page h1 to the <title> element.
3340 def _real_extract(self, url):
3341 mobj = re.match(self._VALID_URL, url)
3343 raise ExtractorError(u'invalid URL: %s' % url)
3345 video_id = mobj.group('id')
3346 webpage = self._download_webpage(url, video_id)
3348 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3349 webpage, u'video URL', flags=re.DOTALL)
# Tuple of patterns: tried in order until one matches.
3351 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3352 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3354 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3355 webpage, u'description', fatal=False, flags=re.DOTALL)
3362 'description': video_description,
# SteamIE: information extractor for store.steampowered.com video pages.
# NOTE(review): numbered listing with gaps — the @classmethod decorator for
# suitable() and parts of the per-video loop are missing from this view.
3366 class SteamIE(InfoExtractor):
# _VALID_URL is written with (?x)-style whitespace/comments, hence the
# explicit re.VERBOSE in suitable()/_real_extract below.
3367 _VALID_URL = r"""http://store\.steampowered\.com/
3369 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3371 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3373 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3374 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3377 def suitable(cls, url):
3378 """Receives a URL and returns True if suitable for this IE."""
3379 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# _real_extract: downloads the game's video page (retrying through the
# age gate if needed) and zips together movie URLs, titles and thumbs.
3381 def _real_extract(self, url):
3382 m = re.match(self._VALID_URL, url, re.VERBOSE)
3383 gameID = m.group('gameID')
3385 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3386 webpage = self._download_webpage(videourl, gameID)
3388 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3389 videourl = self._AGECHECK_TEMPLATE % gameID
3390 self.report_age_confirmation()
3391 webpage = self._download_webpage(videourl, gameID)
3393 self.report_extraction(gameID)
3394 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3395 webpage, 'game title')
3397 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3398 mweb = re.finditer(urlRE, webpage)
3399 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3400 titles = re.finditer(namesRE, webpage)
3401 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3402 thumbs = re.finditer(thumbsRE, webpage)
# Matches from the three regexes are assumed to be in the same order.
3404 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3405 video_id = vid.group('videoID')
3406 title = vtitle.group('videoName')
3407 video_url = vid.group('videoURL')
3408 video_thumb = thumb.group('thumbnail')
3410 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3415 'title': unescapeHTML(title),
3416 'thumbnail': video_thumb
3419 return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: information extractor for ustream.tv recorded videos.
# NOTE(review): numbered listing with gaps (info-dict opener/closer missing).
3421 class UstreamIE(InfoExtractor):
3422 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3423 IE_NAME = u'ustream'
# _real_extract: media URL is derived directly from the numeric video id;
# the page is only scraped for title/uploader/thumbnail metadata.
3425 def _real_extract(self, url):
3426 m = re.match(self._VALID_URL, url)
3427 video_id = m.group('videoID')
3429 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3430 webpage = self._download_webpage(url, video_id)
3432 self.report_extraction(video_id)
3434 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3437 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3438 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3440 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3441 webpage, u'thumbnail', fatal=False)
3447 'title': video_title,
3448 'uploader': uploader,
3449 'thumbnail': thumbnail,
# WorldStarHipHopIE: information extractor for worldstarhiphop.com
# (and the 'worldstarcandy' mirror).
# NOTE(review): numbered listing with gaps (ext selection after the 'mp4'
# check and the returned dict opener are missing from this view).
3453 class WorldStarHipHopIE(InfoExtractor):
3454 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3455 IE_NAME = u'WorldStarHipHop'
3457 def _real_extract(self, url):
3458 m = re.match(self._VALID_URL, url)
3459 video_id = m.group('id')
3461 webpage_src = self._download_webpage(url, video_id)
# Media URL comes from the flash player's addVariable("file", ...) call.
3463 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3464 webpage_src, u'video URL')
3466 if 'mp4' in video_url:
3471 video_title = self._html_search_regex(r"<title>(.*)</title>",
3472 webpage_src, u'title')
3474 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3475 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3476 webpage_src, u'thumbnail', fatal=False)
# Candy pages carry the real title in a 'candytitles' span instead.
3479 _title = r"""candytitles.*>(.*)</span>"""
3480 mobj = re.search(_title, webpage_src)
3481 if mobj is not None:
3482 video_title = mobj.group(1)
3487 'title' : video_title,
3488 'thumbnail' : thumbnail,
# RBMARadioIE: information extractor for rbmaradio.com shows.
# NOTE(review): numbered listing with gaps ('try:' before line 3506 and the
# returned dict opener are missing from this view).
3493 class RBMARadioIE(InfoExtractor):
3494 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
# _real_extract: pulls the embedded 'gon.show=' JSON blob out of the page
# and reads all metadata from it; stream URL is the akamai_url at 256 kbps.
3496 def _real_extract(self, url):
3497 m = re.match(self._VALID_URL, url)
3498 video_id = m.group('videoID')
3500 webpage = self._download_webpage(url, video_id)
3502 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3503 webpage, u'json data', flags=re.MULTILINE)
3506 data = json.loads(json_data)
3507 except ValueError as e:
3508 raise ExtractorError(u'Invalid JSON: ' + str(e))
3510 video_url = data['akamai_url'] + '&cbr=256'
# File extension is taken from the URL path, not the query string.
3511 url_parts = compat_urllib_parse_urlparse(video_url)
3512 video_ext = url_parts.path.rpartition('.')[2]
3517 'title': data['title'],
3518 'description': data.get('teaser_text'),
3519 'location': data.get('country_of_origin'),
3520 'uploader': data.get('host', {}).get('name'),
3521 'uploader_id': data.get('host', {}).get('slug'),
3522 'thumbnail': data.get('image', {}).get('large_url_2x'),
3523 'duration': data.get('duration'),
# YouPornIE: information extractor for youporn.com; supports multiple
# formats, --list-formats, and format selection (best/worst/all/specific).
# NOTE(review): numbered listing with gaps — 'try:' blocks, the per-link
# loop header, and several return statements are missing from this view.
3529 class YouPornIE(InfoExtractor):
3530 """Information extractor for youporn.com."""
3531 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3532 def _print_formats(self, formats):
3533 """Print all available formats"""
3534 print(u'Available formats:')
3535 print(u'ext\t\tformat')
3536 print(u'---------------------------------')
3537 for format in formats:
3538 print(u'%s\t\t%s' % (format['ext'], format['format']))
# _specific: selects the entry whose 'format' equals req_format
# (loop header and return are among the missing lines).
3540 def _specific(self, req_format, formats):
3542 if(x["format"]==req_format):
3546 def _real_extract(self, url):
3547 mobj = re.match(self._VALID_URL, url)
3549 raise ExtractorError(u'Invalid URL: %s' % url)
3550 video_id = mobj.group('videoid')
# An age_verified cookie is required to see the download list.
3552 req = compat_urllib_request.Request(url)
3553 req.add_header('Cookie', 'age_verified=1')
3554 webpage = self._download_webpage(req, video_id)
3556 # Get JSON parameters
3557 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3559 params = json.loads(json_params)
3561 raise ExtractorError(u'Invalid JSON')
3563 self.report_extraction(video_id)
3565 video_title = params['title']
3566 upload_date = unified_strdate(params['release_date_f'])
3567 video_description = params['description']
3568 video_uploader = params['submitted_by']
3569 thumbnail = params['thumbnails'][0]['image']
3571 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3573 # Get all of the formats available
3574 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3575 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3576 webpage, u'download list').strip()
3578 # Get all of the links from the page
3579 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3580 links = re.findall(LINK_RE, download_list_html)
3581 if(len(links) == 0):
3582 raise ExtractorError(u'ERROR: no known formats available for video')
3584 self.to_screen(u'Links found: %d' % len(links))
3589 # A link looks like this:
3590 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3591 # A path looks like this:
3592 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format label (e.g. '480p-370k') is rebuilt from path components.
3593 video_url = unescapeHTML( link )
3594 path = compat_urllib_parse_urlparse( video_url ).path
3595 extension = os.path.splitext( path )[1][1:]
3596 format = path.split('/')[4].split('_')[:2]
3599 format = "-".join( format )
3600 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3605 'uploader': video_uploader,
3606 'upload_date': upload_date,
3607 'title': video_title,
3610 'thumbnail': thumbnail,
3611 'description': video_description
3614 if self._downloader.params.get('listformats', None):
3615 self._print_formats(formats)
3618 req_format = self._downloader.params.get('format', None)
3619 self.to_screen(u'Format: %s' % req_format)
# formats is ordered best-first: [0] is best, [-1] is worst.
3621 if req_format is None or req_format == 'best':
3623 elif req_format == 'worst':
3624 return [formats[-1]]
3625 elif req_format in ('-1', 'all'):
3628 format = self._specific( req_format, formats )
3630 raise ExtractorError(u'Requested format not available')
# PornotubeIE: information extractor for pornotube.com.
# NOTE(review): numbered listing with gaps (the 'if mobj is None:' guard and
# part of the info dict are missing from this view).
3635 class PornotubeIE(InfoExtractor):
3636 """Information extractor for pornotube.com."""
3637 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
# _real_extract: title comes from the URL itself; the flv URL and upload
# date are scraped from the watch page.
3639 def _real_extract(self, url):
3640 mobj = re.match(self._VALID_URL, url)
3642 raise ExtractorError(u'Invalid URL: %s' % url)
3644 video_id = mobj.group('videoid')
3645 video_title = mobj.group('title')
3647 # Get webpage content
3648 webpage = self._download_webpage(url, video_id)
3651 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3652 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3653 video_url = compat_urllib_parse.unquote(video_url)
3655 #Get the uploaded date
3656 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
# Date is optional; normalized to YYYYMMDD only when present.
3657 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3658 if upload_date: upload_date = unified_strdate(upload_date)
3660 info = {'id': video_id,
3663 'upload_date': upload_date,
3664 'title': video_title,
# YouJizzIE: information extractor for youjizz.com.
# NOTE(review): numbered listing with gaps (the 'if result is None:' guard
# and parts of the info dict are missing from this view).
3670 class YouJizzIE(InfoExtractor):
3671 """Information extractor for youjizz.com."""
3672 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
# _real_extract: scrapes the title from the main page, follows the embed
# page, and reads the media URL from the player's addVariable call.
3674 def _real_extract(self, url):
3675 mobj = re.match(self._VALID_URL, url)
3677 raise ExtractorError(u'Invalid URL: %s' % url)
3679 video_id = mobj.group('videoid')
3681 # Get webpage content
3682 webpage = self._download_webpage(url, video_id)
3684 # Get the video title
3685 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3686 webpage, u'title').strip()
3688 # Get the embed page
3689 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3691 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed id replaces the slug-based id from the original URL.
3693 embed_page_url = result.group(0).strip()
3694 video_id = result.group('videoid')
3696 webpage = self._download_webpage(embed_page_url, video_id)
3699 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3700 webpage, u'video URL')
3702 info = {'id': video_id,
3704 'title': video_title,
3707 'player_url': embed_page_url}
# EightTracksIE: information extractor for 8tracks.com mixes (playlists).
# NOTE(review): numbered listing with gaps — 'mix_id' is read below but its
# assignment line is missing from this view, as are the loop's dict opener
# and the final return.
3711 class EightTracksIE(InfoExtractor):
3713 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
# _real_extract: reads the embedded 'PAGE.mix' JSON, then walks the
# play/next API one track at a time until at_last_track is reported.
3715 def _real_extract(self, url):
3716 mobj = re.match(self._VALID_URL, url)
3718 raise ExtractorError(u'Invalid URL: %s' % url)
3719 playlist_id = mobj.group('id')
3721 webpage = self._download_webpage(url, playlist_id)
3723 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3724 data = json.loads(json_like)
# Random session id is required by the 8tracks play API.
3726 session = str(random.randint(0, 1000000000))
3728 track_count = data['tracks_count']
3729 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3730 next_url = first_url
3732 for i in itertools.count():
3733 api_json = self._download_webpage(next_url, playlist_id,
3734 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3735 errnote=u'Failed to download song information')
3736 api_data = json.loads(api_json)
3737 track_data = api_data[u'set']['track']
3739 'id': track_data['id'],
3740 'url': track_data['track_file_stream_url'],
3741 'title': track_data['performer'] + u' - ' + track_data['name'],
3742 'raw_title': track_data['name'],
3743 'uploader_id': data['user']['login'],
3747 if api_data['set']['at_last_track']:
3749 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: information extractor for keek.com.
# NOTE(review): numbered listing with gaps (info-dict opener/closer missing).
3752 class KeekIE(InfoExtractor):
3753 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
# _real_extract: media and thumbnail URLs are built directly from the
# video id against the keek CDN; the page supplies title and uploader.
3756 def _real_extract(self, url):
3757 m = re.match(self._VALID_URL, url)
3758 video_id = m.group('videoID')
3760 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3761 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3762 webpage = self._download_webpage(url, video_id)
3764 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3767 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3768 webpage, u'uploader', fatal=False)
3774 'title': video_title,
3775 'thumbnail': thumbnail,
3776 'uploader': uploader
# TEDIE: information extractor for www.ted.com talks and playlists.
# NOTE(review): numbered listing with gaps — the @classmethod decorator for
# suitable(), parts of the verbose regexes, and the _talk_info return dict
# opener are missing from this view.
3781 class TEDIE(InfoExtractor):
# Verbose (?x)-style pattern; matched with re.VERBOSE below.
3781 _VALID_URL=r'''http://www\.ted\.com/
3783 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3785 ((?P<type_talk>talks)) # We have a simple talk
3787 (/lang/(.*?))? # The url may contain the language
3788 /(?P<name>\w+) # Here goes the name and then ".html"
3792 def suitable(cls, url):
3793 """Receives a URL and returns True if suitable for this IE."""
3794 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# _real_extract: dispatches to a single talk or a playlist of talks.
3796 def _real_extract(self, url):
3797 m=re.match(self._VALID_URL, url, re.VERBOSE)
3798 if m.group('type_talk'):
3799 return [self._talk_info(url)]
3801 playlist_id=m.group('playlist_id')
3802 name=m.group('name')
3803 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3804 return [self._playlist_videos_info(url,name,playlist_id)]
3806 def _playlist_videos_info(self,url,name,playlist_id=0):
3807 '''Returns the videos of the playlist'''
3809 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3810 ([.\s]*?)data-playlist_item_id="(\d+)"
3811 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3813 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3814 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3815 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3816 m_names=re.finditer(video_name_RE,webpage)
3818 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3819 webpage, 'playlist title')
# Each playlist entry is delegated back to this extractor as a url_result.
3821 playlist_entries = []
3822 for m_video, m_name in zip(m_videos,m_names):
3823 video_id=m_video.group('video_id')
3824 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3825 playlist_entries.append(self.url_result(talk_url, 'TED'))
3826 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3828 def _talk_info(self, url, video_id=0):
3829 """Return the video for the talk in the url"""
3830 m = re.match(self._VALID_URL, url,re.VERBOSE)
3831 video_name = m.group('name')
3832 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3833 self.report_extraction(video_name)
3834 # If the url includes the language we get the title translated
3835 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3837 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3838 webpage, 'json data')
3839 info = json.loads(json_data)
3840 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3841 webpage, 'description', flags = re.DOTALL)
3843 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3844 webpage, 'thumbnail')
# Last htmlStreams entry is used as the media URL.
3847 'url': info['htmlStreams'][-1]['file'],
3850 'thumbnail': thumbnail,
3851 'description': desc,
3855 class MySpassIE(InfoExtractor):
3856 _VALID_URL = r'http://www.myspass.de/.*'
3858 def _real_extract(self, url):
3859 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3861 # video id is the last path element of the URL
3862 # usually there is a trailing slash, so also try the second but last
3863 url_path = compat_urllib_parse_urlparse(url).path
3864 url_parent_path, video_id = os.path.split(url_path)
3866 _, video_id = os.path.split(url_parent_path)
3869 metadata_url = META_DATA_URL_TEMPLATE % video_id
3870 metadata_text = self._download_webpage(metadata_url, video_id)
3871 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3873 # extract values from metadata
3874 url_flv_el = metadata.find('url_flv')
3875 if url_flv_el is None:
3876 raise ExtractorError(u'Unable to extract download url')
3877 video_url = url_flv_el.text
3878 extension = os.path.splitext(video_url)[1][1:]
3879 title_el = metadata.find('title')
3880 if title_el is None:
3881 raise ExtractorError(u'Unable to extract title')
3882 title = title_el.text
3883 format_id_el = metadata.find('format_id')
3884 if format_id_el is None:
3887 format = format_id_el.text
3888 description_el = metadata.find('description')
3889 if description_el is not None:
3890 description = description_el.text
3893 imagePreview_el = metadata.find('imagePreview')
3894 if imagePreview_el is not None:
3895 thumbnail = imagePreview_el.text
3904 'thumbnail': thumbnail,
3905 'description': description
# SpiegelIE: information extractor for spiegel.de videos.
# NOTE(review): numbered listing with gaps (info-dict opener/closer missing).
3909 class SpiegelIE(InfoExtractor):
3910 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
# _real_extract: scrapes the title from the page, then reads the flash XML
# descriptor; the last XML entry is taken as the best variant.
3912 def _real_extract(self, url):
3913 m = re.match(self._VALID_URL, url)
3914 video_id = m.group('videoID')
3916 webpage = self._download_webpage(url, video_id)
3918 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3921 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3922 xml_code = self._download_webpage(xml_url, video_id,
3923 note=u'Downloading XML', errnote=u'Failed to download XML')
3925 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: last child element of the XML root (highest quality variant).
3926 last_type = idoc[-1]
3927 filename = last_type.findall('./filename')[0].text
3928 duration = float(last_type.findall('./duration')[0].text)
3930 video_url = 'http://video2.spiegel.de/flash/' + filename
3931 video_ext = filename.rpartition('.')[2]
3936 'title': video_title,
3937 'duration': duration,
# LiveLeakIE: information extractor for liveleak.com view pages.
# NOTE(review): '(?:http?://)' in _VALID_URL makes only the final 'p' of
# 'http' optional — presumably 'https?://' was intended; confirm before
# changing, since only http URLs are known to be produced by the site.
# Span is a numbered listing with gaps (guard and dict opener missing).
3941 class LiveLeakIE(InfoExtractor):
3943 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3944 IE_NAME = u'liveleak'
# _real_extract: media URL comes from the player config's 'file:' entry;
# title/description/uploader are scraped from page meta tags.
3946 def _real_extract(self, url):
3947 mobj = re.match(self._VALID_URL, url)
3949 raise ExtractorError(u'Invalid URL: %s' % url)
3951 video_id = mobj.group('video_id')
3953 webpage = self._download_webpage(url, video_id)
3955 video_url = self._search_regex(r'file: "(.*?)",',
3956 webpage, u'video URL')
# Site prefixes og:title with 'LiveLeak.com -'; strip it for a clean title.
3958 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3959 webpage, u'title').replace('LiveLeak.com -', '').strip()
3961 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3962 webpage, u'description', fatal=False)
3964 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3965 webpage, u'uploader', fatal=False)
3971 'title': video_title,
3972 'description': video_description,
3973 'uploader': video_uploader
# ARDIE: information extractor for ardmediathek.de / mediathek.daserste.de.
# NOTE(review): numbered listing with gaps — the 'if numid:' / else branch
# headers around lines 3989-3991, the empty-streams guard before 3998, and
# the final return are missing from this view.
3978 class ARDIE(InfoExtractor):
3979 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3980 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3981 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3983 def _real_extract(self, url):
3984 # determine video id from url
# Prefer the numeric documentId query parameter when present.
3985 m = re.match(self._VALID_URL, url)
3987 numid = re.search(r'documentId=([0-9]+)', url)
3989 video_id = numid.group(1)
3991 video_id = m.group('video_id')
3993 # determine title and media streams from webpage
3994 html = self._download_webpage(url, video_id)
3995 title = re.search(self._TITLE, html).group('title')
3996 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker means the video is age-restricted.
3998 assert '"fsk"' in html
3999 raise ExtractorError(u'This video is only available after 8:00 pm')
4001 # choose default media type and highest quality for now
4002 stream = max([s for s in streams if int(s["media_type"]) == 0],
4003 key=lambda s: int(s["quality"]))
4005 # there's two possibilities: RTMP stream or HTTP download
4006 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4007 if stream['rtmp_url']:
4008 self.to_screen(u'RTMP download detected')
4009 assert stream['video_url'].startswith('mp4:')
4010 info["url"] = stream["rtmp_url"]
4011 info["play_path"] = stream['video_url']
4013 assert stream["video_url"].endswith('.mp4')
4014 info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDFmediathek (www.zdf.de)."""

    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page <h1> headline, used as the video title.
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    # "play" links carrying streaming type (wstreaming/hstreaming) and quality.
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    # mms:// URL inside the downloaded stream-descriptor page.
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    # rtsp:// fallback URL in the same descriptor.
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Resolve a mediathek page to its mms:// (or rtsp://) stream URL."""
        # NOTE(review): several `if ... is None:` / `else:` guard lines around
        # the raises below appear to have been lost in this copy; each raise
        # should only fire when its preceding lookup failed.
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # NOTE(review): the loop bodies (assigning `stream_` and breaking) are
        # missing from this copy.
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
        raise ExtractorError(u'No stream found.')

        # The chosen stream URL points at a small descriptor page which in
        # turn carries the real mms:// or rtsp:// media URL.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Try mms:// first, then fall back to rtsp://.
        mobj = re.search(self._MMS_STREAM, media_link)
        mobj = re.search(self._RTSP_STREAM, media_link)
        raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the filename extension from the media URL.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr video posts."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video URL, title and thumbnail from a Tumblr post."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded in escaped JavaScript (\x22 == '"').
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        # NOTE(review): presumably guarded by `if video is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        # NOTE(review): the 'url'/'ext' entries and the closing of this return
        # appear to be elided in this copy.
        return [{'id': video_id,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks.

    Only tracks that expose a free download page can be extracted; the
    final mp3-320 URL is obtained through the statdownload endpoint.
    """

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: renamed local `id` -> `track_id`; `id` shadowed the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist'],
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Extract the mp4 source URL and title from a redtube video page."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The page exposes the file directly in a <source> tag.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        # NOTE(review): the closing arguments of this call and the return
        # statement's opening appear to be elided in this copy.
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            'ext': video_extension,
            'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Fetch the per-video MRSS feed and pull the mp4 URL and title."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata lives in an MRSS document served by the player host.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # Title is wrapped in CDATA inside the feed's <title> element.
        # NOTE(review): the closing arguments of this call and the return
        # statement's opening appear to be elided in this copy.
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            'ext': video_extension,
            'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract URL, title, description and thumbnail of a Howcast video."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Re-fetch via the canonical URL rebuilt from the numeric id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The mobile mp4 URL is embedded in the player's JS config.
        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # NOTE(review): the closing arguments of this call appear elided.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the return statement's opening and 'id'/'url'/'ext'
        # entries appear to be elided in this copy.
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract stream URL, title, thumbnail and uploader of a vine."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The raw stream URL is exposed via the twitter:player:stream meta tag.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        # NOTE(review): the closing arguments of this call appear elided.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        # NOTE(review): the return statement's opening and 'id'/'url'/'ext'
        # entries appear to be elided in this copy.
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr photo page to its video stream via two XML hops."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both video XML endpoints.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # Final stream URL = RTMP app + full path from the playlist XML.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the return statement's opening and 'id'/'url'/'ext'
        # entries appear to be elided in this copy.
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Find the numeric video id on the page, then read its data XML."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The URL only carries the slug; the numeric id is in the article tag.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        # NOTE(review): the closing arguments of this call appear elided.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # Per-video XML manifest listing the downloadable file URLs.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # NOTE(review): the closing arguments of this call and the return
        # statement's opening appear to be elided in this copy.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract the media URL plus title/upload date/uploader/thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Canonical page URL rebuilt from the numeric id.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries an optional server plus the file token.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server -> 'file' is already a (url-encoded) absolute URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        # NOTE(review): an `else:` branch appears to be elided before this line.
        video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        # NOTE(review): the closing arguments of this call appear elided.
        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is embedded as a tooltip hint 'YYYY-MM-DD hh:mm:ss TZ';
        # NOTE(review): the if/else around the two assignments below appears
        # to be elided (warning branch should only run when the hint is absent).
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        video_upload_date = None
        self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the return statement's opening and 'id'/'url' entries
        # appear to be elided in this copy.
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Resolve a Hype Machine track page to the served audio URL."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped query is required; keep the Set-Cookie for /serve.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as JSON in a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        # NOTE(review): upstream wraps the two lines below in try/except
        # ValueError; the raise should only fire on a JSON decode failure.
        track_list = json.loads(html_tracks)
        track = track_list[u'tracks'][0]
        raise ExtractorError(u'Hypemachine contained invalid JSON.')

        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # NOTE(review): `key` is read below but its assignment (presumably
        # key = track[u"key"]) is missing from this copy -- confirm upstream.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        # NOTE(review): same elided try/except pattern as above.
        song_data = json.loads(song_data_json)
        raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Follow the JS redirect, then POST to magare.do for the media URL."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via JS; follow the relative location by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The info endpoint answers a form POST with '&'-separated k=v pairs.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        # NOTE(review): the return statement's opening and the other dict
        # entries appear to be elided in this copy.
            'thumbnail': thumbnail_url,
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos, reviews and
    full episodes (MTV-hosted media, resolved via the mrss/mediagen feeds).
    """

    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes store the mgid in data-video; other page types use
        # data-contentId.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        # Two feeds: mrss carries the metadata, mediagen the stream URLs.
        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # FIX: was `raise ExtractError(u'Unable to extrat video url')` --
        # ExtractError does not exist (would raise NameError instead) and the
        # message was misspelled; also `list()` never returns None, so a
        # simple truthiness test suffices.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url': video_url,
                'id': video_id,
                'title': video_title,
                # Videos are actually flv not mp4
                'ext': 'flv',
                'thumbnail': video_thumb,
                'description': video_description,
                }
class StatigrIE(InfoExtractor):
    """Information Extractor for Statigram (statigr.am) Instagram video pages."""

    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)
        video_url = re.search(r'<meta property="og:video:secure_url" content="(.+?)">',webpage).group(1)
        thumbnail_url = re.search(r'<meta property="og:image" content="(.+?)" />',webpage).group(1)
        title = re.search(r'<title>(.+?)</title>',webpage).group(1)
        # FIX: this used title.strip("| Statigram"), but str.strip removes any
        # of those *characters* from both ends (mangling titles that begin or
        # end with e.g. 'S', 'a', 'm', ...); remove the literal suffix instead.
        suffix = ' | Statigram'
        if title.endswith(suffix):
            title = title[:-len(suffix)]
        # Titles look like '@user (Videos) ...', so the uploader can be pulled
        # back out of the cleaned title.
        uploader = re.search(r'@(.+) \(Videos\)',title).group(1)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'uploader' : uploader
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the opening `return [` and most entries of the extractor
    # list appear to be missing from this copy; only three survive below.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    class_name = '%sIE' % ie_name
    return globals()[class_name]