2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor base class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        title:          Video title, unescaped.
        ext:            Video filename extension.

    The following fields are optional:

        format:         The video format, defaults to ext (used for --get-format)
        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.
        uploader:       Full name of the video uploader.
        upload_date:    Video upload date (YYYYMMDD).
        uploader_id:    Nickname or id of the video uploader.
        location:       Physical location of the video.
        player_url:     SWF Player URL (used for rtmpdump).
        subtitles:      The subtitle file contents.
        urlhandle:      [internal] The urlHandle to be used to download the file,
                        like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Lazily-initialized instance state; see initialize()/set_downloader().
    _ready = False
    _downloader = None
    # Whether this extractor is known to work; see class docstring.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Public extractor name, derived from the class name ("FooIE" -> "Foo").
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None -> default progress line; note=False -> silent.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, keeping the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        charset_m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # Fall back to UTF-8 when the server does not declare a charset.
        encoding = charset_m.group(1) if charset_m else 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        content, _ = self._download_webpage_handle(url_or_request, video_id, note, errnote)
        return content

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608: they set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name when stderr is a capable terminal.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (default: one result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search "URL" and delegate to _get_n_results().

        Raises ExtractorError for a malformed query or a non-positive
        count; counts above _MAX_RESULTS are clamped with a warning.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message: "sublclasses" -> "subclasses".
        raise NotImplementedError("This method must be implemented by subclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt appears to have some lines elided (guards,
    # try:/else: headers, dict bodies); comments below document intent
    # without altering the visible code.

    # Verbose-mode URL pattern fragment (matched with re.VERBOSE in
    # suitable()/_extract_id); the unnamed group below is the video ID.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow

    # Endpoints used during initialization (language, login, age gate).
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (default 'flv' applied at lookup sites).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display string, used by _print_formats().
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report that available subtitle languages are being checked."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report a subtitle download attempt for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        # Returns {lang_code: lang_name} on success, or an
        # (error_message, None) tuple on failure — callers distinguish the
        # two via isinstance(..., tuple).
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Scrape (name, lang_code) pairs out of the timedtext track list XML.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # Empty body means the requested language/format track is missing.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives in the inline ytplayer.config JSON blob.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        params = compat_urllib_parse.urlencode({
        subtitles_url = caption_url + '&' + params
        sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
        return [(None, sub_lang, sub)]
            return [(err_msg, None, None)]

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > 'en' > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)

    def _extract_all_subtitles(self, video_id):
        # Like _extract_subtitle(), but fetches every available language.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)

    def _print_formats(self, formats):
        # Pretty-print "itag : ext [WxH]" for each itag in formats.
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Set language, then (optionally) log in and confirm age.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # Best-effort: a bad .netrc only produces a warning.
            self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language (cookie-based; failure is non-fatal).
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX / dsh tokens required by the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present in the response, login failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age; unlike the steps above this failure is fatal.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        # Return the video ID (second group of _VALID_URL) or raise.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JS-escaped URL ("\\/" -> "/").
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' contexts until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader (mandatory)
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (optional; only a warning when missing)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title (mandatory)
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

        # description: element scrape first, <meta> tag as fallback.
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        video_description = unescapeHTML(fd_mobj.group(1))
        video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            (sub_error, sub_lang, sub) = video_subtitles[0]
            # We try with the automatic captions
            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
            (sub_error_auto, sub_lang, sub) = video_subtitles[0]
            # We report the original error
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        # duration (optional; only a warning when missing)
        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    # NOTE(review): 'sig' is read unguarded although only
                    # 'itag'/'url' are checked above — potential KeyError.
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            # Restrict the itag list when --max-quality was given.
            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this excerpt appears to have some lines elided (guards
    # and try: headers); comments below document intent only.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter endpoints posted to during _real_initialize().
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the family-filter form.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate "yt-XXXX" ids to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # Container extension guessed from the URL's last three characters.
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob for mediaData.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            raise ExtractorError(u'Unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode('utf-8') on a str raises under Python 3;
        # webpage is already text here — verify against the compat layer.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Single-entry result list; the .decode calls below share the
        # Python 3 concern noted above.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this excerpt appears to have some guard lines elided;
    # comments below document intent only.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Strip the "_title" suffix and any query string from the slug.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated pages are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present in flashvars, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for an official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        # Upload date: DD-MM-YYYY on the page, normalized to YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Single-entry result list.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this excerpt appears to have some guard lines elided;
    # comments below document intent only.

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
            info = json.loads(mobj.group('json'))
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                # creationDate is a Unix timestamp; format as YYYYMMDD.
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode('utf-8') on a str raises under Python 3;
        # webpage is already text here — verify against the compat layer.
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        # Fallback single-entry result; .decode calls share the concern above.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
1027 class YahooIE(InfoExtractor):
1028 """Information extractor for screen.yahoo.com."""
1029 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1031 def _real_extract(self, url):
# Two extraction paths: if the page exposes a YUI CONTENT_ID, query the
# YQL JSON API with that long id; otherwise fall back to the bcst.yahoo.com
# MRSS endpoints and parse them with regexes.
1032 mobj = re.match(self._VALID_URL, url)
1034 raise ExtractorError(u'Invalid URL: %s' % url)
1035 video_id = mobj.group('id')
1036 webpage = self._download_webpage(url, video_id)
1037 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1040 # TODO: Check which url parameters are required
1041 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1042 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose/DOTALL regex over the MRSS XML: title, description, upload date
# (only the date part before the space) and the large thumbnail URL.
1043 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1044 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1045 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1046 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1048 self.report_extraction(video_id)
1049 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1051 raise ExtractorError(u'Unable to extract video info')
1052 video_title = m_info.group('title')
1053 video_description = m_info.group('description')
1054 video_thumb = m_info.group('thumb')
1055 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD upload_date convention.
1056 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1058 # TODO: Find a way to get mp4 videos
1059 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1060 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1061 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1062 video_url = m_rest.group('url')
1063 video_path = m_rest.group('path')
1065 raise ExtractorError(u'Unable to extract video url')
1067 else: # We have to use a different method if another id is defined
1068 long_id = m_id.group('new_id')
# YQL query (URL-encoded) selecting streams for the long id; response is JSONP.
1069 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1070 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing.
1071 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1072 info = json.loads(json_str)
1073 res = info[u'query'][u'results'][u'mediaObj'][0]
# First stream: host + path are used for rtmp-style playback (play_path below).
1074 stream = res[u'streams'][0]
1075 video_path = stream[u'path']
1076 video_url = stream[u'host']
# NOTE(review): `meta` is bound on a line not visible in this excerpt,
# presumably res[u'meta'] — verify.
1078 video_title = meta[u'title']
1079 video_description = meta[u'description']
1080 video_thumb = meta[u'thumbnail']
1081 video_date = None # I can't find it
1086 'play_path': video_path,
1087 'title':video_title,
1088 'description': video_description,
1089 'thumbnail': video_thumb,
1090 'upload_date': video_date,
1095 class VimeoIE(InfoExtractor):
1096 """Information extractor for vimeo.com."""
1098 # _VALID_URL matches Vimeo URLs
1099 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1102 def _real_extract(self, url, new_video=True):
# Extract the numeric id, normalize the URL (force https, canonicalize
# pro/direct links), pull the player config JSON out of the page, then pick
# the best file by quality (hd > sd > other) and codec preference.
1103 # Extract ID from URL
1104 mobj = re.match(self._VALID_URL, url)
1106 raise ExtractorError(u'Invalid URL: %s' % url)
1108 video_id = mobj.group('id')
1109 if not mobj.group('proto'):
1110 url = 'https://' + url
1111 if mobj.group('direct_link') or mobj.group('pro'):
1112 url = 'https://vimeo.com/' + video_id
1114 # Retrieve video webpage to extract further information
1115 request = compat_urllib_request.Request(url, None, std_headers)
1116 webpage = self._download_webpage(request, video_id)
1118 # Now we begin extracting as much information as we can from what we
1119 # retrieved. First we extract the information common to all extractors,
1120 # and latter we extract those that are Vimeo specific.
1121 self.report_extraction(video_id)
1123 # Extract the config JSON
# Slice the JS assignment between ' = {config:' and ',assets:' to get raw JSON.
1125 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1126 config = json.loads(config)
1128 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1129 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1131 raise ExtractorError(u'Unable to extract info section')
1134 video_title = config["video"]["title"]
1136 # Extract uploader and uploader_id
# uploader_id is the last path segment of the owner URL, when present.
1137 video_uploader = config["video"]["owner"]["name"]
1138 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1140 # Extract video thumbnail
1141 video_thumbnail = config["video"]["thumbnail"]
1143 # Extract video description
1144 video_description = get_element_by_attribute("itemprop", "description", webpage)
1145 if video_description: video_description = clean_html(video_description)
1146 else: video_description = u''
1148 # Extract upload date
1149 video_upload_date = None
1150 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1151 if mobj is not None:
1152 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1154 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters of the play_redirect URL below.
1155 sig = config['request']['signature']
1156 timestamp = config['request']['timestamp']
1158 # Vimeo specific: extract video codec and quality information
1159 # First consider quality, then codecs, then take everything
1160 # TODO bind to format param
1161 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1162 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by the qualities the config advertises.
1163 for codec_name, codec_extension in codecs:
1164 if codec_name in config["video"]["files"]:
1165 if 'hd' in config["video"]["files"][codec_name]:
1166 files['hd'].append((codec_name, codec_extension, 'hd'))
1167 elif 'sd' in config["video"]["files"][codec_name]:
1168 files['sd'].append((codec_name, codec_extension, 'sd'))
1170 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Take the first (most preferred) entry from the best non-empty bucket.
1172 for quality in ('hd', 'sd', 'other'):
1173 if len(files[quality]) > 0:
1174 video_quality = files[quality][0][2]
1175 video_codec = files[quality][0][0]
1176 video_extension = files[quality][0][1]
1177 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1180 raise ExtractorError(u'No known codec found')
1182 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1183 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1188 'uploader': video_uploader,
1189 'uploader_id': video_uploader_id,
1190 'upload_date': video_upload_date,
1191 'title': video_title,
1192 'ext': video_extension,
1193 'thumbnail': video_thumbnail,
1194 'description': video_description,
1198 class ArteTvIE(InfoExtractor):
1199 """arte.tv information extractor."""
1201 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1202 _LIVE_URL = r'index-[0-9]+\.html$'
1204 IE_NAME = u'arte.tv'
1206 def fetch_webpage(self, url):
# Download a page with compat urllib, wrapping network and URL errors
# in ExtractorError.
1207 request = compat_urllib_request.Request(url)
1209 self.report_download_webpage(url)
1210 webpage = compat_urllib_request.urlopen(request).read()
1211 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1212 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1213 except ValueError as err:
1214 raise ExtractorError(u'Invalid URL: %s' % url)
1217 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex` with `regexFlags`, and return a dict mapping
# each (group_index, key, error_message) tuple to the captured group;
# raises ExtractorError with the tuple's message when a group is missing.
1218 page = self.fetch_webpage(url)
1219 mobj = re.search(regex, page, regexFlags)
1223 raise ExtractorError(u'Invalid URL: %s' % url)
1225 for (i, key, err) in matchTuples:
1226 if mobj.group(i) is None:
1227 raise ExtractorError(err)
1229 info[key] = mobj.group(i)
1233 def extractLiveStream(self, url):
# Resolve the live-stream SWF/rtmp pieces: first locate the videothek JS,
# then grep the geo-restricted FR/DE stream path, player SWF and base URL.
1234 video_lang = url.split('/')[-4]
1235 info = self.grep_webpage(
1237 r'src="(.*?/videothek_js.*?\.js)',
1240 (1, 'url', u'Invalid URL: %s' % url)
1243 http_host = url.split('/')[2]
1244 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1245 info = self.grep_webpage(
1247 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1248 '(http://.*?\.swf).*?' +
1252 (1, 'path', u'could not extract video path: %s' % url),
1253 (2, 'player', u'could not extract video player: %s' % url),
1254 (3, 'url', u'could not extract video url: %s' % url)
1257 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1259 def extractPlus7Stream(self, url):
# Follow the Plus7 indirection chain: movie param -> language-specific
# <video ref> -> final XML with id/title/date and the HD url.
1260 video_lang = url.split('/')[-3]
1261 info = self.grep_webpage(
1263 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1266 (1, 'url', u'Invalid URL: %s' % url)
1269 next_url = compat_urllib_parse.unquote(info.get('url'))
1270 info = self.grep_webpage(
1272 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1275 (1, 'url', u'Could not find <video> tag: %s' % url)
1278 next_url = compat_urllib_parse.unquote(info.get('url'))
1280 info = self.grep_webpage(
1282 r'<video id="(.*?)".*?>.*?' +
1283 '<name>(.*?)</name>.*?' +
1284 '<dateVideo>(.*?)</dateVideo>.*?' +
1285 '<url quality="hd">(.*?)</url>',
1288 (1, 'id', u'could not extract video id: %s' % url),
1289 (2, 'title', u'could not extract video title: %s' % url),
1290 (3, 'date', u'could not extract video date: %s' % url),
1291 (4, 'url', u'could not extract video url: %s' % url)
1296 'id': info.get('id'),
1297 'url': compat_urllib_parse.unquote(info.get('url')),
1298 'uploader': u'arte.tv',
1299 'upload_date': unified_strdate(info.get('date')),
1300 'title': info.get('title').decode('utf-8'),
1306 def _real_extract(self, url):
# Dispatch on the URL shape: live-stream index pages vs regular Plus7 pages.
1307 video_id = url.split('/')[-1]
1308 self.report_extraction(video_id)
1310 if re.search(self._LIVE_URL, video_id) is not None:
1311 self.extractLiveStream(url)
1314 info = self.extractPlus7Stream(url)
1319 class GenericIE(InfoExtractor):
1320 """Generic last-resort information extractor."""
1323 IE_NAME = u'generic'
1325 def report_download_webpage(self, video_id):
1326 """Report webpage download."""
# Warn that we fell back to the generic extractor, except in test mode.
1327 if not self._downloader.params.get('test', False):
1328 self._downloader.report_warning(u'Falling back on generic information extractor.')
1329 super(GenericIE, self).report_download_webpage(video_id)
1331 def report_following_redirect(self, new_url):
1332 """Report information extraction."""
1333 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1335 def _test_redirect(self, url):
1336 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Issue a HEAD request (falling back to GET on 405) through a custom
# opener so redirects are followed without downloading the body.
1337 class HeadRequest(compat_urllib_request.Request):
1338 def get_method(self):
1341 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1343 Subclass the HTTPRedirectHandler to make it use our
1344 HeadRequest also on the redirected URL
1346 def redirect_request(self, req, fp, code, msg, headers, newurl):
1347 if code in (301, 302, 303, 307):
1348 newurl = newurl.replace(' ', '%20')
# Drop body-related headers: the redirected HEAD carries no payload.
1349 newheaders = dict((k,v) for k,v in req.headers.items()
1350 if k.lower() not in ("content-length", "content-type"))
1351 return HeadRequest(newurl,
1353 origin_req_host=req.get_origin_req_host(),
1356 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1358 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1360 Fallback to GET if HEAD is not allowed (405 HTTP error)
1362 def http_error_405(self, req, fp, code, msg, headers):
1366 newheaders = dict((k,v) for k,v in req.headers.items()
1367 if k.lower() not in ("content-length", "content-type"))
1368 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1370 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers we need (no cookies, etc.).
1374 opener = compat_urllib_request.OpenerDirector()
1375 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1376 HTTPMethodFallback, HEADRedirectHandler,
1377 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1378 opener.add_handler(handler())
1380 response = opener.open(HeadRequest(url))
1381 if response is None:
1382 raise ExtractorError(u'Invalid URL protocol')
1383 new_url = response.geturl()
1388 self.report_following_redirect(new_url)
1391 def _real_extract(self, url):
# Resolve shortener redirects, then scan the page with progressively
# broader regexes (JW Player SWFObject, file/source params, JS loader,
# Twitter cards) for a direct media URL.
1392 new_url = self._test_redirect(url)
1393 if new_url: return [self.url_result(new_url)]
1395 video_id = url.split('/')[-1]
1397 webpage = self._download_webpage(url, video_id)
1398 except ValueError as err:
1399 # since this is the last-resort InfoExtractor, if
1400 # this error is thrown, it'll be thrown here
1401 raise ExtractorError(u'Invalid URL: %s' % url)
1403 self.report_extraction(video_id)
1404 # Start with something easy: JW Player in SWFObject
1405 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1407 # Broaden the search a little bit
1408 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1410 # Broaden the search a little bit: JWPlayer JS loader
1411 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1413 # Try to find twitter cards info
1414 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1416 raise ExtractorError(u'Invalid URL: %s' % url)
1418 # It's possible that one of the regexes
1419 # matched, but returned an empty group:
1420 if mobj.group(1) is None:
1421 raise ExtractorError(u'Invalid URL: %s' % url)
1423 video_url = compat_urllib_parse.unquote(mobj.group(1))
1424 video_id = os.path.basename(video_url)
1426 # here's a fun little line of code for you:
# Derive the extension and strip it from the id in two splitext passes.
1427 video_extension = os.path.splitext(video_id)[1][1:]
1428 video_id = os.path.splitext(video_id)[0]
1430 # it's tempting to parse this further, but you would
1431 # have to take into account all the variations like
1432 # Video Title - Site Name
1433 # Site Name | Video Title
1434 # Video Title - Tagline | Site Name
1435 # and so on and so forth; it's just not practical
1436 video_title = self._html_search_regex(r'<title>(.*)</title>',
1437 webpage, u'video title')
1439 # video uploader is domain name
1440 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1441 url, u'video uploader')
1446 'uploader': video_uploader,
1447 'upload_date': None,
1448 'title': video_title,
1449 'ext': video_extension,
1453 class YoutubeSearchIE(SearchInfoExtractor):
1454 """Information Extractor for YouTube search queries."""
1455 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1457 IE_NAME = u'youtube:search'
1458 _SEARCH_KEY = 'ytsearch'
1460 def report_download_page(self, query, pagenum):
1461 """Report attempt to download search page with given number."""
1462 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1464 def _get_n_results(self, query, n):
1465 """Get a specified number of results for a query"""
# Page through the GData API 50 results at a time until we have n ids
# (or the API's totalItems caps us lower), then wrap them as url_results.
1471 while (50 * pagenum) < limit:
1472 self.report_download_page(query, pagenum+1)
# start-index in the GData API is 1-based, hence the +1.
1473 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1474 request = compat_urllib_request.Request(result_url)
1476 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1477 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1478 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1479 api_response = json.loads(data)['data']
1481 if not 'items' in api_response:
1482 raise ExtractorError(u'[youtube] No video results')
1484 new_ids = list(video['id'] for video in api_response['items'])
1485 video_ids += new_ids
# Never ask for more than the API says exists.
1487 limit = min(n, api_response['totalItems'])
1490 if len(video_ids) > n:
1491 video_ids = video_ids[:n]
1492 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1493 return self.playlist_result(videos, query)
1496 class GoogleSearchIE(SearchInfoExtractor):
1497 """Information Extractor for Google Video search queries."""
# Presence of the "next" pagination link tells us more pages exist.
1498 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1500 IE_NAME = u'video.google:search'
1501 _SEARCH_KEY = 'gvsearch'
1503 def _get_n_results(self, query, n):
1504 """Get a specified number of results for a query"""
# Scrape the Google Video search result pages (10 hits per page) and
# collect each hit's href as a playlist entry until n is reached or
# there is no next page.
1507 '_type': 'playlist',
1512 for pagenum in itertools.count(1):
1513 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1514 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1515 note='Downloading result page ' + str(pagenum))
1517 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1520 'url': mobj.group(1)
1522 res['entries'].append(e)
1524 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1527 class YahooSearchIE(SearchInfoExtractor):
1528 """Information Extractor for Yahoo! Video search queries."""
1531 IE_NAME = u'screen.yahoo:search'
1532 _SEARCH_KEY = 'yvsearch'
1534 def _get_n_results(self, query, n):
1535 """Get a specified number of results for a query"""
# Query the Yahoo video search JSON endpoint 30 results per page and
# turn each screen.yahoo.com result URL into a playlist entry.
1538 '_type': 'playlist',
1542 for pagenum in itertools.count(0):
1543 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1544 webpage = self._download_webpage(result_url, query,
1545 note='Downloading results page '+str(pagenum+1))
1546 info = json.loads(webpage)
1548 results = info[u'results']
1550 for (i, r) in enumerate(results):
# Stop once we have collected n entries across pages.
1551 if (pagenum * 30) +i >= n:
1553 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1554 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1555 res['entries'].append(e)
# NOTE(review): `m` is bound on a line not visible in this excerpt
# (presumably m = info[u'm'] pagination metadata) — verify.
1556 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1563 class YoutubePlaylistIE(InfoExtractor):
1564 """Information Extractor for YouTube playlists."""
# Verbose regex: accepts playlist/view/course/artist URLs with p/a/list
# params as well as bare PL/EC/UU playlist ids.
1565 _VALID_URL = r"""(?:
1570 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1571 \? (?:.*?&)*? (?:p|a|list)=
1574 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1577 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1579 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1581 IE_NAME = u'youtube:playlist'
1584 def suitable(cls, url):
1585 """Receives a URL and returns True if suitable for this IE."""
# Must override the base class: _VALID_URL here needs re.VERBOSE.
1586 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1588 def _real_extract(self, url):
# Page through the GData playlist feed collecting (position, video_url)
# pairs, then sort by position and emit a playlist result.
1589 # Extract playlist id
1590 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1592 raise ExtractorError(u'Invalid URL: %s' % url)
1594 # Download playlist videos from API
# Either alternative of _VALID_URL may have matched the id.
1595 playlist_id = mobj.group(1) or mobj.group(2)
1600 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1601 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1604 response = json.loads(page)
1605 except ValueError as err:
1606 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1608 if 'feed' not in response:
1609 raise ExtractorError(u'Got a malformed response from YouTube API')
1610 playlist_title = response['feed']['title']['$t']
1611 if 'entry' not in response['feed']:
1612 # Number of videos is a multiple of self._MAX_RESULTS
# Entries without 'content' (e.g. deleted videos) are skipped.
1615 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1616 for entry in response['feed']['entry']
1617 if 'content' in entry ]
# A short page means this was the last one.
1619 if len(response['feed']['entry']) < self._MAX_RESULTS:
1623 videos = [v[1] for v in sorted(videos)]
1625 url_results = [self.url_result(url, 'Youtube') for url in videos]
1626 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1629 class YoutubeChannelIE(InfoExtractor):
1630 """Information Extractor for YouTube channels."""
1632 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1633 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Marker string whose presence means more pages can be loaded.
1634 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1635 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1636 IE_NAME = u'youtube:channel'
1638 def extract_videos_from_page(self, page):
# Collect unique video ids from watch links in the given HTML fragment,
# preserving first-seen order.
1640 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1641 if mobj.group(1) not in ids_in_page:
1642 ids_in_page.append(mobj.group(1))
1645 def _real_extract(self, url):
# Scrape the first channel page, then keep fetching the JSON-based
# channel_ajax endpoint while the load-more marker is present.
1646 # Extract channel id
1647 mobj = re.match(self._VALID_URL, url)
1649 raise ExtractorError(u'Invalid URL: %s' % url)
1651 # Download channel page
1652 channel_id = mobj.group(1)
1656 url = self._TEMPLATE_URL % (channel_id, pagenum)
1657 page = self._download_webpage(url, channel_id,
1658 u'Downloading page #%s' % pagenum)
1660 # Extract video identifiers
1661 ids_in_page = self.extract_videos_from_page(page)
1662 video_ids.extend(ids_in_page)
1664 # Download any subsequent channel pages using the json-based channel_ajax query
1665 if self._MORE_PAGES_INDICATOR in page:
1667 pagenum = pagenum + 1
1669 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1670 page = self._download_webpage(url, channel_id,
1671 u'Downloading page #%s' % pagenum)
1673 page = json.loads(page)
1675 ids_in_page = self.extract_videos_from_page(page['content_html'])
1676 video_ids.extend(ids_in_page)
# Stop when the widget HTML no longer offers a load-more control.
1678 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1681 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1683 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1684 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1685 return [self.playlist_result(url_entries, channel_id)]
1688 class YoutubeUserIE(InfoExtractor):
1689 """Information Extractor for YouTube users."""
1691 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1692 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1693 _GDATA_PAGE_SIZE = 50
1694 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1695 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1696 IE_NAME = u'youtube:user'
1698 def _real_extract(self, url):
# Page through a user's uploads via the GData feed, scraping video ids
# with _VIDEO_INDICATOR, and return them as a playlist titled after the user.
1700 mobj = re.match(self._VALID_URL, url)
1702 raise ExtractorError(u'Invalid URL: %s' % url)
1704 username = mobj.group(1)
1706 # Download video ids using YouTube Data API. Result size per
1707 # query is limited (currently to 50 videos) so we need to query
1708 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1715 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1717 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1718 page = self._download_webpage(gdata_url, username,
1719 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1721 # Extract video identifiers
1724 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1725 if mobj.group(1) not in ids_in_page:
1726 ids_in_page.append(mobj.group(1))
1728 video_ids.extend(ids_in_page)
1730 # A little optimization - if current page is not
1731 # "full", i.e. does not contain PAGE_SIZE video ids then
1732 # we can assume that this page is the last one - there
1733 # are no more ids on further pages - no need to query
1736 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1741 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1742 url_results = [self.url_result(url, 'Youtube') for url in urls]
1743 return [self.playlist_result(url_results, playlist_title = username)]
1746 class BlipTVUserIE(InfoExtractor):
1747 """Information Extractor for blip.tv users."""
1749 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1751 IE_NAME = u'blip.tv:user'
1753 def _real_extract(self, url):
# Look up the numeric users_id on the user page, then page through the
# mobile full-episode-list AJAX endpoint collecting video hrefs.
1755 mobj = re.match(self._VALID_URL, url)
1757 raise ExtractorError(u'Invalid URL: %s' % url)
1759 username = mobj.group(1)
1761 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1763 page = self._download_webpage(url, username, u'Downloading user page')
1764 mobj = re.search(r'data-users-id="([^"]+)"', page)
1765 page_base = page_base % mobj.group(1)
1768 # Download video ids using BlipTV Ajax calls. Result size per
1769 # query is limited (currently to 12 videos) so we need to query
1770 # page by page until there are no video ids - it means we got
1777 url = page_base + "&page=" + str(pagenum)
1778 page = self._download_webpage(url, username,
1779 u'Downloading video ids from page %d' % pagenum)
1781 # Extract video identifiers
1784 for mobj in re.finditer(r'href="/([^"]+)"', page):
1785 if mobj.group(1) not in ids_in_page:
1786 ids_in_page.append(unescapeHTML(mobj.group(1)))
1788 video_ids.extend(ids_in_page)
1790 # A little optimization - if current page is not
1791 # "full", i.e. does not contain PAGE_SIZE video ids then
1792 # we can assume that this page is the last one - there
1793 # are no more ids on further pages - no need to query
1796 if len(ids_in_page) < self._PAGE_SIZE:
1801 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1802 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1803 return [self.playlist_result(url_entries, playlist_title = username)]
1806 class DepositFilesIE(InfoExtractor):
1807 """Information extractor for depositfiles.com"""
1809 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1811 def _real_extract(self, url):
# Force the English locale URL, POST the "Free download" form, then pull
# the real fileshare URL (or a human-readable restriction message) from
# the response.
1812 file_id = url.split('/')[-1]
1813 # Rebuild url in english locale
1814 url = 'http://depositfiles.com/en/files/' + file_id
1816 # Retrieve file webpage with 'Free download' button pressed
1817 free_download_indication = { 'gateway_result' : '1' }
1818 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1820 self.report_download_webpage(file_id)
1821 webpage = compat_urllib_request.urlopen(request).read()
1822 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1823 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1825 # Search for the real file URL
1826 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1827 if (mobj is None) or (mobj.group(1) is None):
1828 # Try to figure out reason of the error.
# The site embeds the restriction reason in an <strong>Attention...</strong> block.
1829 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1830 if (mobj is not None) and (mobj.group(1) is not None):
1831 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1832 raise ExtractorError(u'%s' % restriction_message)
1834 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1836 file_url = mobj.group(1)
1837 file_extension = os.path.splitext(file_url)[1][1:]
1839 # Search for file title
1840 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') below is Python 2 only — verify runtime.
1843 'id': file_id.decode('utf-8'),
1844 'url': file_url.decode('utf-8'),
1846 'upload_date': None,
1847 'title': file_title,
1848 'ext': file_extension.decode('utf-8'),
1852 class FacebookIE(InfoExtractor):
1853 """Information Extractor for Facebook"""
1855 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1856 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1857 _NETRC_MACHINE = 'facebook'
1858 IE_NAME = u'facebook'
1860 def report_login(self):
1861 """Report attempt to log in."""
1862 self.to_screen(u'Logging in')
1864 def _real_initialize(self):
# Optional login step: use --username/--password or .netrc credentials;
# on failure only warn, so extraction of public videos still proceeds.
1865 if self._downloader is None:
1870 downloader_params = self._downloader.params
1872 # Attempt to use provided username and password or .netrc data
1873 if downloader_params.get('username', None) is not None:
1874 useremail = downloader_params['username']
1875 password = downloader_params['password']
1876 elif downloader_params.get('usenetrc', False):
1878 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1879 if info is not None:
1883 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1884 except (IOError, netrc.NetrcParseError) as err:
1885 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1888 if useremail is None:
1897 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1900 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> in the response means authentication did not succeed.
1901 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1902 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1904 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1905 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1908 def _real_extract(self, url):
# Parse the SWF variable table embedded in the page JS, URL-decode the
# 'params' blob, and prefer the HD source over the SD one.
1909 mobj = re.match(self._VALID_URL, url)
1911 raise ExtractorError(u'Invalid URL: %s' % url)
1912 video_id = mobj.group('ID')
1914 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1915 webpage = self._download_webpage(url, video_id)
# Anchor the JSON data between these two literal JS fragments.
1917 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1918 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1919 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1921 raise ExtractorError(u'Cannot parse data')
1922 data = dict(json.loads(m.group(1)))
1923 params_raw = compat_urllib_parse.unquote(data['params'])
1924 params = json.loads(params_raw)
1925 video_data = params['video_data'][0]
1926 video_url = video_data.get('hd_src')
1928 video_url = video_data['sd_src']
1930 raise ExtractorError(u'Cannot find video URL')
1931 video_duration = int(video_data['video_duration'])
1932 thumbnail = video_data['thumbnail_src']
1934 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1939 'title': video_title,
1942 'duration': video_duration,
1943 'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    NOTE(review): this chunk is an elided extract — interleaved source
    lines (try headers, guards, info-dict openings, returns) are missing;
    inline notes mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Regex used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect; the real file id is in the redirect's
            # URL fragment. Re-enter extraction with the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # NOTE(review): computation of `cchar` ('?' vs '&') is elided here.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # NOTE(review): `info = None` / `try:` header elided before this call.
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # NOTE(review): the direct-download info-dict opening is elided.
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # NOTE(review): `try:` header elided before the read below.
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

        # NOTE(review): `try:` header elided; the except at the end of the
        # method pairs with the JSON parsing below.
        json_data = json.loads(json_code)
        if 'Post' in json_data:
            data = json_data['Post']
        # NOTE(review): `else: data = json_data` branch elided.

        # blip.tv timestamps look like '10-31-12 04:39PM'; normalize to YYYYMMDD.
        upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
        video_url = data['media']['url']
        umobj = re.match(self._URL_EXT, video_url)
        # NOTE(review): `if umobj is None:` guard elided before this raise.
        raise ValueError('Can not determine filename extension')
        ext = umobj.group(1)

        # NOTE(review): the info-dict opening (`info = {`) is elided.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
        except (ValueError,KeyError) as err:
            raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    NOTE(review): this chunk is an elided extract — interleaved source
    lines are missing; inline notes mark the visible gaps.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """RC4-decrypt `data` with `key` (standard KSA + PRGA).

        NOTE(review): several lines (x/y/out initialisation, the PRGA loop
        header, and the return) are elided in this extract.
        """
        box = list(range(256))
        for i in list(range(256)):
            # Key-scheduling algorithm (KSA).
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # Pseudo-random generation (PRGA) — loop header elided above.
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])

    # NOTE(review): this return belongs to an elided `def __md5(self, s):`.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # NOTE(review): `GK = (` opening of this base64 key blob is elided.
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            # Plain <source> tag found: simple (non-encrypted) flv case.
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            # NOTE(review): continuation arguments of this call are elided.
            video_title = self._html_search_regex('<title>([^<]+)</title>',

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            # NOTE(review): the returned info-dict opening is elided.
                'upload_date': None,
                'title': video_title,

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Unable to extract video')

        # NOTE(review): `params = {}` / `sec = mobj.group(1)` lines elided.
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                # NOTE(review): `params[a] = b` line elided.
            # else-branch: the encrypted XML endpoint itself.
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            # NOTE(review): `xmldata_url = (` opening elided; rebuilt URL below.
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            # NOTE(review): `) % video_id` closing elided.

        # Response is `...=<hex>`; keep only the hex payload after '='.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # NOTE(review): `sk = self.__md5(` opening elided — RC4 key is
        # md5(md5(b64decode(b64decode(GK))) + video_id).
            base64.b64decode(base64.b64decode(GK)) +
            # NOTE(review): `self.__md5(` inner-call line elided here.
                str(video_id).encode('utf-8')
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        # NOTE(review): `video_url = None` initialisation elided.
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        # NOTE(review): `if mobj:` guard elided; rtmp branch follows.
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        # NOTE(review): `if not video_url:` guard elided.
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            # NOTE(review): `if mobj is None:` guard elided before this raise.
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            # rtmp play_path wants "<ext>:<path>".
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        # NOTE(review): else-branch header and `video_filepath` lines elided.
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        # NOTE(review): continuation arguments of this call are elided.
        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",

        # NOTE(review): the returned info-dict opening is elided.
            'tc_url': video_url,
            'upload_date': None,
            'title': video_title,
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    NOTE(review): this chunk is an elided extract — interleaved source
    lines are missing; inline notes mark the visible gaps.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing `$"""` of this verbose regex is elided.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Bitrates the site offers, lowest quality last in _print_formats order.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the bodies of these two mapping literals are elided.
    _video_extensions = {
    _video_dimensions = {

    # NOTE(review): the `@classmethod` decorator line is elided above.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Override the base class: _VALID_URL here needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        # NOTE(review): `for x in formats:` loop header elided.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # NOTE(review): `else:` line elided.
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # NOTE(review): `else:` line elided.
                epTitle = mobj.group('cntitle')
            # NOTE(review): `dlNewest = False` / `else:` lines elided.
            dlNewest = not mobj.group('episode')
            # NOTE(review): `if dlNewest:` guard elided.
                epTitle = mobj.group('showname')
            # NOTE(review): `else:` line elided.
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        # NOTE(review): `if dlNewest:` guard elided — re-parse the redirect.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # NOTE(review): `if mobj is None:` guard elided before this raise.
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            # NOTE(review): `else:` line elided.
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        # NOTE(review): `results = []` initialisation elided.

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # NOTE(review): `turls = []` initialisation elided.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # NOTE(review): `turls.append(finfo)` line elided.

            # NOTE(review): `if len(turls) == 0:` guard elided.
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                # NOTE(review): `continue` line elided.

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # NOTE(review): `return` line elided.

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # NOTE(review): `for f,v in turls:` / `if f == req_format:` elided.
                format, rtmp_video_url = f, v

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # NOTE(review): `if not m:` guard elided before this raise.
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # NOTE(review): the per-part info-dict opening is elided.
                'upload_date': officialDate,
                'description': officialTitle,
            results.append(info)
        # NOTE(review): `return results` line elided.
2365 class EscapistIE(InfoExtractor):
2366 """Information extractor for The Escapist """
2368 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2369 IE_NAME = u'escapist'
2371 def _real_extract(self, url):
2372 mobj = re.match(self._VALID_URL, url)
2374 raise ExtractorError(u'Invalid URL: %s' % url)
2375 showName = mobj.group('showname')
2376 videoId = mobj.group('episode')
2378 self.report_extraction(videoId)
2379 webpage = self._download_webpage(url, videoId)
2381 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2382 webpage, u'description', fatal=False)
2384 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2385 webpage, u'thumbnail', fatal=False)
2387 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2388 webpage, u'player url')
2390 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2391 webpage, u'player url').split(' : ')[-1]
2393 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2394 configUrl = compat_urllib_parse.unquote(configUrl)
2396 configJSON = self._download_webpage(configUrl, videoId,
2397 u'Downloading configuration',
2398 u'unable to download configuration')
2400 # Technically, it's JavaScript, not JSON
2401 configJSON = configJSON.replace("'", '"')
2404 config = json.loads(configJSON)
2405 except (ValueError,) as err:
2406 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2408 playlist = config['playlist']
2409 videoUrl = playlist[1]['url']
2414 'uploader': showName,
2415 'upload_date': None,
2418 'thumbnail': imgUrl,
2419 'description': videoDesc,
2420 'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    NOTE(review): this chunk is an elided extract — interleaved source
    lines (try headers, guards, info-dict opening, return) are missing;
    inline notes mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): `info = {` opening elided; built incrementally below.
            'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # NOTE(review): `try:` header elided before this download.
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # NOTE(review): `try:` header elided; IndexError handled below.
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        # NOTE(review): `except IndexError:` line elided before this raise.
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore param is required for the Adobe HDS manifest to respond.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # NOTE(review): `try:` header elided before this download.
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # NOTE(review): `try:` header elided; IndexError handled below.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # Build the first-fragment URL from the manifest's id and media url.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        # NOTE(review): remaining assignments and the return are elided.
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    NOTE(review): this chunk is an elided extract — interleaved source
    lines (guards, info-dict opening, return) are missing; inline notes
    mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title — NOTE(review): continuation arguments elided.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the info-dict opening and the final return are elided.
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

       NOTE(review): this chunk is an elided extract — interleaved source
       lines are missing; inline notes mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        # NOTE(review): the returned info-dict opening (id/url/ext keys)
        # and the return itself are elided.
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

       NOTE(review): this chunk is an elided extract — interleaved source
       lines are missing; inline notes mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        # NOTE(review): `videos = []` initialisation elided.
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            # NOTE(review): `return` line elided.

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # NOTE(review): `videos.append({` opening elided; per-track
            # id/url/ext keys missing from view.
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'description': track['description'],
        # NOTE(review): `return videos` line elided.
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    NOTE(review): this chunk is an elided extract — interleaved source
    lines (guards, info-dict opening, return) are missing; inline notes
    mark the visible gaps.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL — the page embeds a base64-encoded rtmp id.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title — NOTE(review): continuation arguments elided.
        video_title = self._search_regex(r'contentTitle = "(.*?)";',

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # NOTE(review): the info-dict opening and the final return are elided.
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    NOTE(review): this chunk is an elided extract — interleaved source
    lines are missing; inline notes mark the visible gaps.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): `try:` header elided before the lookups below.
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # NOTE(review): `return url_list` line elided.

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # NOTE(review): `try:` header elided; the first URL that opens
            # without error is returned (return line also elided).
                compat_urllib_request.urlopen(url)
                # NOTE(review): `return url` line elided.
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # NOTE(review): `url = None` / fallthrough lines elided.

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # NOTE(review): `try:` header elided before this lookup.
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    # NOTE(review): `break` line elided.

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # NOTE(review): `try:` header elided before this download.
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # NOTE(review): `return` line elided.

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # NOTE(review): `break` line elided.
        # NOTE(review): `else:` line elided — explicit format requested.
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): `return [{` opening elided before this info dict.
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video, a course page (returns
    references to its videos), and the root page (returns references to
    all courses).

    NOTE(review): this chunk is an elided extract — interleaved source
    lines are missing; inline notes mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): `info = {` opening elided.
                'id': course + '_' + video,
                'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # NOTE(review): `try:` header elided before this download.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # NOTE(review): `try:` header elided; IndexError handled below.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            # NOTE(review): `except IndexError:` line elided before this raise.
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            # NOTE(review): `return [info]` line elided.
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE(review): `info = {` opening (id/type keys) elided.
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # NOTE(review): `info['list'] = [{` comprehension opening elided.
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # NOTE(review): `results = []` initialisation elided.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # NOTE(review): `return results` line elided.
        else: # Root page
            # NOTE(review): `info = {` opening elided.
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # NOTE(review): `try:` header elided before this download.
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # NOTE(review): `info['list'] = [{` comprehension opening elided.
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            # NOTE(review): `results = []` initialisation elided.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # NOTE(review): `return results` line elided.
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    NOTE(review): this chunk is an elided extract — interleaved source
    lines are missing; inline notes mark the visible gaps.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        # NOTE(review): continuation arguments of this call are elided.
        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # NOTE(review): `try:` header elided before this download.
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # NOTE(review): `try:` header elided; KeyError/AttributeError from
        # the attribute lookups is handled by the raise below.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        # NOTE(review): `except KeyError:` line elided before this raise.
            raise ExtractorError('Invalid rendition field.')

        # NOTE(review): the info-dict opening (id/url/ext keys) and the
        # final return are elided; `performer` is assigned in elided lines.
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    NOTE(review): this chunk is an elided extract — interleaved source
    lines (several def headers, initialisations, format-selection
    branches) are missing; inline notes mark the visible gaps.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    # NOTE(review): `def _gen_sid(self):` header elided above — generates a
    # pseudo-random session id of the form "<millis><rand><rand>".
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive Youku's seeded character-shuffle table from `seed`."""
        # NOTE(review): `mixed = []` initialisation elided.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # Linear-congruential shuffle: pick-and-remove one source char per step.
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        # NOTE(review): `return mixed` line elided.

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated fileId via the seeded mix table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): `realId = []` and the `for ch in ids:` header elided.
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        # NOTE(review): `try:` header elided; the except below pairs with it.
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    # NOTE(review): format-choice assignments elided here.
            elif format == 'worst':
                    # NOTE(review): format-choice assignments elided here.

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        # NOTE(review): `files_info = []` initialisation elided.
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Patch the two segment-number hex digits into the fileid.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # NOTE(review): `info = {` opening elided.
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
        # NOTE(review): `return files_info` line elided.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3051 class XNXXIE(InfoExtractor):
3052 """Information extractor for xnxx.com"""
3054 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL, title, and big-thumbnail URL.
3056 VIDEO_URL_RE = r'flv_url=(.*?)&'
3057 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3058 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3060 def _real_extract(self, url):
# Scrape the watch page directly: URL, title, thumbnail.
3061 mobj = re.match(self._VALID_URL, url)
3063 raise ExtractorError(u'Invalid URL: %s' % url)
3064 video_id = mobj.group(1)
3066 # Get webpage content
3067 webpage = self._download_webpage(url, video_id)
3069 video_url = self._search_regex(self.VIDEO_URL_RE,
3070 webpage, u'video URL')
# The flv URL is percent-encoded in the page source.
3071 video_url = compat_urllib_parse.unquote(video_url)
3073 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3076 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3077 webpage, u'thumbnail', fatal=False)
3083 'upload_date': None,
3084 'title': video_title,
3086 'thumbnail': video_thumbnail,
3087 'description': None,
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3091 class GooglePlusIE(InfoExtractor):
3092 """Information extractor for plus.google.com."""
3094 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3095 IE_NAME = u'plus.google'
3097 def _real_extract(self, url):
# Two-step extraction: scrape the post page for metadata, then the photo
# viewer page for the actual video links.
3098 # Extract id from URL
3099 mobj = re.match(self._VALID_URL, url)
3101 raise ExtractorError(u'Invalid URL: %s' % url)
3103 post_url = mobj.group(0)
3104 video_id = mobj.group(1)
3106 video_extension = 'flv'
3108 # Step 1, Retrieve post webpage to extract further information
3109 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3111 self.report_extraction(video_id)
3113 # Extract update date
3114 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3115 webpage, u'upload date', fatal=False)
3117 # Convert timestring to a format suitable for filename
# Re-format the scraped timestamp into the YYYYMMDD convention used by
# the 'upload_date' field.
3118 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3119 upload_date = upload_date.strftime('%Y%m%d')
3122 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3123 webpage, u'uploader', fatal=False)
3126 # Get the first line for title
3127 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3128 webpage, 'title', default=u'NA')
3130 # Step 2, Stimulate clicking the image box to launch video
3131 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3132 webpage, u'video page URL')
3133 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3135 # Extract video links on video page
3136 """Extract video links of all sizes"""
3137 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3138 mobj = re.findall(pattern, webpage)
3140 raise ExtractorError(u'Unable to extract video links')
3142 # Sort in resolution
# Tuples sort by their first element (the resolution number), so the last
# entry after sorting is the highest resolution.
3143 links = sorted(mobj)
3145 # Choose the lowest of the sort, i.e. highest resolution
3146 video_url = links[-1]
3147 # Only get the url. The resolution part in the tuple has no use anymore
3148 video_url = video_url[-1]
3149 # Treat escaped \u0026 style hex
3151 video_url = video_url.decode("unicode_escape")
3152 except AttributeError: # Python 3
# str has no .decode() on Python 3; round-trip through bytes instead.
3153 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3159 'uploader': uploader,
3160 'upload_date': upload_date,
3161 'title': video_title,
3162 'ext': video_extension,
# Information extractor for nba.com video pages.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3165 class NBAIE(InfoExtractor):
3166 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3169 def _real_extract(self, url):
3170 mobj = re.match(self._VALID_URL, url)
3172 raise ExtractorError(u'Invalid URL: %s' % url)
3174 video_id = mobj.group(1)
3176 webpage = self._download_webpage(url, video_id)
# The mp4 URL is derived directly from the path captured in _VALID_URL;
# the webpage is only used for title/description metadata.
3178 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3180 shortened_video_id = video_id.rpartition('/')[2]
3181 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3182 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3184 # It isn't there in the HTML it returns to us
3185 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3187 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3190 'id': shortened_video_id,
3194 # 'uploader_date': uploader_date,
3195 'description': description,
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3199 class JustinTVIE(InfoExtractor):
3200 """Information extractor for justin.tv and twitch.tv"""
3201 # TODO: One broadcast may be split into multiple videos. The key
3202 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3203 # starts at 1 and increases. Can we treat all parts as one video?
# The URL can name a whole channel, a single broadcast (/b/), or a
# chapter of a broadcast (/c/) — each handled separately below.
3205 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3207 (?P<channelid>[^/]+)|
3208 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3209 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when paging through a channel's archive API.
3213 _JUSTIN_PAGE_LIMIT = 100
3214 IE_NAME = u'justin.tv'
3216 def report_download_page(self, channel, offset):
3217 """Report attempt to download a single page of videos."""
3218 self.to_screen(u'%s: Downloading video information from %d to %d' %
3219 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3221 # Return count of items, list of *valid* items
3222 def _parse_page(self, url, video_id):
# Downloads one JSON page of clips and converts each clip into an info
# dict. Returns (number of clips in the response, list of info dicts).
3223 webpage = self._download_webpage(url, video_id,
3224 u'Downloading video info JSON',
3225 u'unable to download video info JSON')
3227 response = json.loads(webpage)
3228 if type(response) != list:
# A non-list response is an API error object.
3229 error_text = response.get('error', 'unknown error')
3230 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3232 for clip in response:
3233 video_url = clip['video_file_url']
3235 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip dashes from the date part -> YYYYMMDD.
3236 video_date = re.sub('-', '', clip['start_time'][:10])
3237 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3238 video_id = clip['id']
3239 video_title = clip.get('title', video_id)
3243 'title': video_title,
3244 'uploader': clip.get('channel_name', video_uploader_id),
3245 'uploader_id': video_uploader_id,
3246 'upload_date': video_date,
3247 'ext': video_extension,
3249 return (len(response), info)
3251 def _real_extract(self, url):
3252 mobj = re.match(self._VALID_URL, url)
3254 raise ExtractorError(u'invalid URL: %s' % url)
3256 api_base = 'http://api.justin.tv'
3258 if mobj.group('channelid'):
# Channel URL: page through the channel's full archive below.
3260 video_id = mobj.group('channelid')
3261 api = api_base + '/channel/archives/%s.json' % video_id
3262 elif mobj.group('chapterid'):
# Chapter URL: resolve the chapter to its parent archive, then fetch
# chapter metadata from the twitch kraken API.
3263 chapter_id = mobj.group('chapterid')
3265 webpage = self._download_webpage(url, chapter_id)
3266 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3268 raise ExtractorError(u'Cannot find archive of a chapter')
3269 archive_id = m.group(1)
3271 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3272 chapter_info_xml = self._download_webpage(api, chapter_id,
3273 note=u'Downloading chapter information',
3274 errnote=u'Chapter information download failed')
3275 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3276 for a in doc.findall('.//archive'):
3277 if archive_id == a.find('./id').text:
3280 raise ExtractorError(u'Could not find chapter in chapter information')
3282 video_url = a.find('./video_file_url').text
3283 video_ext = video_url.rpartition('.')[2] or u'flv'
3285 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3286 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3287 note='Downloading chapter metadata',
3288 errnote='Download of chapter metadata failed')
3289 chapter_info = json.loads(chapter_info_json)
3291 bracket_start = int(doc.find('.//bracket_start').text)
3292 bracket_end = int(doc.find('.//bracket_end').text)
3294 # TODO determine start (and probably fix up file)
3295 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3296 #video_url += u'?start=' + TODO:start_timestamp
3297 # bracket_start is 13290, but we want 51670615
# Chapter trimming is unimplemented; warn and download the full archive.
3298 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3299 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3302 'id': u'c' + chapter_id,
3305 'title': chapter_info['title'],
3306 'thumbnail': chapter_info['preview'],
3307 'description': chapter_info['description'],
3308 'uploader': chapter_info['channel']['display_name'],
3309 'uploader_id': chapter_info['channel']['name'],
# Plain broadcast URL: single archive lookup.
3313 video_id = mobj.group('videoid')
3314 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3316 self.report_extraction(video_id)
3320 limit = self._JUSTIN_PAGE_LIMIT
# Page through the API; a short page (count != limit) means we reached
# the end of the archive.
3323 self.report_download_page(video_id, offset)
3324 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3325 page_count, page_info = self._parse_page(page_url, video_id)
3326 info.extend(page_info)
3327 if not paged or page_count != limit:
# Information extractor for funnyordie.com.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3332 class FunnyOrDieIE(InfoExtractor):
3333 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3335 def _real_extract(self, url):
3336 mobj = re.match(self._VALID_URL, url)
3338 raise ExtractorError(u'invalid URL: %s' % url)
3340 video_id = mobj.group('id')
3341 webpage = self._download_webpage(url, video_id)
# The second <source> inside the <video> tag carries the usable URL.
3343 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3344 webpage, u'video URL', flags=re.DOTALL)
# Try the player headline first, then fall back to the <title> tag.
3346 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3347 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3349 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3350 webpage, u'description', fatal=False, flags=re.DOTALL)
3357 'description': video_description,
# Information extractor for store.steampowered.com game trailer pages.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3361 class SteamIE(InfoExtractor):
3362 _VALID_URL = r"""http://store\.steampowered\.com/
3364 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3366 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3368 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Pre-filled age-gate query string to bypass Steam's birth-date check.
3369 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3372 def suitable(cls, url):
3373 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL here is written in re.VERBOSE mode.
3374 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3376 def _real_extract(self, url):
3377 m = re.match(self._VALID_URL, url, re.VERBOSE)
3378 gameID = m.group('gameID')
3380 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3381 webpage = self._download_webpage(videourl, gameID)
3383 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
# Age-restricted page: re-fetch through the pre-confirmed agecheck URL.
3384 videourl = self._AGECHECK_TEMPLATE % gameID
3385 self.report_age_confirmation()
3386 webpage = self._download_webpage(videourl, gameID)
3388 self.report_extraction(gameID)
3389 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3390 webpage, 'game title')
# Three parallel scans over the page: movie URLs, titles, and thumbnails,
# zipped together positionally below.
3392 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3393 mweb = re.finditer(urlRE, webpage)
3394 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3395 titles = re.finditer(namesRE, webpage)
3396 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3397 thumbs = re.finditer(thumbsRE, webpage)
3399 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3400 video_id = vid.group('videoID')
3401 title = vtitle.group('videoName')
3402 video_url = vid.group('videoURL')
3403 video_thumb = thumb.group('thumbnail')
3405 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3410 'title': unescapeHTML(title),
3411 'thumbnail': video_thumb
3414 return [self.playlist_result(videos, gameID, game_title)]
# Information extractor for recorded ustream.tv videos.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3416 class UstreamIE(InfoExtractor):
3417 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3418 IE_NAME = u'ustream'
3420 def _real_extract(self, url):
3421 m = re.match(self._VALID_URL, url)
3422 video_id = m.group('videoID')
# The CDN URL is derived purely from the video id; the page is only
# scraped for title/uploader/thumbnail metadata.
3424 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3425 webpage = self._download_webpage(url, video_id)
3427 self.report_extraction(video_id)
3429 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3432 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3433 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3435 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3436 webpage, u'thumbnail', fatal=False)
3442 'title': video_title,
3443 'uploader': uploader,
3444 'thumbnail': thumbnail,
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3448 class WorldStarHipHopIE(InfoExtractor):
3449 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3450 IE_NAME = u'WorldStarHipHop'
3452 def _real_extract(self, url):
3453 m = re.match(self._VALID_URL, url)
3454 video_id = m.group('id')
3456 webpage_src = self._download_webpage(url, video_id)
# The player injects the file URL via so.addVariable("file", ...).
3458 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3459 webpage_src, u'video URL')
3461 if 'mp4' in video_url:
3466 video_title = self._html_search_regex(r"<title>(.*)</title>",
3467 webpage_src, u'title')
3469 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3470 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3471 webpage_src, u'thumbnail', fatal=False)
# WSHH "candy" pages keep the real title in a candytitles span instead.
3474 _title = r"""candytitles.*>(.*)</span>"""
3475 mobj = re.search(_title, webpage_src)
3476 if mobj is not None:
3477 video_title = mobj.group(1)
3482 'title' : video_title,
3483 'thumbnail' : thumbnail,
# Information extractor for rbmaradio.com shows.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3488 class RBMARadioIE(InfoExtractor):
3489 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3491 def _real_extract(self, url):
3492 m = re.match(self._VALID_URL, url)
3493 video_id = m.group('videoID')
3495 webpage = self._download_webpage(url, video_id)
# The show metadata is embedded in the page as a gon.show JSON object.
3497 json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
3498 webpage, u'json data')
3501 data = json.loads(json_data)
3502 except ValueError as e:
3503 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbit/s variant of the Akamai stream.
3505 video_url = data['akamai_url'] + '&cbr=256'
3506 url_parts = compat_urllib_parse_urlparse(video_url)
3507 video_ext = url_parts.path.rpartition('.')[2]
3512 'title': data['title'],
3513 'description': data.get('teaser_text'),
3514 'location': data.get('country_of_origin'),
3515 'uploader': data.get('host', {}).get('name'),
3516 'uploader_id': data.get('host', {}).get('slug'),
3517 'thumbnail': data.get('image', {}).get('large_url_2x'),
3518 'duration': data.get('duration'),
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3523 class YouPornIE(InfoExtractor):
3524 """Information extractor for youporn.com."""
3525 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3527 def _print_formats(self, formats):
3528 """Print all available formats"""
3529 print(u'Available formats:')
3530 print(u'ext\t\tformat')
3531 print(u'---------------------------------')
3532 for format in formats:
3533 print(u'%s\t\t%s' % (format['ext'], format['format']))
3535 def _specific(self, req_format, formats):
# Linear search for the entry whose 'format' equals the requested one.
3537 if(x["format"]==req_format):
3541 def _real_extract(self, url):
3542 mobj = re.match(self._VALID_URL, url)
3544 raise ExtractorError(u'Invalid URL: %s' % url)
3545 video_id = mobj.group('videoid')
# age_verified cookie bypasses the site's age gate.
3547 req = compat_urllib_request.Request(url)
3548 req.add_header('Cookie', 'age_verified=1')
3549 webpage = self._download_webpage(req, video_id)
3551 # Get JSON parameters
3552 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3554 params = json.loads(json_params)
3556 raise ExtractorError(u'Invalid JSON')
3558 self.report_extraction(video_id)
3560 video_title = params['title']
3561 upload_date = unified_strdate(params['release_date_f'])
3562 video_description = params['description']
3563 video_uploader = params['submitted_by']
3564 thumbnail = params['thumbnails'][0]['image']
3566 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3568 # Get all of the formats available
3569 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3570 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3571 webpage, u'download list').strip()
3573 # Get all of the links from the page
3574 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3575 links = re.findall(LINK_RE, download_list_html)
3576 if(len(links) == 0):
3577 raise ExtractorError(u'ERROR: no known formats available for video')
3579 self.to_screen(u'Links found: %d' % len(links))
3584 # A link looks like this:
3585 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3586 # A path looks like this:
3587 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Derive a format label like "480p-370k" from the 5th path component.
3588 video_url = unescapeHTML( link )
3589 path = compat_urllib_parse_urlparse( video_url ).path
3590 extension = os.path.splitext( path )[1][1:]
3591 format = path.split('/')[4].split('_')[:2]
3594 format = "-".join( format )
3595 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3600 'uploader': video_uploader,
3601 'upload_date': upload_date,
3602 'title': video_title,
3605 'thumbnail': thumbnail,
3606 'description': video_description
# Honour --list-formats, then resolve --format (best/worst/all/specific).
3609 if self._downloader.params.get('listformats', None):
3610 self._print_formats(formats)
3613 req_format = self._downloader.params.get('format', None)
3614 self.to_screen(u'Format: %s' % req_format)
3616 if req_format is None or req_format == 'best':
3618 elif req_format == 'worst':
3619 return [formats[-1]]
3620 elif req_format in ('-1', 'all'):
3623 format = self._specific( req_format, formats )
3625 raise ExtractorError(u'Requested format not available')
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3630 class PornotubeIE(InfoExtractor):
3631 """Information extractor for pornotube.com."""
3632 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3634 def _real_extract(self, url):
3635 mobj = re.match(self._VALID_URL, url)
3637 raise ExtractorError(u'Invalid URL: %s' % url)
# The title is taken from the URL itself, not from the page.
3639 video_id = mobj.group('videoid')
3640 video_title = mobj.group('title')
3642 # Get webpage content
3643 webpage = self._download_webpage(url, video_id)
3646 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3647 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3648 video_url = compat_urllib_parse.unquote(video_url)
3650 #Get the uploaded date
3651 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3652 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# Normalise the scraped date to YYYYMMDD when present (fatal=False above
# means it may be None).
3653 if upload_date: upload_date = unified_strdate(upload_date)
3655 info = {'id': video_id,
3658 'upload_date': upload_date,
3659 'title': video_title,
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3665 class YouJizzIE(InfoExtractor):
3666 """Information extractor for youjizz.com."""
3667 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3669 def _real_extract(self, url):
3670 mobj = re.match(self._VALID_URL, url)
3672 raise ExtractorError(u'Invalid URL: %s' % url)
3674 video_id = mobj.group('videoid')
3676 # Get webpage content
3677 webpage = self._download_webpage(url, video_id)
3679 # Get the video title
3680 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3681 webpage, u'title').strip()
3683 # Get the embed page
# The watch page only links to an embed page, which holds the real URL;
# note video_id is rebound to the embed page's numeric id below.
3684 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3686 raise ExtractorError(u'ERROR: unable to extract embed page')
3688 embed_page_url = result.group(0).strip()
3689 video_id = result.group('videoid')
3691 webpage = self._download_webpage(embed_page_url, video_id)
3694 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3695 webpage, u'video URL')
3697 info = {'id': video_id,
3699 'title': video_title,
3702 'player_url': embed_page_url}
# Information extractor for 8tracks.com mixes (playlists of tracks).
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3706 class EightTracksIE(InfoExtractor):
3708 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3710 def _real_extract(self, url):
3711 mobj = re.match(self._VALID_URL, url)
3713 raise ExtractorError(u'Invalid URL: %s' % url)
3714 playlist_id = mobj.group('id')
3716 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded in the page as a PAGE.mix JSON assignment.
3718 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3719 data = json.loads(json_like)
# A random session token is required by the play/next API endpoints.
3721 session = str(random.randint(0, 1000000000))
3723 track_count = data['tracks_count']
3724 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3725 next_url = first_url
# Walk the play -> next -> next chain until the API flags the last track.
3727 for i in itertools.count():
3728 api_json = self._download_webpage(next_url, playlist_id,
3729 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3730 errnote=u'Failed to download song information')
3731 api_data = json.loads(api_json)
3732 track_data = api_data[u'set']['track']
3734 'id': track_data['id'],
3735 'url': track_data['track_file_stream_url'],
3736 'title': track_data['performer'] + u' - ' + track_data['name'],
3737 'raw_title': track_data['name'],
3738 'uploader_id': data['user']['login'],
3742 if api_data['set']['at_last_track']:
3744 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Information extractor for keek.com.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3747 class KeekIE(InfoExtractor):
3748 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3751 def _real_extract(self, url):
3752 m = re.match(self._VALID_URL, url)
3753 video_id = m.group('videoID')
# Video and thumbnail URLs are derived from the id alone; the page is
# only scraped for title/uploader metadata.
3755 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3756 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3757 webpage = self._download_webpage(url, video_id)
3759 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3762 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3763 webpage, u'uploader', fatal=False)
3769 'title': video_title,
3770 'thumbnail': thumbnail,
3771 'uploader': uploader
# Information extractor for ted.com talks and playlists.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3775 class TEDIE(InfoExtractor):
3776 _VALID_URL=r'''http://www\.ted\.com/
3778 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3780 ((?P<type_talk>talks)) # We have a simple talk
3782 (/lang/(.*?))? # The url may contain the language
3783 /(?P<name>\w+) # Here goes the name and then ".html"
3787 def suitable(cls, url):
3788 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL here is written in re.VERBOSE mode.
3789 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3791 def _real_extract(self, url):
# Dispatch: single talk vs playlist, based on which URL branch matched.
3792 m=re.match(self._VALID_URL, url, re.VERBOSE)
3793 if m.group('type_talk'):
3794 return [self._talk_info(url)]
3796 playlist_id=m.group('playlist_id')
3797 name=m.group('name')
3798 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3799 return [self._playlist_videos_info(url,name,playlist_id)]
3801 def _talk_video_link(self,mediaSlug):
3802 '''Returns the video link for that mediaSlug'''
3803 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3805 def _playlist_videos_info(self,url,name,playlist_id=0):
3806 '''Returns the videos of the playlist'''
3808 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3809 ([.\s]*?)data-playlist_item_id="(\d+)"
3810 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3812 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3813 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3814 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3815 m_names=re.finditer(video_name_RE,webpage)
3817 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3818 m_playlist = re.search(playlist_RE, webpage)
3819 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is deferred back to this extractor via url_result.
3821 playlist_entries = []
3822 for m_video, m_name in zip(m_videos,m_names):
3823 video_id=m_video.group('video_id')
3824 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3825 playlist_entries.append(self.url_result(talk_url, 'TED'))
3826 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3828 def _talk_info(self, url, video_id=0):
3829 """Return the video for the talk in the url"""
3830 m=re.match(self._VALID_URL, url,re.VERBOSE)
3831 videoName=m.group('name')
3832 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3833 # If the url includes the language we get the title translated
3834 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3835 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and mediaSlug used to build
# the direct download link.
3836 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3837 "id":(?P<videoID>[\d]+).*?
3838 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3839 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3840 thumb_match=re.search(thumb_RE,webpage)
3841 info_match=re.search(info_RE,webpage,re.VERBOSE)
3842 video_id=info_match.group('videoID')
3843 mediaSlug=info_match.group('mediaSlug')
3844 video_url=self._talk_video_link(mediaSlug)
3850 'thumbnail': thumb_match.group('thumbnail')
# Information extractor for myspass.de, driven by the site's XML metadata API.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3854 class MySpassIE(InfoExtractor):
3855 _VALID_URL = r'http://www.myspass.de/.*'
3857 def _real_extract(self, url):
3858 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3860 # video id is the last path element of the URL
3861 # usually there is a trailing slash, so also try the second but last
3862 url_path = compat_urllib_parse_urlparse(url).path
3863 url_parent_path, video_id = os.path.split(url_path)
3865 _, video_id = os.path.split(url_parent_path)
3868 metadata_url = META_DATA_URL_TEMPLATE % video_id
3869 metadata_text = self._download_webpage(metadata_url, video_id)
3870 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3872 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are
# optional and checked for None individually below.
3873 url_flv_el = metadata.find('url_flv')
3874 if url_flv_el is None:
3875 raise ExtractorError(u'Unable to extract download url')
3876 video_url = url_flv_el.text
3877 extension = os.path.splitext(video_url)[1][1:]
3878 title_el = metadata.find('title')
3879 if title_el is None:
3880 raise ExtractorError(u'Unable to extract title')
3881 title = title_el.text
3882 format_id_el = metadata.find('format_id')
3883 if format_id_el is None:
3886 format = format_id_el.text
3887 description_el = metadata.find('description')
3888 if description_el is not None:
3889 description = description_el.text
3892 imagePreview_el = metadata.find('imagePreview')
3893 if imagePreview_el is not None:
3894 thumbnail = imagePreview_el.text
3903 'thumbnail': thumbnail,
3904 'description': description
# Information extractor for spiegel.de videos.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3908 class SpiegelIE(InfoExtractor):
3909 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3911 def _real_extract(self, url):
3912 m = re.match(self._VALID_URL, url)
3913 video_id = m.group('videoID')
3915 webpage = self._download_webpage(url, video_id)
3917 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
# Stream variants come from a per-video XML document on video2.spiegel.de.
3920 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3921 xml_code = self._download_webpage(xml_url, video_id,
3922 note=u'Downloading XML', errnote=u'Failed to download XML')
3924 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element of the XML root is taken as the chosen variant.
3925 last_type = idoc[-1]
3926 filename = last_type.findall('./filename')[0].text
3927 duration = float(last_type.findall('./duration')[0].text)
3929 video_url = 'http://video2.spiegel.de/flash/' + filename
3930 video_ext = filename.rpartition('.')[2]
3935 'title': video_title,
3936 'duration': duration,
# Information extractor for liveleak.com.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3940 class LiveLeakIE(InfoExtractor):
3942 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3943 IE_NAME = u'liveleak'
3945 def _real_extract(self, url):
3946 mobj = re.match(self._VALID_URL, url)
3948 raise ExtractorError(u'Invalid URL: %s' % url)
3950 video_id = mobj.group('video_id')
3952 webpage = self._download_webpage(url, video_id)
3954 video_url = self._search_regex(r'file: "(.*?)",',
3955 webpage, u'video URL')
# Strip the site-branding prefix from the og:title value.
3957 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3958 webpage, u'title').replace('LiveLeak.com -', '').strip()
3960 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3961 webpage, u'description', fatal=False)
3963 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3964 webpage, u'uploader', fatal=False)
3970 'title': video_title,
3971 'description': video_description,
3972 'uploader': video_uploader
# Information extractor for the ARD Mediathek.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
3977 class ARDIE(InfoExtractor):
3978 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3979 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3980 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3982 def _real_extract(self, url):
3983 # determine video id from url
# Prefer the documentId query parameter when present, else the URL path.
3984 m = re.match(self._VALID_URL, url)
3986 numid = re.search(r'documentId=([0-9]+)', url)
3988 video_id = numid.group(1)
3990 video_id = m.group('video_id')
3992 # determine title and media streams from webpage
3993 html = self._download_webpage(url, video_id)
3994 title = re.search(self._TITLE, html).group('title')
3995 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# An empty stream list is attributed to the FSK age restriction.
3997 assert '"fsk"' in html
3998 raise ExtractorError(u'This video is only available after 8:00 pm')
4000 # choose default media type and highest quality for now
4001 stream = max([s for s in streams if int(s["media_type"]) == 0],
4002 key=lambda s: int(s["quality"]))
4004 # there's two possibilities: RTMP stream or HTTP download
4005 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4006 if stream['rtmp_url']:
# RTMP: the rtmp_url is the server URL and video_url is the play path.
4007 self.to_screen(u'RTMP download detected')
4008 assert stream['video_url'].startswith('mp4:')
4009 info["url"] = stream["rtmp_url"]
4010 info["play_path"] = stream['video_url']
4012 assert stream["video_url"].endswith('.mp4')
4013 info["url"] = stream["video_url"]
# Information extractor for the ZDF Mediathek.
# NOTE(review): intermediate lines are elided in this chunk; original tokens
# are kept byte-identical, only comments are added.
4016 class ZDFIE(InfoExtractor):
4017 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4018 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4019 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4020 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4021 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4023 def _real_extract(self, url):
4024 mobj = re.match(self._VALID_URL, url)
4026 raise ExtractorError(u'Invalid URL: %s' % url)
4027 video_id = mobj.group('video_id')
4029 html = self._download_webpage(url, video_id)
4030 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4032 raise ExtractorError(u'No media url found.')
4034 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4035 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4036 # choose first/default media type and highest quality for now
# Quality preference: 300 (dsl1000) wstreaming first, then veryhigh.
4037 for s in streams: #find 300 - dsl1000mbit
4038 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4041 for s in streams: #find veryhigh - dsl2000mbit
4042 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4046 raise ExtractorError(u'No stream found.')
# The chosen stream URL points at a meta file that in turn names the
# actual mms:// (or rtsp://) media URL.
4048 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4050 self.report_extraction(video_id)
4051 mobj = re.search(self._TITLE, html)
4053 raise ExtractorError(u'Cannot extract title')
4054 title = unescapeHTML(mobj.group('title'))
4056 mobj = re.search(self._MMS_STREAM, media_link)
4058 mobj = re.search(self._RTSP_STREAM, media_link)
4060 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4061 mms_url = mobj.group('video_url')
# Derive the file extension from the final media URL.
4063 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4065 raise ExtractorError(u'Cannot extract extention')
4066 ext = mobj.group('ext')
4068 return [{'id': video_id,
# Extractor for Tumblr post/video pages: rebuilds the canonical post url from
# the blog name and numeric id, then pulls the \x22-escaped video_file url,
# its MIME subtype (used as extension), the first poster thumbnail and the
# page <title>.
# NOTE(review): original line numbers jump -- the "if video is None:" guard
# and the tail of the returned info dict are elided from this listing.
4074 class TumblrIE(InfoExtractor):
4075 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4077 def _real_extract(self, url):
4078 m_url = re.match(self._VALID_URL, url)
4079 video_id = m_url.group('id')
4080 blog = m_url.group('blog_name')
# Normalize /video/ urls to the /post/ form before downloading.
4082 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4083 webpage = self._download_webpage(url, video_id)
# The page embeds the player markup with \x22-escaped quotes, hence \\x22.
4085 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4086 video = re.search(re_video, webpage)
4088 raise ExtractorError(u'Unable to extract video')
4089 video_url = video.group('video_url')
4090 ext = video.group('ext')
4092 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4093 webpage, u'thumbnail', fatal=False) # We pick the first poster
4094 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4096 # The only place where you can get a title, it's not complete,
4097 # but searching in other places doesn't work for all videos
4098 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4099 webpage, u'title', flags=re.DOTALL)
4101 return [{'id': video_id,
4103 'title': video_title,
4104 'thumbnail': video_thumbnail,
# Extractor for free Bandcamp tracks: finds the freeDownloadPage link, parses
# the track's JSON blob out of the download page's javascript, and rebuilds
# the /statdownload/track url (with a fixed .rand) to obtain the final mp3-320
# url from the "retry_url" field.
# NOTE(review): original line numbers jump -- the tail of track_info and the
# final return are elided.  Also, the local "id" shadows the builtin id().
4108 class BandcampIE(InfoExtractor):
4109 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4111 def _real_extract(self, url):
4112 mobj = re.match(self._VALID_URL, url)
4113 title = mobj.group('title')
4114 webpage = self._download_webpage(url, title)
4115 # We get the link to the free download page
4116 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4117 if m_download is None:
4118 raise ExtractorError(u'No free songs found')
4120 download_link = m_download.group(1)
4121 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4122 webpage, re.MULTILINE|re.DOTALL).group('id')
4124 download_webpage = self._download_webpage(download_link, id,
4125 'Downloading free downloads page')
4126 # We get the dictionary of the track from some javascrip code
4127 info = re.search(r'items: (.*?),$',
4128 download_webpage, re.MULTILINE).group(1)
4129 info = json.loads(info)[0]
4130 # We pick mp3-320 for now, until format selection can be easily implemented.
4131 mp3_info = info[u'downloads'][u'mp3-320']
4132 # If we try to use this url it says the link has expired
4133 initial_url = mp3_info[u'url']
4134 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4135 m_url = re.match(re_url, initial_url)
4136 #We build the url we will use to get the final track url
4137 # This url is build in Bandcamp in the script download_bunde_*.js
4138 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4139 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4140 # If we could correctly generate the .rand field the url would be
4141 #in the "download_url" key
4142 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4144 track_info = {'id':id,
4145 'title' : info[u'title'],
4148 'thumbnail' : info[u'thumb_url'],
4149 'uploader' : info[u'artist']
# Extractor for redtube.com: the numeric id comes from the url, the mp4 url
# from a <source> tag and the title from the page heading.
# NOTE(review): original line numbers jump -- the "if mobj is None:" guard and
# the tail of the returned info dict are elided from this listing.
4154 class RedTubeIE(InfoExtractor):
4155 """Information Extractor for redtube"""
4156 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4158 def _real_extract(self,url):
4159 mobj = re.match(self._VALID_URL, url)
4161 raise ExtractorError(u'Invalid URL: %s' % url)
4163 video_id = mobj.group('id')
4164 video_extension = 'mp4'
4165 webpage = self._download_webpage(url, video_id)
4167 self.report_extraction(video_id)
4169 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4170 webpage, u'video URL')
4172 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4178 'ext': video_extension,
4179 'title': video_title,
# Extractor for ina.fr videos: downloads the player's MRSS feed for the id and
# reads the mp4 url from <media:player> and the title from a CDATA <title>.
# NOTE(review): original line numbers jump -- guard lines and the tail of the
# returned info dict are elided from this listing.
4182 class InaIE(InfoExtractor):
4183 """Information Extractor for Ina.fr"""
4184 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4186 def _real_extract(self,url):
4187 mobj = re.match(self._VALID_URL, url)
4189 video_id = mobj.group('id')
4190 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4191 video_extension = 'mp4'
4192 webpage = self._download_webpage(mrss_url, video_id)
4194 self.report_extraction(video_id)
4196 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4197 webpage, u'video URL')
4199 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4205 'ext': video_extension,
4206 'title': video_title,
# Extractor for howcast.com: rebuilds the canonical video page url from the
# numeric id, then reads the mobile mp4 url, og:title, description and
# og:image from the page meta tags.
# NOTE(review): original line numbers jump -- guard lines and the head/tail of
# the returned info dict are elided from this listing.
4209 class HowcastIE(InfoExtractor):
4210 """Information Extractor for Howcast.com"""
4211 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4213 def _real_extract(self, url):
4214 mobj = re.match(self._VALID_URL, url)
4216 video_id = mobj.group('id')
4217 webpage_url = 'http://www.howcast.com/videos/' + video_id
4218 webpage = self._download_webpage(webpage_url, video_id)
4220 self.report_extraction(video_id)
4222 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4223 webpage, u'video URL')
# Meta tags on this site may use either double or single quotes, hence the
# two-alternative capture groups.
4225 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4228 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4229 webpage, u'description', fatal=False)
4231 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4232 webpage, u'thumbnail', fatal=False)
4238 'title': video_title,
4239 'description': video_description,
4240 'thumbnail': thumbnail,
# Extractor for vine.co: reads the stream url from the twitter:player:stream
# meta tag plus og:title / og:image, and the uploader from the user <div>.
# NOTE(review): original line numbers jump -- guard lines and the head/tail of
# the returned info dict are elided from this listing.
4243 class VineIE(InfoExtractor):
4244 """Information Extractor for Vine.co"""
4245 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4247 def _real_extract(self, url):
4248 mobj = re.match(self._VALID_URL, url)
4250 video_id = mobj.group('id')
4251 webpage_url = 'https://vine.co/v/' + video_id
4252 webpage = self._download_webpage(webpage_url, video_id)
4254 self.report_extraction(video_id)
4256 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4257 webpage, u'video URL')
4259 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# og:image may carry a query string; the second group strips it from the match.
4262 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4263 webpage, u'thumbnail', fatal=False)
4265 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4266 webpage, u'uploader', fatal=False, flags=re.DOTALL)
4272 'title': video_title,
4273 'thumbnail': thumbnail,
4274 'uploader': uploader,
# Extractor for Flickr videos: a two-step XML dance -- fetch video_mtl_xml.gne
# with the page's photo_secret to get a node id, then video_playlist.gne to
# get the RTMP app/path pair, concatenated into the final video url.
# NOTE(review): original line numbers jump -- guard lines and the head/tail of
# the returned info dict are elided from this listing.
4277 class FlickrIE(InfoExtractor):
4278 """Information Extractor for Flickr videos"""
4279 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4281 def _real_extract(self, url):
4282 mobj = re.match(self._VALID_URL, url)
4284 video_id = mobj.group('id')
4285 video_uploader_id = mobj.group('uploader_id')
4286 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4287 webpage = self._download_webpage(webpage_url, video_id)
4289 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4291 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4292 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4294 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4295 first_xml, u'node_id')
4297 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4298 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4300 self.report_extraction(video_id)
4302 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4304 raise ExtractorError(u'Unable to extract video url')
# APP + unescaped FULLPATH together form the complete stream url.
4305 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4307 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4308 webpage, u'video title')
4310 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4311 webpage, u'description', fatal=False)
4313 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4314 webpage, u'thumbnail', fatal=False)
4320 'title': video_title,
4321 'description': video_description,
4322 'thumbnail': thumbnail,
4323 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com: finds the numeric id in the <article> tag, then
# downloads the cvp XML data file and takes the <file type="high"> url.
# NOTE(review): original line numbers jump -- guard lines and the head/tail of
# the returned info dict are elided from this listing.
4326 class TeamcocoIE(InfoExtractor):
4327 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4329 def _real_extract(self, url):
4330 mobj = re.match(self._VALID_URL, url)
4332 raise ExtractorError(u'Invalid URL: %s' % url)
4333 url_title = mobj.group('url_title')
4334 webpage = self._download_webpage(url, url_title)
4336 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4337 webpage, u'video id')
4339 self.report_extraction(video_id)
4341 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4344 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4345 webpage, u'thumbnail', fatal=False)
4347 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4348 webpage, u'description', fatal=False)
4350 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4351 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4353 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
4360 'title': video_title,
4361 'thumbnail': thumbnail,
4362 'description': video_description,
# Extractor for xhamster.com movie pages: parses the srv/file pair from the
# player javascript (file is either a complete percent-encoded url or a path
# to be appended to the server), plus title, upload date, uploader id and
# thumbnail.
# NOTE(review): original line numbers jump -- "if mobj is None:", the else of
# the server check, and the head of the returned info dict are elided.
4365 class XHamsterIE(InfoExtractor):
4366 """Information Extractor for xHamster"""
4367 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4369 def _real_extract(self,url):
4370 mobj = re.match(self._VALID_URL, url)
4372 video_id = mobj.group('id')
4373 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4374 webpage = self._download_webpage(mrss_url, video_id)
4376 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4378 raise ExtractorError(u'Unable to extract media URL')
# Empty 'srv' means 'file' is already a full (urlencoded) media url.
4379 if len(mobj.group('server')) == 0:
4380 video_url = compat_urllib_parse.unquote(mobj.group('file'))
4382 video_url = mobj.group('server')+'/key='+mobj.group('file')
4383 video_extension = video_url.split('.')[-1]
4385 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4388 # Can't see the description anywhere in the UI
4389 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4390 # webpage, u'description', fatal=False)
4391 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is scraped from a tooltip timestamp and joined to YYYYMMDD.
4393 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4395 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4397 video_upload_date = None
4398 self._downloader.report_warning(u'Unable to extract upload date')
4400 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4401 webpage, u'uploader id', default=u'anonymous')
4403 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4404 webpage, u'thumbnail', fatal=False)
4409 'ext': video_extension,
4410 'title': video_title,
4411 # 'description': video_description,
4412 'upload_date': video_upload_date,
4413 'uploader_id': video_uploader_id,
4414 'thumbnail': video_thumbnail
# Extractor for hypem.com tracks: fetches the page with ax/ts query params to
# obtain a session cookie, parses the displayList-data JSON for the first
# track (id, key, artist, song), then POSTs to /serve/source with the cookie
# to get the final media url.
# NOTE(review): original line numbers jump -- try/except framing around the
# json.loads calls, the "key = track[u'key']" line and the final return are
# elided from this listing.
4417 class HypemIE(InfoExtractor):
4418 """Information Extractor for hypem"""
4419 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4421 def _real_extract(self, url):
4422 mobj = re.match(self._VALID_URL, url)
4424 raise ExtractorError(u'Invalid URL: %s' % url)
4425 track_id = mobj.group(1)
4427 data = { 'ax': 1, 'ts': time.time() }
4428 data_encoded = compat_urllib_parse.urlencode(data)
4429 complete_url = url + "?" + data_encoded
4430 request = compat_urllib_request.Request(complete_url)
4431 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The serve request below only works with the cookie from this first response.
4432 cookie = urlh.headers.get('Set-Cookie', '')
4434 self.report_extraction(track_id)
4436 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4437 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4439 track_list = json.loads(html_tracks)
4440 track = track_list[u'tracks'][0]
4442 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4445 track_id = track[u"id"]
4446 artist = track[u"artist"]
4447 title = track[u"song"]
4449 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4450 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4451 request.add_header('cookie', cookie)
4452 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4454 song_data = json.loads(song_data_json)
4456 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4457 final_url = song_data[u"url"]
# Extractor for vbox7.com: follows the javascript window.location redirect on
# the play page, takes the title from <title>, then POSTs the id to
# /play/magare.do and splits the "url=...&thumb=..." style response into the
# final media url and thumbnail.
# NOTE(review): original line numbers jump -- "if mobj is None:" body layout
# and the head of the returned info dict are elided from this listing.
4467 class Vbox7IE(InfoExtractor):
4468 """Information Extractor for Vbox7"""
4469 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4471 def _real_extract(self,url):
4472 mobj = re.match(self._VALID_URL, url)
4474 raise ExtractorError(u'Invalid URL: %s' % url)
4475 video_id = mobj.group(1)
4477 redirect_page, urlh = self._download_webpage_handle(url, video_id)
4478 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4479 redirect_url = urlh.geturl() + new_location
4480 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# The site appends " / vbox7" style suffixes; keep only the part before '/'.
4482 title = self._html_search_regex(r'<title>(.*)</title>',
4483 webpage, u'title').split('/')[0].strip()
4486 info_url = "http://vbox7.com/play/magare.do"
4487 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4488 info_request = compat_urllib_request.Request(info_url, data)
4489 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4490 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4491 if info_response is None:
4492 raise ExtractorError(u'Unable to extract the media url')
# Response has the shape "key1=url&key2=thumb"; keep only the values.
4493 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4500 'thumbnail': thumbnail_url,
# Extractor for gametrailers.com: finds the MTV-style mgid on the page (the
# attribute differs for full episodes vs. regular videos), then downloads the
# mrss feed for metadata and the mediagen feed for the list of stream urls,
# keeping the last (best-quality) <src>.
# NOTE(review): original line numbers jump -- guard lines, parts of info_re
# and the head/tail of the returned dict are elided from this listing.
4503 class GametrailersIE(InfoExtractor):
4504 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4506 def _real_extract(self, url):
4507 mobj = re.match(self._VALID_URL, url)
4509 raise ExtractorError(u'Invalid URL: %s' % url)
4510 video_id = mobj.group('id')
4511 video_type = mobj.group('type')
4512 webpage = self._download_webpage(url, video_id)
4513 if video_type == 'full-episodes':
4514 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4516 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4517 mgid = self._search_regex(mgid_re, webpage, u'mgid')
4518 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4520 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4521 video_id, u'Downloading video info')
4522 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4523 video_id, u'Downloading video urls info')
4525 self.report_extraction(video_id)
4526 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4527 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4529 <url>(?P<thumb>.*?)</url>.*
4532 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4534 raise ExtractorError(u'Unable to extract video info')
4535 video_title = m_info.group('title')
4536 video_description = m_info.group('description')
4537 video_thumb = m_info.group('thumb')
4539 m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
4540 if m_urls is None or len(m_urls) == 0:
# NOTE(review): BUG -- "ExtractError" is not a defined name anywhere in view
# (every other raise uses ExtractorError), so this branch would itself fail
# with a NameError; "extrat" is also a typo for "extract".  Left untouched
# here because this is a documentation-only pass.
4541 raise ExtractError(u'Unable to extrat video url')
4542 # They are sorted from worst to best quality
4543 video_url = m_urls[-1].group('url')
4545 return {'url': video_url,
4547 'title': video_title,
4548 # Videos are actually flv not mp4
4550 'thumbnail': video_thumb,
4551 'description': video_description,
# Factory for the FileDownloader: builds one instance of every supported IE.
# Order is significant -- the first extractor whose suitable() matches wins.
# NOTE(review): the original line numbers jump from 4559 to 4584 to 4594, so
# the vast majority of the extractor list (and the closing bracket) is elided
# from this listing; do not edit the list from this excerpt alone.
4554 def gen_extractors():
4555 """ Return a list of an instance of every supported extractor.
4556 The order does matter; the first extractor matched is the one handling the URL.
4559 YoutubePlaylistIE(),
4584 StanfordOpenClassroomIE(),
4594 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class with the given ie_name.

    Looks up the class named ``<ie_name>IE`` (e.g. ``'Youtube'`` ->
    ``YoutubeIE``) in this module's globals.  Raises KeyError when no such
    extractor class exists.
    """
    # Extractor classes follow the "<Name>IE" naming convention used
    # throughout this module, so the lookup key is built by suffixing 'IE'.
    class_name = ie_name + 'IE'
    return globals()[class_name]