2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): this excerpt is elided; the original may initialize
        # additional state here (e.g. a "ready" flag) — confirm against full file.
        self.set_downloader(downloader)
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates to the subclass-provided _real_extract().
        # NOTE(review): an initialization call appears to be elided from this excerpt.
        return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Subclass hook; concrete IEs override this (see e.g. YoutubeIE below).
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Subclass hook; must return a list of info dictionaries (see class docstring).
115 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # (excerpt: some original lines — the note/None branch and the try: —
        # are elided from this view)
        self.report_download_webpage(video_id)
        elif note is not False:
            # Custom progress note supplied by the caller.
            self.to_screen(u'%s: %s' % (video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        # (excerpt: the charset-fallback and try/except around get_full_url()
        # are partially elided from this view)
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # Request objects expose get_full_url(); plain URL strings do not.
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # base64 so binary-ish pages survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps decoding from raising on malformed byte sequences.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        # Tags the info dict so the processing chain treats it as a single video.
        # NOTE(review): the return statement is not visible in this excerpt.
        video_info['_type'] = 'video'
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        # (excerpt: the remainder of the dict literal and the return are elided)
        video_info = {'_type': 'url',
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # (excerpt: parts of the dict literal / conditionals and the return are elided)
        video_info = {'_type': 'playlist',
            # id/title are only attached when the caller supplied them.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or an
        ExtractorError, depending on fatal, specifying the field name.
        """
        # (excerpt: the list-of-patterns loop and some branches are elided)
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            mobj = re.search(p, string, flags)
        # Highlight the field name in blue on capable terminals (not Windows).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
        raise ExtractorError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on GitHub.' % _name)
    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        # (excerpt: the None-result branch is elided from this view)
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        return clean_html(res).strip()
235 class SearchInfoExtractor(InfoExtractor):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        """Dispatch a search query to _get_n_results based on its prefix."""
        # (excerpt: some branches — e.g. the empty-prefix test and the n parse —
        # are elided from this view)
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError(u'Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix: single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp the request to this IE's maximum.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
317 _video_dimensions = {
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report that available subtitles are being checked."""
        # (docstring corrected: original was copy-pasted from the
        # video-info-webpage reporter)
        self.to_screen(u'%s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (language + format)."""
        # (docstring corrected: original was copy-pasted from the
        # video-info-webpage reporter)
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (docstring corrected: original said "Report extracted video URL")
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle language list for *video_id*.

        On error, returns a (error_message, None) tuple instead of a dict.
        (excerpt: the try: and the final return of the dict are elided)
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name from the XML track list.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track. Returns a tuple:
        (error_message, sub_lang, sub)
        """
        # (excerpt: the urlencode() argument dict and the try: are elided)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            # Empty response body: no subtitle data came back.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        # (excerpt: the urlencode() dict and surrounding try/except are elided)
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives inside the embedded player config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
            # Any failure above falls back to the generic error tuple.
            return [(err_msg, None, None)]
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        # (excerpt: the 'en' branch body and the final return are elided)
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            # Explicit language requested by the user.
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # Otherwise fall back to the first available language.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for *video_id*.

        (excerpt: the subtitles-list initialization and final return are elided)
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """List each format with its container extension and dimensions."""
        # (excerpt: the loop header over *formats* binding x is elided)
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set language, log in (credentials or .netrc) and confirm age.

        (excerpt: several try:/if: lines and parts of the login form dict are
        elided from this view)
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a broken .netrc only warns, it does not abort.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force the YouTube UI to English so the scraping regexps match.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX / dsh tokens the Google login form requires.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age (age-gated videos); failure here is fatal.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the bare video id from any supported YouTube URL form.

        (excerpt: the None check and the return are elided from this view)
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 of _VALID_URL is the [0-9A-Za-z_-]+ id capture.
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract info dict(s) for a single YouTube watch URL.

        (excerpt: numerous try:/if:/else: lines, the url_map initialization,
        and the results-list construction are elided from this view)
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JS-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id: optional; scraped from the webpage, warn on failure
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then parse with unified_strdate
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

        # description: page element first, <meta> tag as fallback
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            video_description = unescapeHTML(fd_mobj.group(1))
            video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            (sub_error, sub_lang, sub) = video_subtitles[0]
            # We try with the automatic captions
            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
            (sub_error_auto, sub_lang, sub) = video_subtitles[0]
            # We report the original error
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> signed URL map from the stream-map query string.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                self._video_dimensions.get(format_param, '???'))

            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
797 class MetacafeIE(InfoExtractor):
798 """Information Extractor for metacafe.com."""
800 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
801 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
802 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
803 IE_NAME = u'metacafe'
805 def report_disclaimer(self):
806 """Report disclaimer retrieval."""
807 self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and confirm age.

        (excerpt: try: lines and part of the disclaimer form dict are elided)
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the filter form.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract the video info dict for a metacafe.com watch URL.

        (excerpt: several if:/else: lines and the final return-list wrapper
        are elided from this view)
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # 'yt-<id>' items are delegated to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob for mediaData.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            raise ExtractorError(u'Unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode() on these values is Python-2-only (str -> unicode).
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
893 class DailymotionIE(InfoExtractor):
894 """Information Extractor for Dailymotion"""
896 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
897 IE_NAME = u'dailymotion'
    def _real_extract(self, url):
        """Extract the video info dict for a Dailymotion video URL.

        (excerpt: several if:/else: lines, the quality-selection loop body and
        the final return-list wrapper are elided from this view)
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Strip title suffix and query string from the id path segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated pages render.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Reassemble DD-MM-YYYY into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
963 class PhotobucketIE(InfoExtractor):
964 """Information extractor for photobucket.com."""
966 # TODO: the original _VALID_URL was:
967 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
968 # Check if it's necessary to keep the old extracion process
969 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
970 IE_NAME = u'photobucket'
    def _real_extract(self, url):
        """Extract the video info dict for a photobucket.com media URL.

        (excerpt: several if:/return lines are elided from this view)
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        info = json.loads(mobj.group('json'))
            'url': info[u'downloadUrl'],
            'uploader': info[u'username'],
            'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
            'title': info[u'title'],
            'ext': video_extension,
            'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode() on these values is Python-2-only (str -> unicode).
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
1021 class YahooIE(InfoExtractor):
# Extractor for screen.yahoo.com video pages.
# NOTE(review): elided numbered listing — some guard/`if`/`return` lines
# are missing between the visible lines; the control flow shown is partial.
1022 """Information extractor for screen.yahoo.com."""
1023 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1025 def _real_extract(self, url):
1026 mobj = re.match(self._VALID_URL, url)
# (elided) presumably guarded by `if mobj is None:`
1028 raise ExtractorError(u'Invalid URL: %s' % url)
1029 video_id = mobj.group('id')
1030 webpage = self._download_webpage(url, video_id)
# Pages may declare an alternate content id via YUI; its presence selects
# between the two extraction strategies below.
1031 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
# Strategy 1: query the cosmos.bcst.yahoo.com MRSS REST endpoint directly.
1034 # TODO: Check which url parameters are required
1035 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1036 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose multi-line regex over the MRSS XML: title, description,
# publication date and large thumbnail.
1037 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1038 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1039 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1040 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1042 self.report_extraction(video_id)
1043 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1045 raise ExtractorError(u'Unable to extract video info')
1046 video_title = m_info.group('title')
1047 video_description = m_info.group('description')
1048 video_thumb = m_info.group('thumb')
1049 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD `upload_date` convention.
1050 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1052 # TODO: Find a way to get mp4 videos
# Second REST call resolves the actual stream host/path (flv only).
1053 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1054 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1055 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1056 video_url = m_rest.group('url')
1057 video_path = m_rest.group('path')
1059 raise ExtractorError(u'Unable to extract video url')
# Strategy 2: a CONTENT_ID was found — use the public YQL JSONP endpoint.
1061 else: # We have to use a different method if another id is defined
1062 long_id = m_id.group('new_id')
1063 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1064 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing the JSON body.
1065 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1066 info = json.loads(json_str)
1067 res = info[u'query'][u'results'][u'mediaObj'][0]
# First stream entry supplies the rtmp host and play path.
1068 stream = res[u'streams'][0]
1069 video_path = stream[u'path']
1070 video_url = stream[u'host']
# (elided) `meta` is presumably bound from `res` on a missing line.
1072 video_title = meta[u'title']
1073 video_description = meta[u'description']
1074 video_thumb = meta[u'thumbnail']
1075 video_date = None # I can't find it
# Result dict (opening lines elided); `play_path` supports rtmpdump.
1080 'play_path': video_path,
1081 'title':video_title,
1082 'description': video_description,
1083 'thumbnail': video_thumb,
1084 'upload_date': video_date,
1089 class VimeoIE(InfoExtractor):
# Extractor for vimeo.com (including player.vimeo.com and vimeopro).
# NOTE(review): elided numbered listing — `try:`/`except`/`return` lines
# are missing between visible lines; treat visible flow as partial.
1090 """Information extractor for vimeo.com."""
1092 # _VALID_URL matches Vimeo URLs
1093 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1096 def _real_extract(self, url, new_video=True):
1097 # Extract ID from URL
1098 mobj = re.match(self._VALID_URL, url)
1100 raise ExtractorError(u'Invalid URL: %s' % url)
1102 video_id = mobj.group('id')
# Normalize scheme-less URLs and canonicalize pro/player links so the
# config JSON is always served from the main vimeo.com page.
1103 if not mobj.group('proto'):
1104 url = 'https://' + url
1105 if mobj.group('direct_link') or mobj.group('pro'):
1106 url = 'https://vimeo.com/' + video_id
1108 # Retrieve video webpage to extract further information
1109 request = compat_urllib_request.Request(url, None, std_headers)
1110 webpage = self._download_webpage(request, video_id)
1112 # Now we begin extracting as much information as we can from what we
1113 # retrieved. First we extract the information common to all extractors,
1114 # and latter we extract those that are Vimeo specific.
1115 self.report_extraction(video_id)
1117 # Extract the config JSON
# Crude but effective: slice the embedded `{config:...}` object out of
# the page script and parse it as JSON.
1119 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1120 config = json.loads(config)
# (elided) the next two raises are presumably inside an `except` branch
# for the config-parsing block above.
1122 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1123 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1125 raise ExtractorError(u'Unable to extract info section')
1128 video_title = config["video"]["title"]
1130 # Extract uploader and uploader_id
1131 video_uploader = config["video"]["owner"]["name"]
# uploader_id is the last path segment of the owner URL, when present.
1132 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1134 # Extract video thumbnail
1135 video_thumbnail = config["video"]["thumbnail"]
1137 # Extract video description
1138 video_description = get_element_by_attribute("itemprop", "description", webpage)
1139 if video_description: video_description = clean_html(video_description)
1140 else: video_description = u''
1142 # Extract upload date
1143 video_upload_date = None
1144 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1145 if mobj is not None:
1146 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1148 # Vimeo specific: extract request signature and timestamp
# sig + timestamp are required query parameters of play_redirect below.
1149 sig = config['request']['signature']
1150 timestamp = config['request']['timestamp']
1152 # Vimeo specific: extract video codec and quality information
1153 # First consider quality, then codecs, then take everything
1154 # TODO bind to format param
# Codec preference order; each codec maps to its container extension.
1155 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1156 files = { 'hd': [], 'sd': [], 'other': []}
1157 for codec_name, codec_extension in codecs:
1158 if codec_name in config["video"]["files"]:
1159 if 'hd' in config["video"]["files"][codec_name]:
1160 files['hd'].append((codec_name, codec_extension, 'hd'))
1161 elif 'sd' in config["video"]["files"][codec_name]:
1162 files['sd'].append((codec_name, codec_extension, 'sd'))
# Neither hd nor sd: fall back to whatever quality label is listed first.
1164 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first available (codec, ext, quality) in hd > sd > other order.
1166 for quality in ('hd', 'sd', 'other'):
1167 if len(files[quality]) > 0:
1168 video_quality = files[quality][0][2]
1169 video_codec = files[quality][0][0]
1170 video_extension = files[quality][0][1]
1171 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
# (elided) this raise presumably sits in an `else:` after the loop.
1174 raise ExtractorError(u'No known codec found')
1176 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1177 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dict (opening lines elided from this listing).
1182 'uploader': video_uploader,
1183 'uploader_id': video_uploader_id,
1184 'upload_date': video_upload_date,
1185 'title': video_title,
1186 'ext': video_extension,
1187 'thumbnail': video_thumbnail,
1188 'description': video_description,
1192 class ArteTvIE(InfoExtractor):
# Extractor for videos.arte.tv (fr/de), with separate handling for the
# live-stream index pages and regular "+7" catch-up streams.
# NOTE(review): elided numbered listing — `try:`/`return`/dict-literal
# lines are missing between visible lines; flow shown is partial.
1193 """arte.tv information extractor."""
1195 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1196 _LIVE_URL = r'index-[0-9]+\.html$'
1198 IE_NAME = u'arte.tv'
1200 def fetch_webpage(self, url):
# Download a page body directly via urllib (bypasses _download_webpage),
# translating network errors into ExtractorError.
1201 request = compat_urllib_request.Request(url)
1203 self.report_download_webpage(url)
1204 webpage = compat_urllib_request.urlopen(request).read()
1205 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1206 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1207 except ValueError as err:
1208 raise ExtractorError(u'Invalid URL: %s' % url)
1211 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, match `regex` with `regexFlags`, and build an info dict
# from matchTuples = [(group_index, key, error_message), ...]; each
# missing group raises its associated error.
1212 page = self.fetch_webpage(url)
1213 mobj = re.search(regex, page, regexFlags)
1217 raise ExtractorError(u'Invalid URL: %s' % url)
1219 for (i, key, err) in matchTuples:
1220 if mobj.group(i) is None:
1221 raise ExtractorError(err)
# (elided) else-branch: store the captured group under `key`.
1223 info[key] = mobj.group(i)
1227 def extractLiveStream(self, url):
# Live streams: chase the videothek JS file, then pull the rtmp path,
# SWF player and stream URL out of it.
1228 video_lang = url.split('/')[-4]
1229 info = self.grep_webpage(
1231 r'src="(.*?/videothek_js.*?\.js)',
1234 (1, 'url', u'Invalid URL: %s' % url)
1237 http_host = url.split('/')[2]
1238 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1239 info = self.grep_webpage(
1241 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1242 '(http://.*?\.swf).*?' +
1246 (1, 'path', u'could not extract video path: %s' % url),
1247 (2, 'player', u'could not extract video player: %s' % url),
1248 (3, 'url', u'could not extract video url: %s' % url)
1251 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1253 def extractPlus7Stream(self, url):
# Catch-up ("+7") streams: three chained page fetches — movie param,
# language-specific <video> ref, then the final metadata/url XML.
1254 video_lang = url.split('/')[-3]
1255 info = self.grep_webpage(
1257 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1260 (1, 'url', u'Invalid URL: %s' % url)
1263 next_url = compat_urllib_parse.unquote(info.get('url'))
1264 info = self.grep_webpage(
1266 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1269 (1, 'url', u'Could not find <video> tag: %s' % url)
1272 next_url = compat_urllib_parse.unquote(info.get('url'))
1274 info = self.grep_webpage(
# Final XML: id, display name, date, and hd-quality URL.
1276 r'<video id="(.*?)".*?>.*?' +
1277 '<name>(.*?)</name>.*?' +
1278 '<dateVideo>(.*?)</dateVideo>.*?' +
1279 '<url quality="hd">(.*?)</url>',
1282 (1, 'id', u'could not extract video id: %s' % url),
1283 (2, 'title', u'could not extract video title: %s' % url),
1284 (3, 'date', u'could not extract video date: %s' % url),
1285 (4, 'url', u'could not extract video url: %s' % url)
# Result dict (opening line elided from this listing).
1290 'id': info.get('id'),
1291 'url': compat_urllib_parse.unquote(info.get('url')),
1292 'uploader': u'arte.tv',
1293 'upload_date': unified_strdate(info.get('date')),
1294 'title': info.get('title').decode('utf-8'),
1300 def _real_extract(self, url):
# Route live index pages to extractLiveStream, everything else to
# extractPlus7Stream.
1301 video_id = url.split('/')[-1]
1302 self.report_extraction(video_id)
1304 if re.search(self._LIVE_URL, video_id) is not None:
1305 self.extractLiveStream(url)
1308 info = self.extractPlus7Stream(url)
1313 class GenericIE(InfoExtractor):
# Last-resort extractor: follows redirects (URL shorteners), then tries a
# cascade of heuristic regexes for common embedded-player patterns.
# NOTE(review): elided numbered listing — several `if mobj is None:` and
# `return` lines are missing between visible lines.
1314 """Generic last-resort information extractor."""
1317 IE_NAME = u'generic'
1319 def report_download_webpage(self, video_id):
1320 """Report webpage download."""
# Warn about the generic fallback, except when running the test suite.
1321 if not self._downloader.params.get('test', False):
1322 self._downloader.report_warning(u'Falling back on generic information extractor.')
1323 super(GenericIE, self).report_download_webpage(video_id)
1325 def report_following_redirect(self, new_url):
1326 """Report information extraction."""
1327 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1329 def _test_redirect(self, url):
1330 """Check if it is a redirect, like url shorteners, in case return the new url."""
# HEAD-based probe: cheaper than GET for discovering the final URL.
1331 class HeadRequest(compat_urllib_request.Request):
1332 def get_method(self):
1335 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1337 Subclass the HTTPRedirectHandler to make it use our
1338 HeadRequest also on the redirected URL
1340 def redirect_request(self, req, fp, code, msg, headers, newurl):
1341 if code in (301, 302, 303, 307):
1342 newurl = newurl.replace(' ', '%20')
# Drop entity headers that no longer apply to the redirected HEAD.
1343 newheaders = dict((k,v) for k,v in req.headers.items()
1344 if k.lower() not in ("content-length", "content-type"))
1345 return HeadRequest(newurl,
1347 origin_req_host=req.get_origin_req_host(),
1350 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1352 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1354 Fallback to GET if HEAD is not allowed (405 HTTP error)
1356 def http_error_405(self, req, fp, code, msg, headers):
1360 newheaders = dict((k,v) for k,v in req.headers.items()
1361 if k.lower() not in ("content-length", "content-type"))
1362 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1364 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with just the handlers needed for the probe.
1368 opener = compat_urllib_request.OpenerDirector()
1369 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1370 HTTPMethodFallback, HEADRedirectHandler,
1371 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1372 opener.add_handler(handler())
1374 response = opener.open(HeadRequest(url))
1375 if response is None:
1376 raise ExtractorError(u'Invalid URL protocol')
1377 new_url = response.geturl()
# (elided) presumably returns False when new_url == url.
1382 self.report_following_redirect(new_url)
1385 def _real_extract(self, url):
1386 new_url = self._test_redirect(url)
# Delegate redirected URLs back to the full extractor list.
1387 if new_url: return [self.url_result(new_url)]
1389 video_id = url.split('/')[-1]
1391 webpage = self._download_webpage(url, video_id)
1392 except ValueError as err:
1393 # since this is the last-resort InfoExtractor, if
1394 # this error is thrown, it'll be thrown here
1395 raise ExtractorError(u'Invalid URL: %s' % url)
1397 self.report_extraction(video_id)
# Heuristic cascade; each fallback fires only when the previous regex
# found nothing (the `if mobj is None:` guards are elided here).
1398 # Start with something easy: JW Player in SWFObject
1399 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1401 # Broaden the search a little bit
1402 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1404 # Broaden the search a little bit: JWPlayer JS loader
1405 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1407 # Try to find twitter cards info
1408 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1410 raise ExtractorError(u'Invalid URL: %s' % url)
1412 # It's possible that one of the regexes
1413 # matched, but returned an empty group:
1414 if mobj.group(1) is None:
1415 raise ExtractorError(u'Invalid URL: %s' % url)
1417 video_url = compat_urllib_parse.unquote(mobj.group(1))
1418 video_id = os.path.basename(video_url)
1420 # here's a fun little line of code for you:
1421 video_extension = os.path.splitext(video_id)[1][1:]
1422 video_id = os.path.splitext(video_id)[0]
1424 # it's tempting to parse this further, but you would
1425 # have to take into account all the variations like
1426 # Video Title - Site Name
1427 # Site Name | Video Title
1428 # Video Title - Tagline | Site Name
1429 # and so on and so forth; it's just not practical
1430 video_title = self._html_search_regex(r'<title>(.*)</title>',
1431 webpage, u'video title')
1433 # video uploader is domain name
1434 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1435 url, u'video uploader')
# Result dict (opening lines elided from this listing).
1440 'uploader': video_uploader,
1441 'upload_date': None,
1442 'title': video_title,
1443 'ext': video_extension,
1447 class YoutubeSearchIE(SearchInfoExtractor):
# `ytsearch` keyword: pages through the GData JSON-C API 50 results at a
# time and returns the collected watch URLs as a playlist.
# NOTE(review): elided numbered listing — loop-setup lines (initializing
# video_ids/pagenum/limit) are missing between visible lines.
1448 """Information Extractor for YouTube search queries."""
1449 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1451 IE_NAME = u'youtube:search'
1452 _SEARCH_KEY = 'ytsearch'
1454 def report_download_page(self, query, pagenum):
1455 """Report attempt to download search page with given number."""
1456 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1458 def _get_n_results(self, query, n):
1459 """Get a specified number of results for a query"""
# Loop until we have collected `limit` ids (50 per API page).
1465 while (50 * pagenum) < limit:
1466 self.report_download_page(query, pagenum+1)
# GData start-index is 1-based, hence the +1.
1467 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1468 request = compat_urllib_request.Request(result_url)
1470 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1471 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1472 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1473 api_response = json.loads(data)['data']
1475 if not 'items' in api_response:
1476 raise ExtractorError(u'[youtube] No video results')
1478 new_ids = list(video['id'] for video in api_response['items'])
1479 video_ids += new_ids
# Cap the target at whatever the API says actually exists.
1481 limit = min(n, api_response['totalItems'])
1484 if len(video_ids) > n:
1485 video_ids = video_ids[:n]
1486 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1487 return self.playlist_result(videos, query)
1490 class GoogleSearchIE(SearchInfoExtractor):
# `gvsearch` keyword: scrapes Google Video search result pages (10 hits
# per page) until `n` entries are collected or no next-page link remains.
# NOTE(review): elided numbered listing — the `res` dict initialization
# and return statement are partially missing from this view.
1491 """Information Extractor for Google Video search queries."""
1492 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1494 IE_NAME = u'video.google:search'
1495 _SEARCH_KEY = 'gvsearch'
1497 def _get_n_results(self, query, n):
1498 """Get a specified number of results for a query"""
# Accumulate url_results into a playlist-shaped dict.
1501 '_type': 'playlist',
1506 for pagenum in itertools.count(1):
1507 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1508 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1509 note='Downloading result page ' + str(pagenum))
# Each organic result link lives in an <h3 class="r"> anchor.
1511 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1514 'url': mobj.group(1)
1516 res['entries'].append(e)
# Stop when enough results are gathered or the "next" control is absent.
1518 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1521 class YahooSearchIE(SearchInfoExtractor):
# `yvsearch` keyword: queries Yahoo video search's JSON endpoint 30 hits
# per page and turns each hit into a url_result for YahooIE.
# NOTE(review): elided numbered listing — the `res` dict initialization,
# the binding of `m` (pagination metadata), and the return are missing.
1522 """Information Extractor for Yahoo! Video search queries."""
1525 IE_NAME = u'screen.yahoo:search'
1526 _SEARCH_KEY = 'yvsearch'
1528 def _get_n_results(self, query, n):
1529 """Get a specified number of results for a query"""
1532 '_type': 'playlist',
1536 for pagenum in itertools.count(0):
1537 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1538 webpage = self._download_webpage(result_url, query,
1539 note='Downloading results page '+str(pagenum+1))
# Response body is JSON, not HTML.
1540 info = json.loads(webpage)
1542 results = info[u'results']
1544 for (i, r) in enumerate(results):
# Stop mid-page once `n` total results have been emitted.
1545 if (pagenum * 30) +i >= n:
1547 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1548 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1549 res['entries'].append(e)
# `m` presumably holds pagination metadata from `info` — its binding is
# on an elided line; `last`/`total` mark the final page.
1550 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1556 class YoutubePlaylistIE(InfoExtractor):
# Extractor for YouTube playlists via the GData JSON API, paging with
# max-results/start-index until a short page signals the end.
# NOTE(review): elided numbered listing — the verbose-regex body of
# _VALID_URL, the page loop opener, and _MAX_RESULTS are partially missing.
1557 """Information Extractor for YouTube playlists."""
1559 _VALID_URL = r"""(?:
1564 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1565 \? (?:.*?&)*? (?:p|a|list)=
1568 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1571 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1573 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1575 IE_NAME = u'youtube:playlist'
1578 def suitable(cls, url):
1579 """Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is a verbose-mode pattern, so re.VERBOSE is mandatory here.
1580 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1582 def _real_extract(self, url):
1583 # Extract playlist id
1584 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1586 raise ExtractorError(u'Invalid URL: %s' % url)
1588 # Download playlist videos from API
# Either alternative capture group may carry the playlist id.
1589 playlist_id = mobj.group(1) or mobj.group(2)
1594 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1595 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1598 response = json.loads(page)
1599 except ValueError as err:
1600 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1602 if 'feed' not in response:
1603 raise ExtractorError(u'Got a malformed response from YouTube API')
1604 playlist_title = response['feed']['title']['$t']
1605 if 'entry' not in response['feed']:
1606 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch-url) pairs so results can be order-sorted.
1609 for entry in response['feed']['entry']:
1610 index = entry['yt$position']['$t']
1611 if 'media$group' in entry and 'media$player' in entry['media$group']:
1612 videos.append((index, entry['media$group']['media$player']['url']))
# Short page => last page of the playlist.
1614 if len(response['feed']['entry']) < self._MAX_RESULTS:
1618 videos = [v[1] for v in sorted(videos)]
1620 url_results = [self.url_result(url, 'Youtube') for url in videos]
1621 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1624 class YoutubeChannelIE(InfoExtractor):
# Extractor for YouTube channels: scrapes the first HTML page, then pages
# through the channel_ajax JSON endpoint while a "load more" marker exists.
# NOTE(review): elided numbered listing — pagenum/video_ids initialization
# and loop/break lines are missing between visible lines.
1625 """Information Extractor for YouTube channels."""
1627 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1628 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1629 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1630 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1631 IE_NAME = u'youtube:channel'
1633 def extract_videos_from_page(self, page):
# Collect unique 11-char-style video ids from watch?v= hrefs, preserving
# first-seen order.
1635 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1636 if mobj.group(1) not in ids_in_page:
1637 ids_in_page.append(mobj.group(1))
1640 def _real_extract(self, url):
1641 # Extract channel id
1642 mobj = re.match(self._VALID_URL, url)
1644 raise ExtractorError(u'Invalid URL: %s' % url)
1646 # Download channel page
1647 channel_id = mobj.group(1)
1651 url = self._TEMPLATE_URL % (channel_id, pagenum)
1652 page = self._download_webpage(url, channel_id,
1653 u'Downloading page #%s' % pagenum)
1655 # Extract video identifiers
1656 ids_in_page = self.extract_videos_from_page(page)
1657 video_ids.extend(ids_in_page)
1659 # Download any subsequent channel pages using the json-based channel_ajax query
1660 if self._MORE_PAGES_INDICATOR in page:
1662 pagenum = pagenum + 1
1664 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1665 page = self._download_webpage(url, channel_id,
1666 u'Downloading page #%s' % pagenum)
# Ajax responses are JSON; the HTML fragment lives in 'content_html'.
1668 page = json.loads(page)
1670 ids_in_page = self.extract_videos_from_page(page['content_html'])
1671 video_ids.extend(ids_in_page)
# Stop when the widget HTML no longer offers a "load more" control.
1673 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1676 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1678 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1679 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1680 return [self.playlist_result(url_entries, channel_id)]
1683 class YoutubeUserIE(InfoExtractor):
# Extractor for YouTube user upload feeds via the GData API, paging
# _GDATA_PAGE_SIZE ids at a time until a short page is seen.
# NOTE(review): elided numbered listing — pagenum/video_ids/ids_in_page
# initialization and loop/break lines are missing between visible lines.
1684 """Information Extractor for YouTube users."""
1686 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1687 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1688 _GDATA_PAGE_SIZE = 50
1689 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1690 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1691 IE_NAME = u'youtube:user'
1693 def _real_extract(self, url):
1695 mobj = re.match(self._VALID_URL, url)
1697 raise ExtractorError(u'Invalid URL: %s' % url)
1699 username = mobj.group(1)
1701 # Download video ids using YouTube Data API. Result size per
1702 # query is limited (currently to 50 videos) so we need to query
1703 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1710 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1712 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1713 page = self._download_webpage(gdata_url, username,
1714 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1716 # Extract video identifiers
# De-duplicate ids within a page while preserving order.
1719 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1720 if mobj.group(1) not in ids_in_page:
1721 ids_in_page.append(mobj.group(1))
1723 video_ids.extend(ids_in_page)
1725 # A little optimization - if current page is not
1726 # "full", ie. does not contain PAGE_SIZE video ids then
1727 # we can assume that this page is the last one - there
1728 # are no more ids on further pages - no need to query
1731 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1736 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1737 url_results = [self.url_result(url, 'Youtube') for url in urls]
1738 return [self.playlist_result(url_results, playlist_title = username)]
1741 class BlipTVUserIE(InfoExtractor):
# Extractor for blip.tv user pages: resolves the numeric users_id from
# the mobile page, then pages the episode-list Ajax endpoint.
# NOTE(review): elided numbered listing — pagenum/video_ids/ids_in_page
# initialization, _PAGE_SIZE, and loop/break lines are missing here.
1742 """Information Extractor for blip.tv users."""
1744 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1746 IE_NAME = u'blip.tv:user'
1748 def _real_extract(self, url):
1750 mobj = re.match(self._VALID_URL, url)
1752 raise ExtractorError(u'Invalid URL: %s' % url)
1754 username = mobj.group(1)
1756 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1758 page = self._download_webpage(url, username, u'Downloading user page')
# The numeric account id is embedded as a data attribute on the page.
1759 mobj = re.search(r'data-users-id="([^"]+)"', page)
1760 page_base = page_base % mobj.group(1)
1763 # Download video ids using BlipTV Ajax calls. Result size per
1764 # query is limited (currently to 12 videos) so we need to query
1765 # page by page until there are no video ids - it means we got
1772 url = page_base + "&page=" + str(pagenum)
1773 page = self._download_webpage(url, username,
1774 u'Downloading video ids from page %d' % pagenum)
1776 # Extract video identifiers
# Every same-site href is treated as a candidate episode slug; duplicates
# are skipped and HTML entities unescaped.
1779 for mobj in re.finditer(r'href="/([^"]+)"', page):
1780 if mobj.group(1) not in ids_in_page:
1781 ids_in_page.append(unescapeHTML(mobj.group(1)))
1783 video_ids.extend(ids_in_page)
1785 # A little optimization - if current page is not
1786 # "full", ie. does not contain PAGE_SIZE video ids then
1787 # we can assume that this page is the last one - there
1788 # are no more ids on further pages - no need to query
1791 if len(ids_in_page) < self._PAGE_SIZE:
1796 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1797 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1798 return [self.playlist_result(url_entries, playlist_title = username)]
1801 class DepositFilesIE(InfoExtractor):
# Extractor for depositfiles.com download pages: simulates pressing the
# "Free download" button and scrapes the resulting form action URL.
# NOTE(review): elided numbered listing — `try:` openers and the result
# dict's opening lines are missing between visible lines.
1802 """Information extractor for depositfiles.com"""
1804 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1806 def _real_extract(self, url):
1807 file_id = url.split('/')[-1]
1808 # Rebuild url in english locale
# Forces the /en/ locale so the scraping regexes match English markup.
1809 url = 'http://depositfiles.com/en/files/' + file_id
1811 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 mimics the free-download form submission.
1812 free_download_indication = { 'gateway_result' : '1' }
1813 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1815 self.report_download_webpage(file_id)
1816 webpage = compat_urllib_request.urlopen(request).read()
1817 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1818 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1820 # Search for the real file URL
1821 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1822 if (mobj is None) or (mobj.group(1) is None):
1823 # Try to figure out reason of the error.
# Surface the site's own restriction message (e.g. rate limits) when the
# download link is absent.
1824 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1825 if (mobj is not None) and (mobj.group(1) is not None):
1826 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1827 raise ExtractorError(u'%s' % restriction_message)
1829 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1831 file_url = mobj.group(1)
1832 file_extension = os.path.splitext(file_url)[1][1:]
1834 # Search for file title
1835 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# Result dict; .decode('utf-8') implies Python 2 byte strings here.
1838 'id': file_id.decode('utf-8'),
1839 'url': file_url.decode('utf-8'),
1841 'upload_date': None,
1842 'title': file_title,
1843 'ext': file_extension.decode('utf-8'),
1847 class FacebookIE(InfoExtractor):
# Extractor for Facebook videos; supports optional login via downloader
# params or .netrc before extraction.
# NOTE(review): elided numbered listing — the login form construction,
# `try:` openers, and parts of the result dict are missing from view.
1848 """Information Extractor for Facebook"""
1850 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1851 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1852 _NETRC_MACHINE = 'facebook'
1853 IE_NAME = u'facebook'
1855 def report_login(self):
1856 """Report attempt to log in."""
1857 self.to_screen(u'Logging in')
1859 def _real_initialize(self):
# Best-effort login: failures only emit warnings, extraction proceeds.
1860 if self._downloader is None:
1865 downloader_params = self._downloader.params
1867 # Attempt to use provided username and password or .netrc data
1868 if downloader_params.get('username', None) is not None:
1869 useremail = downloader_params['username']
1870 password = downloader_params['password']
1871 elif downloader_params.get('usenetrc', False):
1873 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1874 if info is not None:
# (elided) else-branch raise for missing .netrc entry
1878 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1879 except (IOError, netrc.NetrcParseError) as err:
1880 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1883 if useremail is None:
1892 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1895 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means the login failed.
1896 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1897 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1899 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1900 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1903 def _real_extract(self, url):
1904 mobj = re.match(self._VALID_URL, url)
1906 raise ExtractorError(u'Invalid URL: %s' % url)
1907 video_id = mobj.group('ID')
1909 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1910 webpage = self._download_webpage(url, video_id)
# The player parameters are JSON wedged between two known JS fragments.
1912 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1913 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1914 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1916 raise ExtractorError(u'Cannot parse data')
1917 data = dict(json.loads(m.group(1)))
1918 params_raw = compat_urllib_parse.unquote(data['params'])
1919 params = json.loads(params_raw)
1920 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD (guard lines elided).
1921 video_url = video_data.get('hd_src')
1923 video_url = video_data['sd_src']
1925 raise ExtractorError(u'Cannot find video URL')
1926 video_duration = int(video_data['video_duration'])
1927 thumbnail = video_data['thumbnail_src']
1929 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# Result dict (opening lines elided from this listing).
1934 'title': video_title,
1937 'duration': video_duration,
1938 'thumbnail': thumbnail,
# NOTE(review): interior lines of this class are elided in this excerpt
# (try/except openers, "if mobj is None:" guards, dict openers are missing).
1943 class BlipTVIE(InfoExtractor):
1944 """Information extractor for blip.tv"""
1946 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Regex used to pull the filename extension out of a media URL.
1947 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1948 IE_NAME = u'blip.tv'
1950 def report_direct_download(self, title):
1951 """Report information extraction."""
1952 self.to_screen(u'%s: Direct download detected' % title)
1954 def _real_extract(self, url):
1955 mobj = re.match(self._VALID_URL, url)
1957 raise ExtractorError(u'Invalid URL: %s' % url)
1959 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf fragment URLs are rewritten to the equivalent /play/ URL first.
1960 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1961 if api_mobj is not None:
1962 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1963 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id is recovered from the redirect's
# fragment query string, then extraction restarts recursively on the a- URL.
1964 if urlp.path.startswith('/play/'):
1965 request = compat_urllib_request.Request(url)
1966 response = compat_urllib_request.urlopen(request)
1967 redirecturl = response.geturl()
1968 rurlp = compat_urllib_parse_urlparse(redirecturl)
1969 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1970 url = 'http://blip.tv/a/a-' + file_id
1971 return self._real_extract(url)
# Ask blip.tv for a JSON rendition of the page metadata.
1978 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1979 request = compat_urllib_request.Request(json_url)
# blip.tv serves different content per user agent; iTunes UA gets clean data.
1980 request.add_header('User-Agent', 'iTunes/10.6.1')
1981 self.report_extraction(mobj.group(1))
1984 urlh = compat_urllib_request.urlopen(request)
# If the server returns the media itself rather than JSON, synthesize
# the info dict straight from the URL's basename.
1985 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1986 basename = url.split('/')[-1]
1987 title,ext = os.path.splitext(basename)
1988 title = title.decode('UTF-8')
1989 ext = ext.replace('.', '')
1990 self.report_direct_download(title)
1995 'upload_date': None,
2000 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2001 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
# Regular (non-direct) case: parse the JSON metadata document.
2002 if info is None: # Regular URL
2004 json_code_bytes = urlh.read()
2005 json_code = json_code_bytes.decode('utf-8')
2006 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2007 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2010 json_data = json.loads(json_code)
# Some responses wrap the payload under a 'Post' key.
2011 if 'Post' in json_data:
2012 data = json_data['Post']
# blip.tv dates look like "m-d-y H:M(am|pm)"; normalize to YYYYMMDD.
2016 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2017 video_url = data['media']['url']
2018 umobj = re.match(self._URL_EXT, video_url)
2020 raise ValueError('Can not determine filename extension')
2021 ext = umobj.group(1)
2024 'id': data['item_id'],
2026 'uploader': data['display_name'],
2027 'upload_date': upload_date,
2028 'title': data['title'],
2030 'format': data['media']['mimeType'],
2031 'thumbnail': data['thumbnailUrl'],
2032 'description': data['description'],
2033 'player_url': data['embedUrl'],
# Downloader must reuse the same UA or the media request may be refused.
2034 'user_agent': 'iTunes/10.6.1',
2036 except (ValueError,KeyError) as err:
2037 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): interior lines of this class are elided in this excerpt
# (loop headers, guards and several assignments are missing).
2042 class MyVideoIE(InfoExtractor):
2043 """Information Extractor for myvideo.de."""
2045 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2046 IE_NAME = u'myvideo'
2048 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2049 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2050 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher (KSA below, PRGA elided) used to decrypt the site's
# encrypted player XML.
2051 def __rc4crypt(self,data, key):
2053 box = list(range(256))
# Key-scheduling algorithm: permute the 256-entry box with the key bytes.
2054 for i in list(range(256)):
2055 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2056 box[i], box[x] = box[x], box[i]
2062 y = (y + box[x]) % 256
2063 box[x], box[y] = box[y], box[x]
# XOR each input byte against the keystream byte.
2064 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# MD5 helper: hex digest of s, re-encoded to bytes for use as an RC4 key part.
2068 return hashlib.md5(s).hexdigest().encode()
2070 def _real_extract(self,url):
2071 mobj = re.match(self._VALID_URL, url)
2073 raise ExtractorError(u'invalid URL: %s' % url)
2075 video_id = mobj.group(1)
# GK: doubly-base64-encoded site key material fed into the decryption key.
2078 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2079 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2080 b'TnpsbA0KTVRkbU1tSTRNdz09'
2084 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2085 webpage = self._download_webpage(webpage_url, video_id)
# Easy path: a plain <source> tag in the page gives a direct FLV URL.
2087 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2088 if mobj is not None:
2089 self.report_extraction(video_id)
2090 video_url = mobj.group(1) + '.flv'
2092 video_title = self._html_search_regex('<title>([^<]+)</title>',
2095 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2101 'upload_date': None,
2102 'title': video_title,
# Hard path: parse the flashvars blob to locate the encrypted XML endpoint.
2107 mobj = re.search('var flashvars={(.+?)}', webpage)
2109 raise ExtractorError(u'Unable to extract video')
2114 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
# '_encxml' holds the (URL-encoded) encrypted-XML URL; other pairs are params.
2115 if not a == '_encxml':
2118 encxml = compat_urllib_parse.unquote(b)
2119 if not params.get('domain'):
2120 params['domain'] = 'www.myvideo.de'
2121 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is rewritten to the plain player XML endpoint.
2122 if 'flash_playertype=MTV' in xmldata_url:
2123 self._downloader.report_warning(u'avoiding MTV player')
2125 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2126 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The response is "something=<hex>"; take the hex half and RC4-decrypt it.
2130 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2131 enc_data_b = binascii.unhexlify(enc_data)
2133 base64.b64decode(base64.b64decode(GK)) +
2135 str(video_id).encode('utf-8')
2138 dec_data = self.__rc4crypt(enc_data_b, sk)
2141 self.report_extraction(video_id)
# RTMP case: connection URL is embedded in the decrypted XML.
2144 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2146 video_url = compat_urllib_parse.unquote(mobj.group(1))
2147 if 'myvideo2flash' in video_url:
2148 self._downloader.report_warning(u'forcing RTMPT ...')
2149 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2152 # extract non rtmp videos
2153 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2155 raise ExtractorError(u'unable to extract url')
2156 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2158 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2159 video_file = compat_urllib_parse.unquote(video_file)
# For non-f4m files, derive the RTMP playpath ("prefix:path"); for f4m,
# build the matching HLS (.m3u8) playlist URL instead.
2161 if not video_file.endswith('f4m'):
2162 ppath, prefix = video_file.split('.')
2163 video_playpath = '%s:%s' % (prefix, ppath)
2164 video_hls_playlist = ''
2167 video_hls_playlist = (
2168 video_filepath + video_file
2169 ).replace('.f4m', '.m3u8')
# The SWF player URL is needed by rtmpdump as player_url.
2171 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2172 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2174 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2180 'tc_url': video_url,
2182 'upload_date': None,
2183 'title': video_title,
2185 'play_path': video_playpath,
2186 'video_file': video_file,
2187 'video_hls_playlist': video_hls_playlist,
2188 'player_url': video_swfobj,
# NOTE(review): interior lines of this class are elided in this excerpt
# (format tables, several guards and loop bodies are missing).
2192 class ComedyCentralIE(InfoExtractor):
2193 """Information extractor for The Daily Show and Colbert Report """
2195 # urls can be abbreviations like :thedailyshow or :colbert
2196 # urls for episodes like:
2197 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2198 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2199 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex; note it must always be matched with re.VERBOSE (see suitable()).
2200 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2201 |(https?://)?(www\.)?
2202 (?P<showname>thedailyshow|colbertnation)\.com/
2203 (full-episodes/(?P<episode>.*)|
2205 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2206 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; the last entry is the highest quality.
2209 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2211 _video_extensions = {
2219 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the re.VERBOSE flag.
2229 def suitable(cls, url):
2230 """Receives a URL and returns True if suitable for this IE."""
2231 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Print the known bitrate/extension/dimension table for --list-formats.
2233 def _print_formats(self, formats):
2234 print('Available formats:')
2236 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2239 def _real_extract(self, url):
2240 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2242 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortnames like ":tds" expand to the show's full-episodes page.
2244 if mobj.group('shortname'):
2245 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2246 url = u'http://www.thedailyshow.com/full-episodes/'
2248 url = u'http://www.colbertnation.com/full-episodes/'
2249 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2250 assert mobj is not None
2252 if mobj.group('clip'):
2253 if mobj.group('showname') == 'thedailyshow':
2254 epTitle = mobj.group('tdstitle')
2256 epTitle = mobj.group('cntitle')
2259 dlNewest = not mobj.group('episode')
2261 epTitle = mobj.group('showname')
2263 epTitle = mobj.group('episode')
2265 self.report_extraction(epTitle)
2266 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Follow server-side redirects and re-match so the episode group is filled in.
2268 url = htmlHandle.geturl()
2269 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2271 raise ExtractorError(u'Invalid redirected URL: ' + url)
2272 if mobj.group('episode') == '':
2273 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2274 epTitle = mobj.group('episode')
# Find the mtvnservices media URI embedded in the page.
2276 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2278 if len(mMovieParams) == 0:
2279 # The Colbert Report embeds the information in a without
2280 # a URL prefix; so extract the alternate reference
2281 # and then add the URL prefix manually.
2283 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2284 if len(altMovieParams) == 0:
2285 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2287 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Fetch the MRSS index listing every part of the episode.
2289 uri = mMovieParams[0][1]
2290 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2291 indexXml = self._download_webpage(indexUrl, epTitle,
2292 u'Downloading show index',
2293 u'unable to download episode index')
2297 idoc = xml.etree.ElementTree.fromstring(indexXml)
2298 itemEls = idoc.findall('.//item')
# One <item> per episode part; guid looks like "...:<show>.com:<shortMediaId>".
2299 for partNum,itemEl in enumerate(itemEls):
2300 mediaId = itemEl.findall('./guid')[0].text
2301 shortMediaId = mediaId.split(':')[-1]
2302 showId = mediaId.split(':')[-2].replace('.com', '')
2303 officialTitle = itemEl.findall('./title')[0].text
2304 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Per-part config XML lists the available renditions (bitrate -> rtmp url).
2306 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2307 compat_urllib_parse.urlencode({'uri': mediaId}))
2308 configXml = self._download_webpage(configUrl, epTitle,
2309 u'Downloading configuration for %s' % shortMediaId)
2311 cdoc = xml.etree.ElementTree.fromstring(configXml)
2313 for rendition in cdoc.findall('.//rendition'):
2314 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2318 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2321 if self._downloader.params.get('listformats', None):
2322 self._print_formats([i[0] for i in turls])
2325 # For now, just pick the highest bitrate
2326 format,rtmp_video_url = turls[-1]
2328 # Get the format arg from the arg stream
2329 req_format = self._downloader.params.get('format', None)
2331 # Select format if we can find one
2334 format, rtmp_video_url = f, v
# The rtmp URL's path is rewritten into a direct-HTTP mirror URL.
2337 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2339 raise ExtractorError(u'Cannot transform RTMP url')
2340 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2341 video_url = base + m.group('finalid')
2343 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2348 'upload_date': officialDate,
2353 'description': officialTitle,
2355 results.append(info)
# NOTE(review): interior lines of this class are elided in this excerpt
# (guards and the info-dict opener are missing).
2360 class EscapistIE(InfoExtractor):
2361 """Information extractor for The Escapist """
2363 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2364 IE_NAME = u'escapist'
2366 def _real_extract(self, url):
2367 mobj = re.match(self._VALID_URL, url)
2369 raise ExtractorError(u'Invalid URL: %s' % url)
2370 showName = mobj.group('showname')
2371 videoId = mobj.group('episode')
2373 self.report_extraction(videoId)
2374 webpage = self._download_webpage(url, videoId)
# Description and thumbnail come from meta tags; both are optional.
2376 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2377 webpage, u'description', fatal=False)
2379 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2380 webpage, u'thumbnail', fatal=False)
# og:video points at the flash player; its query string carries the config URL.
2382 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2383 webpage, u'player url')
# The title meta is "Show : Episode"; keep only the episode part.
2385 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2386 webpage, u'player url').split(' : ')[-1]
2388 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2389 configUrl = compat_urllib_parse.unquote(configUrl)
2391 configJSON = self._download_webpage(configUrl, videoId,
2392 u'Downloading configuration',
2393 u'unable to download configuration')
2395 # Technically, it's JavaScript, not JSON
# Crude single-to-double-quote fixup so json.loads can parse the JS object.
2396 configJSON = configJSON.replace("'", '"')
2399 config = json.loads(configJSON)
2400 except (ValueError,) as err:
2401 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2403 playlist = config['playlist']
# The media URL is the second playlist entry (the first is presumably an
# intro/ad — TODO confirm).
2404 videoUrl = playlist[1]['url']
2409 'uploader': showName,
2410 'upload_date': None,
2413 'thumbnail': imgUrl,
2414 'description': videoDesc,
2415 'player_url': playerUrl,
# NOTE(review): interior lines of this class are elided in this excerpt
# (try openers, guards and the return statement are missing).
2420 class CollegeHumorIE(InfoExtractor):
2421 """Information extractor for collegehumor.com"""
2424 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2425 IE_NAME = u'collegehumor'
2427 def report_manifest(self, video_id):
2428 """Report information extraction."""
2429 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2431 def _real_extract(self, url):
2432 mobj = re.match(self._VALID_URL, url)
2434 raise ExtractorError(u'Invalid URL: %s' % url)
2435 video_id = mobj.group('videoid')
2440 'upload_date': None,
2443 self.report_extraction(video_id)
# Step 1: fetch the moogaloop metadata XML for title/description/manifest URL.
2444 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2446 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2447 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2448 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2450 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2452 videoNode = mdoc.findall('./video')[0]
2453 info['description'] = videoNode.findall('./description')[0].text
2454 info['title'] = videoNode.findall('./caption')[0].text
2455 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2456 manifest_url = videoNode.findall('./file')[0].text
2458 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required for the Adobe HDS manifest to be served.
2460 manifest_url += '?hdcore=2.10.3'
2461 self.report_manifest(video_id)
# Step 2: fetch the f4m manifest and pull the media node out of it.
2463 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2464 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2465 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2467 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Elements live in the Adobe f4m XML namespace.
2469 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2470 node_id = media_node.attrib['url']
2471 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2472 except IndexError as err:
2473 raise ExtractorError(u'Invalid manifest file')
# Step 3: assemble the direct fragment URL from the manifest host + ids.
2475 url_pr = compat_urllib_parse_urlparse(manifest_url)
2476 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): interior lines of this class are elided in this excerpt
# (the "if mobj is None:" guard and the info-dict opener are missing).
2483 class XVideosIE(InfoExtractor):
2484 """Information extractor for xvideos.com"""
2486 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2487 IE_NAME = u'xvideos'
2489 def _real_extract(self, url):
2490 mobj = re.match(self._VALID_URL, url)
2492 raise ExtractorError(u'Invalid URL: %s' % url)
2493 video_id = mobj.group(1)
2495 webpage = self._download_webpage(url, video_id)
2497 self.report_extraction(video_id)
# The direct FLV URL is URL-encoded inside the page's flashvars.
2500 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2501 webpage, u'video URL'))
# Title is the <title> tag minus the trailing site suffix.
2504 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2507 # Extract video thumbnail
2508 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2509 webpage, u'thumbnail', fatal=False)
2515 'upload_date': None,
2516 'title': video_title,
2518 'thumbnail': video_thumbnail,
2519 'description': None,
# NOTE(review): interior lines of this class are elided in this excerpt
# (the "if mobj is None:" guard and the info-dict opener are missing).
2525 class SoundcloudIE(InfoExtractor):
2526 """Information extractor for soundcloud.com
2527 To access the media, the uid of the song and a stream token
2528 must be extracted from the page source and the script must make
2529 a request to media.soundcloud.com/crossdomain.xml. Then
2530 the media can be grabbed by requesting from an url composed
2531 of the stream token and uid
2534 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2535 IE_NAME = u'soundcloud'
2537 def report_resolve(self, video_id):
2538 """Report information extraction."""
2539 self.to_screen(u'%s: Resolving id' % video_id)
2541 def _real_extract(self, url):
2542 mobj = re.match(self._VALID_URL, url)
2544 raise ExtractorError(u'Invalid URL: %s' % url)
2546 # extract uploader (which is in the url)
2547 uploader = mobj.group(1)
2548 # extract simple title (uploader + slug of song title)
2549 slug_title = mobj.group(2)
2550 simple_title = uploader + u'-' + slug_title
2551 full_title = '%s/%s' % (uploader, slug_title)
2553 self.report_resolve(full_title)
# Step 1: the resolve API maps the human URL to the numeric track id.
2555 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2556 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2557 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2559 info = json.loads(info_json)
2560 video_id = info['id']
2561 self.report_extraction(full_title)
# Step 2: the streams endpoint returns the actual media URLs for the track.
2563 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2564 stream_json = self._download_webpage(streams_url, full_title,
2565 u'Downloading stream definitions',
2566 u'unable to download stream definitions')
2568 streams = json.loads(stream_json)
# Only the 128kbps MP3 stream is used.
2569 mediaURL = streams['http_mp3_128_url']
2570 upload_date = unified_strdate(info['created_at'])
2575 'uploader': info['user']['username'],
2576 'upload_date': upload_date,
2577 'title': info['title'],
2579 'description': info['description'],
# NOTE(review): interior lines of this class are elided in this excerpt
# (guards, the per-track info-dict opener and the return are missing).
2582 class SoundcloudSetIE(InfoExtractor):
2583 """Information extractor for soundcloud.com sets
2584 To access the media, the uid of the song and a stream token
2585 must be extracted from the page source and the script must make
2586 a request to media.soundcloud.com/crossdomain.xml. Then
2587 the media can be grabbed by requesting from an url composed
2588 of the stream token and uid
2591 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2592 IE_NAME = u'soundcloud:set'
2594 def report_resolve(self, video_id):
2595 """Report information extraction."""
2596 self.to_screen(u'%s: Resolving id' % video_id)
2598 def _real_extract(self, url):
2599 mobj = re.match(self._VALID_URL, url)
2601 raise ExtractorError(u'Invalid URL: %s' % url)
2603 # extract uploader (which is in the url)
2604 uploader = mobj.group(1)
2605 # extract simple title (uploader + slug of song title)
2606 slug_title = mobj.group(2)
2607 simple_title = uploader + u'-' + slug_title
2608 full_title = '%s/sets/%s' % (uploader, slug_title)
2610 self.report_resolve(full_title)
# Resolve the set URL to its JSON metadata (list of tracks).
2612 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2613 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2614 info_json = self._download_webpage(resolv_url, full_title)
2617 info = json.loads(info_json)
# The resolve API reports failures via an 'errors' list in the payload.
2618 if 'errors' in info:
2619 for err in info['errors']:
2620 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2623 self.report_extraction(full_title)
# One info dict per track in the set, mirroring SoundcloudIE's logic.
2624 for track in info['tracks']:
2625 video_id = track['id']
2627 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2628 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2630 self.report_extraction(video_id)
2631 streams = json.loads(stream_json)
2632 mediaURL = streams['http_mp3_128_url']
2637 'uploader': track['user']['username'],
2638 'upload_date': unified_strdate(track['created_at']),
2639 'title': track['title'],
2641 'description': track['description'],
# NOTE(review): interior lines of this class are elided in this excerpt
# (guards and the info-dict opener are missing).
2646 class InfoQIE(InfoExtractor):
2647 """Information extractor for infoq.com"""
2648 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2650 def _real_extract(self, url):
2651 mobj = re.match(self._VALID_URL, url)
2653 raise ExtractorError(u'Invalid URL: %s' % url)
# No numeric id in the URL, so the URL itself is used as the video id.
2655 webpage = self._download_webpage(url, video_id=url)
2656 self.report_extraction(url)
# The page embeds a base64-encoded, URL-quoted media path in "jsclassref".
2659 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2661 raise ExtractorError(u'Unable to extract video url')
2662 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2663 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2666 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2669 # Extract description
2670 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2671 webpage, u'description', fatal=False)
# Derive id/extension from the media filename at the end of the RTMP path.
2673 video_filename = video_url.split('/')[-1]
2674 video_id, extension = video_filename.split('.')
2680 'upload_date': None,
2681 'title': video_title,
2682 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2684 'description': video_description,
# NOTE(review): interior lines of this class are elided in this excerpt
# (try openers, guards, loop headers and some returns are missing). Also note
# several .decode('utf-8') calls here look Python-2-only — TODO confirm;
# the class is already marked _WORKING = False.
2689 class MixcloudIE(InfoExtractor):
2690 """Information extractor for www.mixcloud.com"""
2692 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2693 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2694 IE_NAME = u'mixcloud'
2696 def report_download_json(self, file_id):
2697 """Report JSON download."""
2698 self.to_screen(u'Downloading json')
2700 def get_urls(self, jsonData, fmt, bitrate='best'):
2701 """Get urls from 'audio_formats' section in json"""
# Formats may map bitrate -> url list, or be a flat url list (no bitrates).
2704 bitrate_list = jsonData[fmt]
2705 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2706 bitrate = max(bitrate_list) # select highest
2708 url_list = jsonData[fmt][bitrate]
2709 except TypeError: # we have no bitrate info.
2710 url_list = jsonData[fmt]
2713 def check_urls(self, url_list):
2714 """Returns 1st active url from list"""
# Probe each candidate with a request; the first that opens wins.
2715 for url in url_list:
2717 compat_urllib_request.urlopen(url)
2719 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Print the format table for --list-formats; handles both shapes of the
# formats mapping (with and without bitrate sub-keys).
2724 def _print_formats(self, formats):
2725 print('Available formats:')
2726 for fmt in formats.keys():
2727 for b in formats[fmt]:
2729 ext = formats[fmt][b][0]
2730 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2731 except TypeError: # we have no bitrate info
2732 ext = formats[fmt][0]
2733 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2736 def _real_extract(self, url):
2737 mobj = re.match(self._VALID_URL, url)
2739 raise ExtractorError(u'Invalid URL: %s' % url)
2740 # extract uploader & filename from url
2741 uploader = mobj.group(1).decode('utf-8')
2742 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2744 # construct API request
# API path reuses the last two URL path segments (uploader/track).
2745 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2746 # retrieve .json file with links to files
2747 request = compat_urllib_request.Request(file_url)
2749 self.report_download_json(file_url)
2750 jsonData = compat_urllib_request.urlopen(request).read()
2751 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2752 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2755 json_data = json.loads(jsonData)
2756 player_url = json_data['player_swf_url']
2757 formats = dict(json_data['audio_formats'])
2759 req_format = self._downloader.params.get('format', None)
2762 if self._downloader.params.get('listformats', None):
2763 self._print_formats(formats)
# Default/best: iterate formats until one yields a live URL.
2766 if req_format is None or req_format == 'best':
2767 for format_param in formats.keys():
2768 url_list = self.get_urls(formats, format_param)
2770 file_url = self.check_urls(url_list)
2771 if file_url is not None:
# Explicit format request: fail clearly if the format isn't offered.
2774 if req_format not in formats:
2775 raise ExtractorError(u'Format is not available')
2777 url_list = self.get_urls(formats, req_format)
2778 file_url = self.check_urls(url_list)
2779 format_param = req_format
2782 'id': file_id.decode('utf-8'),
2783 'url': file_url.decode('utf-8'),
2784 'uploader': uploader.decode('utf-8'),
2785 'upload_date': None,
2786 'title': json_data['name'],
2787 'ext': file_url.split('.')[-1].decode('utf-8'),
2788 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2789 'thumbnail': json_data['thumbnail_url'],
2790 'description': json_data['description'],
2791 'player_url': player_url.decode('utf-8'),
# NOTE(review): interior lines of this class are elided in this excerpt
# (info-dict openers, try openers and several returns are missing).
2794 class StanfordOpenClassroomIE(InfoExtractor):
2795 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: root page, a course page, or a specific video page.
2797 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2798 IE_NAME = u'stanfordoc'
2800 def _real_extract(self, url):
2801 mobj = re.match(self._VALID_URL, url)
2803 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: course + video present -> extract that single video.
2805 if mobj.group('course') and mobj.group('video'): # A specific video
2806 course = mobj.group('course')
2807 video = mobj.group('video')
2809 'id': course + '_' + video,
2811 'upload_date': None,
2814 self.report_extraction(info['id'])
# Per-video XML sidecar gives title and media filename.
2815 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2816 xmlUrl = baseUrl + video + '.xml'
2818 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2819 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2820 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2821 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2823 info['title'] = mdoc.findall('./title')[0].text
2824 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2826 raise ExtractorError(u'Invalid metadata XML file')
2827 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: only a course -> collect every VideoPage link and recurse.
2829 elif mobj.group('course'): # A course page
2830 course = mobj.group('course')
2835 'upload_date': None,
2838 coursepage = self._download_webpage(url, info['id'],
2839 note='Downloading course info page',
2840 errnote='Unable to download course info page')
2842 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2844 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2845 coursepage, u'description', fatal=False)
# orderedSet dedupes while preserving discovery order.
2847 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2850 'type': 'reference',
2851 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Each reference entry is re-dispatched through self.extract().
2855 for entry in info['list']:
2856 assert entry['type'] == 'reference'
2857 results += self.extract(entry['url'])
# Case 3: root page -> collect every CoursePage link and recurse.
2861 'id': 'Stanford OpenClassroom',
2864 'upload_date': None,
2867 self.report_download_webpage(info['id'])
2868 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2870 rootpage = compat_urllib_request.urlopen(rootURL).read()
2871 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2872 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2874 info['title'] = info['id']
2876 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2879 'type': 'reference',
2880 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2885 for entry in info['list']:
2886 assert entry['type'] == 'reference'
2887 results += self.extract(entry['url'])
# NOTE(review): interior lines of this class are elided in this excerpt
# (guards, a performer extraction and the info-dict opener are missing).
2890 class MTVIE(InfoExtractor):
2891 """Information extractor for MTV.com"""
2893 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2896 def _real_extract(self, url):
2897 mobj = re.match(self._VALID_URL, url)
2899 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme-less URLs are accepted by _VALID_URL; normalize to http://.
2900 if not mobj.group('proto'):
2901 url = 'http://' + url
2902 video_id = mobj.group('videoid')
2904 webpage = self._download_webpage(url, video_id)
# Metadata comes from mtv_* meta tags in the page; most are optional.
2906 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2907 webpage, u'song name', fatal=False)
2909 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2912 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2913 webpage, u'mtvn_uri', fatal=False)
2915 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2916 webpage, u'content id', fatal=False)
# mediaGen endpoint returns XML listing the available renditions.
2918 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2919 self.report_extraction(video_id)
2920 request = compat_urllib_request.Request(videogen_url)
2922 metadataXml = compat_urllib_request.urlopen(request).read()
2923 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2924 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2926 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2927 renditions = mdoc.findall('.//rendition')
2929 # For now, always pick the highest quality.
2930 rendition = renditions[-1]
# Format string combines container, dimensions and bitrate, e.g. "mp4-640x360_800".
2933 _,_,ext = rendition.attrib['type'].partition('/')
2934 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2935 video_url = rendition.find('./src').text
2937 raise ExtractorError('Invalid rendition field.')
2942 'uploader': performer,
2943 'upload_date': None,
2944 'title': video_title,
# NOTE(review): interior lines of this class are elided in this excerpt
# (method headers such as _gen_sid, guards, format-selection branches and
# the final return are missing).
2952 class YoukuIE(InfoExtractor):
2953 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp concatenated with two random numbers.
2956 nowTime = int(time.time() * 1000)
2957 random1 = random.randint(1000,1998)
2958 random2 = random.randint(1000,9999)
2960 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of a fixed alphabet driven by the server 'seed';
# used as a lookup table to decode obfuscated file ids.
2962 def _get_file_ID_mix_string(self, seed):
2964 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2966 for i in range(len(source)):
2967 seed = (seed * 211 + 30031 ) % 65536
2968 index = math.floor(seed / 65536 * len(source) )
2969 mixed.append(source[int(index)])
2970 source.remove(source[int(index)])
2971 #return ''.join(mixed)
# Decode a '*'-separated list of indices into the real file id via the
# seed-shuffled alphabet above.
2974 def _get_file_id(self, fileId, seed):
2975 mixed = self._get_file_ID_mix_string(seed)
2976 ids = fileId.split('*')
2980 realId.append(mixed[int(ch)])
2981 return ''.join(realId)
2983 def _real_extract(self, url):
2984 mobj = re.match(self._VALID_URL, url)
2986 raise ExtractorError(u'Invalid URL: %s' % url)
2987 video_id = mobj.group('ID')
2989 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2991 jsondata = self._download_webpage(info_url, video_id)
2993 self.report_extraction(video_id)
2995 config = json.loads(jsondata)
2997 video_title = config['data'][0]['title']
2998 seed = config['data'][0]['seed']
# Pick a stream format: honor --format, else best (hd2 preferred) / worst.
3000 format = self._downloader.params.get('format', None)
3001 supported_format = list(config['data'][0]['streamfileids'].keys())
3003 if format is None or format == 'best':
3004 if 'hd2' in supported_format:
3009 elif format == 'worst':
# Segment keys ('k') authorize each download URL.
3017 fileid = config['data'][0]['streamfileids'][format]
3018 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3019 except (UnicodeDecodeError, ValueError, KeyError):
3020 raise ExtractorError(u'Unable to extract info section')
3023 sid = self._gen_sid()
3024 fileid = self._get_file_id(fileid, seed)
3026 #column 8,9 of fileid represent the segment number
3027 #fileid[7:9] should be changed
# Build one download URL (and info dict) per segment of the video.
3028 for index, key in enumerate(keys):
3030 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3031 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3034 'id': '%s_part%02d' % (video_id, index),
3035 'url': download_url,
3037 'upload_date': None,
3038 'title': video_title,
3041 files_info.append(info)
# Extractor for xnxx.com: flv URL, title and thumbnail are scraped
# straight from the watch page with the class-level regexes below.
# NOTE(review): elided numbered listing -- some source lines are missing.
3046 class XNXXIE(InfoExtractor):
3047 """Information extractor for xnxx.com"""
3049 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3051 VIDEO_URL_RE = r'flv_url=(.*?)&'
3052 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3053 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3055 def _real_extract(self, url):
3056 mobj = re.match(self._VALID_URL, url)
3058 raise ExtractorError(u'Invalid URL: %s' % url)
3059 video_id = mobj.group(1)
3061 # Get webpage content
3062 webpage = self._download_webpage(url, video_id)
# flv_url is percent-encoded inside the page, hence the unquote below.
3064 video_url = self._search_regex(self.VIDEO_URL_RE,
3065 webpage, u'video URL')
3066 video_url = compat_urllib_parse.unquote(video_url)
3068 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
# Thumbnail is optional (fatal=False): None if the regex does not match.
3071 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3072 webpage, u'thumbnail', fatal=False)
3078 'upload_date': None,
3079 'title': video_title,
3081 'thumbnail': video_thumbnail,
3082 'description': None,
# NOTE(review): elided numbered listing -- some source lines are missing.
3086 class GooglePlusIE(InfoExtractor):
3087 """Information extractor for plus.google.com."""
3089 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3090 IE_NAME = u'plus.google'
3092 def _real_extract(self, url):
3093 # Extract id from URL
3094 mobj = re.match(self._VALID_URL, url)
3096 raise ExtractorError(u'Invalid URL: %s' % url)
3098 post_url = mobj.group(0)
3099 video_id = mobj.group(1)
3101 video_extension = 'flv'
3103 # Step 1, Retrieve post webpage to extract further information
3104 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3106 self.report_extraction(video_id)
3108 # Extract update date
3109 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3110 webpage, u'upload date', fatal=False)
3112 # Convert timestring to a format suitable for filename
# NOTE(review): upload_date may be None here (fatal=False above); if a
# guard is not among the elided lines, strptime would raise -- verify.
3113 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3114 upload_date = upload_date.strftime('%Y%m%d')
3117 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3118 webpage, u'uploader', fatal=False)
3121 # Get the first line for title
3122 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3123 webpage, 'title', default=u'NA')
3125 # Step 2, Stimulate clicking the image box to launch video
3126 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3127 webpage, u'video page URL')
3128 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3130 # Extract video links on video page
3131 """Extract video links of all sizes"""
3132 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3133 mobj = re.findall(pattern, webpage)
3135 raise ExtractorError(u'Unable to extract video links')
3137 # Sort in resolution
3138 links = sorted(mobj)
3140 # Choose the lowest of the sort, i.e. highest resolution
3141 video_url = links[-1]
3142 # Only get the url. The resolution part in the tuple has no use anymore
3143 video_url = video_url[-1]
3144 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError branch
# round-trips through bytes to apply unicode-escape decoding.
3146 video_url = video_url.decode("unicode_escape")
3147 except AttributeError: # Python 3
3148 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3154 'uploader': uploader,
3155 'upload_date': upload_date,
3156 'title': video_title,
3157 'ext': video_extension,
# Extractor for nba.com video pages: the mp4 URL is constructed from the
# URL path against Turner's CDN; title/description come from meta tags.
# NOTE(review): elided numbered listing -- some source lines are missing.
3160 class NBAIE(InfoExtractor):
3161 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3164 def _real_extract(self, url):
3165 mobj = re.match(self._VALID_URL, url)
3167 raise ExtractorError(u'Invalid URL: %s' % url)
3169 video_id = mobj.group(1)
3171 webpage = self._download_webpage(url, video_id)
# Direct CDN URL; video_id starts with '/' (captured path), so this
# concatenation yields .../nba/big/<path>_nba_1280x720.mp4.
3173 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3175 shortened_video_id = video_id.rpartition('/')[2]
3176 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3177 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3179 # It isn't there in the HTML it returns to us
3180 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3182 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3185 'id': shortened_video_id,
3189 # 'uploader_date': uploader_date,
3190 'description': description,
# NOTE(review): elided numbered listing -- some source lines are missing.
3194 class JustinTVIE(InfoExtractor):
3195 """Information extractor for justin.tv and twitch.tv"""
3196 # TODO: One broadcast may be split into multiple videos. The key
3197 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3198 # starts at 1 and increases. Can we treat all parts as one video?
# Three URL shapes: a channel page, /b/<videoid> (archived broadcast),
# and /c/<chapterid> (a chapter of a broadcast).
3200 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3202 (?P<channelid>[^/]+)|
3203 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3204 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when paginating the channel-archives API.
3208 _JUSTIN_PAGE_LIMIT = 100
3209 IE_NAME = u'justin.tv'
def report_download_page(self, channel, offset):
    """Announce that the page of videos at [offset, offset + page limit) is being fetched."""
    upper_bound = offset + self._JUSTIN_PAGE_LIMIT
    self.to_screen(u'%s: Downloading video information from %d to %d' %
                   (channel, offset, upper_bound))
# NOTE(review): elided numbered listing -- some source lines are missing.
3216 # Return count of items, list of *valid* items
3217 def _parse_page(self, url, video_id):
3218 webpage = self._download_webpage(url, video_id,
3219 u'Downloading video info JSON',
3220 u'unable to download video info JSON')
# A successful API response is a JSON list; anything else is an error
# object carrying an 'error' message.
3222 response = json.loads(webpage)
3223 if type(response) != list:
3224 error_text = response.get('error', 'unknown error')
3225 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3227 for clip in response:
3228 video_url = clip['video_file_url']
3230 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; keep the date part and strip the dashes to
# get the YYYYMMDD upload_date convention.
3231 video_date = re.sub('-', '', clip['start_time'][:10])
3232 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3233 video_id = clip['id']
3234 video_title = clip.get('title', video_id)
3238 'title': video_title,
3239 'uploader': clip.get('channel_name', video_uploader_id),
3240 'uploader_id': video_uploader_id,
3241 'upload_date': video_date,
3242 'ext': video_extension,
3244 return (len(response), info)
# Dispatch on URL shape: channel -> paginated archives API; chapter ->
# scrape archive id, fetch chapter XML plus kraken metadata; video ->
# by_archive API.
# NOTE(review): elided numbered listing -- some source lines are missing.
3246 def _real_extract(self, url):
3247 mobj = re.match(self._VALID_URL, url)
3249 raise ExtractorError(u'invalid URL: %s' % url)
3251 api_base = 'http://api.justin.tv'
3253 if mobj.group('channelid'):
3255 video_id = mobj.group('channelid')
3256 api = api_base + '/channel/archives/%s.json' % video_id
3257 elif mobj.group('chapterid'):
3258 chapter_id = mobj.group('chapterid')
# The chapter page embeds the owning archive id as a JS assignment.
3260 webpage = self._download_webpage(url, chapter_id)
3261 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3263 raise ExtractorError(u'Cannot find archive of a chapter')
3264 archive_id = m.group(1)
3266 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3267 chapter_info_xml = self._download_webpage(api, chapter_id,
3268 note=u'Downloading chapter information',
3269 errnote=u'Chapter information download failed')
3270 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> entry matching the archive id scraped above.
3271 for a in doc.findall('.//archive'):
3272 if archive_id == a.find('./id').text:
3275 raise ExtractorError(u'Could not find chapter in chapter information')
3277 video_url = a.find('./video_file_url').text
3278 video_ext = video_url.rpartition('.')[2] or u'flv'
# Twitch's kraken API supplies title/thumbnail/uploader metadata.
3280 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3281 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3282 note='Downloading chapter metadata',
3283 errnote='Download of chapter metadata failed')
3284 chapter_info = json.loads(chapter_info_json)
3286 bracket_start = int(doc.find('.//bracket_start').text)
3287 bracket_end = int(doc.find('.//bracket_end').text)
3289 # TODO determine start (and probably fix up file)
3290 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3291 #video_url += u'?start=' + TODO:start_timestamp
3292 # bracket_start is 13290, but we want 51670615
3293 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3294 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3297 'id': u'c' + chapter_id,
3300 'title': chapter_info['title'],
3301 'thumbnail': chapter_info['preview'],
3302 'description': chapter_info['description'],
3303 'uploader': chapter_info['channel']['display_name'],
3304 'uploader_id': chapter_info['channel']['name'],
3308 video_id = mobj.group('videoid')
3309 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3311 self.report_extraction(video_id)
# Paginate through the API; a short page (count != limit) means done.
3315 limit = self._JUSTIN_PAGE_LIMIT
3318 self.report_download_page(video_id, offset)
3319 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3320 page_count, page_info = self._parse_page(page_url, video_id)
3321 info.extend(page_info)
3322 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL, title and description are
# scraped from the watch page markup.
# NOTE(review): elided numbered listing -- some source lines are missing.
3327 class FunnyOrDieIE(InfoExtractor):
3328 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3330 def _real_extract(self, url):
3331 mobj = re.match(self._VALID_URL, url)
3333 raise ExtractorError(u'invalid URL: %s' % url)
3335 video_id = mobj.group('id')
3336 webpage = self._download_webpage(url, video_id)
# The second <source> inside the <video> tag carries the file URL.
3338 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3339 webpage, u'video URL', flags=re.DOTALL)
# Title: prefer the player header, fall back to the <title> tag.
3341 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3342 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3344 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3345 webpage, u'description', fatal=False, flags=re.DOTALL)
3352 'description': video_description,
# NOTE(review): elided numbered listing -- the verbose-regex body is
# partially missing (e.g. the 'gameID' group referenced by
# _real_extract is not visible here).
3356 class SteamIE(InfoExtractor):
3357 _VALID_URL = r"""http://store\.steampowered\.com/
3359 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3361 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3363 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Agecheck URL with a fixed birth date to bypass the age gate.
3364 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
def suitable(cls, url):
    """Return True if this extractor can handle `url` (verbose-mode _VALID_URL)."""
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
# Fetch a game's video page (passing the age gate if shown), scrape all
# movie entries and return them as a playlist.
# NOTE(review): elided numbered listing -- some source lines are missing.
3371 def _real_extract(self, url):
3372 m = re.match(self._VALID_URL, url, re.VERBOSE)
3373 gameID = m.group('gameID')
3375 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3376 webpage = self._download_webpage(videourl, gameID)
# Age gate: re-request via the agecheck URL with a canned birth date.
3378 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3379 videourl = self._AGECHECK_TEMPLATE % gameID
3380 self.report_age_confirmation()
3381 webpage = self._download_webpage(videourl, gameID)
3383 self.report_extraction(gameID)
3384 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3385 webpage, 'game title')
# Three parallel scans over the page: movie JS blobs, display titles,
# thumbnails -- zipped together positionally below.
3387 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3388 mweb = re.finditer(urlRE, webpage)
3389 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3390 titles = re.finditer(namesRE, webpage)
3391 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3392 thumbs = re.finditer(thumbsRE, webpage)
3394 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3395 video_id = vid.group('videoID')
3396 title = vtitle.group('videoName')
3397 video_url = vid.group('videoURL')
3398 video_thumb = thumb.group('thumbnail')
3400 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3405 'title': unescapeHTML(title),
3406 'thumbnail': video_thumb
3409 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for recorded ustream.tv videos: the flv URL is derived from
# the numeric id; title/uploader/thumbnail are scraped from the page.
# NOTE(review): elided numbered listing -- some source lines are missing.
3411 class UstreamIE(InfoExtractor):
3412 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3413 IE_NAME = u'ustream'
3415 def _real_extract(self, url):
3416 m = re.match(self._VALID_URL, url)
3417 video_id = m.group('videoID')
# Direct CDN URL built from the recording id.
3419 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3420 webpage = self._download_webpage(url, video_id)
3422 self.report_extraction(video_id)
3424 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3427 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3428 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3430 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3431 webpage, u'thumbnail', fatal=False)
3437 'title': video_title,
3438 'uploader': uploader,
3439 'thumbnail': thumbnail,
# NOTE(review): elided numbered listing -- some source lines are missing.
3443 class WorldStarHipHopIE(InfoExtractor):
3444 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3445 IE_NAME = u'WorldStarHipHop'
3447 def _real_extract(self, url):
3448 m = re.match(self._VALID_URL, url)
3449 video_id = m.group('id')
3451 webpage_src = self._download_webpage(url, video_id)
# The player is configured via so.addVariable("file", ...) in JS.
3453 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3454 webpage_src, u'video URL')
3456 if 'mp4' in video_url:
3461 video_title = self._html_search_regex(r"<title>(.*)</title>",
3462 webpage_src, u'title')
3464 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3465 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3466 webpage_src, u'thumbnail', fatal=False)
# Candy pages keep the real title in a candytitles span instead.
3469 _title = r"""candytitles.*>(.*)</span>"""
3470 mobj = re.search(_title, webpage_src)
3471 if mobj is not None:
3472 video_title = mobj.group(1)
3477 'title' : video_title,
3478 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: all metadata lives in a JSON blob
# assigned to window.gon in the page's JS.
# NOTE(review): elided numbered listing -- some source lines are missing.
3483 class RBMARadioIE(InfoExtractor):
3484 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3486 def _real_extract(self, url):
3487 m = re.match(self._VALID_URL, url)
3488 video_id = m.group('videoID')
3490 webpage = self._download_webpage(url, video_id)
3492 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3493 webpage, u'json data', flags=re.MULTILINE)
3496 data = json.loads(json_data)
3497 except ValueError as e:
3498 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Append a fixed 256kbps constant-bitrate hint to the Akamai URL.
3500 video_url = data['akamai_url'] + '&cbr=256'
3501 url_parts = compat_urllib_parse_urlparse(video_url)
3502 video_ext = url_parts.path.rpartition('.')[2]
3507 'title': data['title'],
3508 'description': data.get('teaser_text'),
3509 'location': data.get('country_of_origin'),
3510 'uploader': data.get('host', {}).get('name'),
3511 'uploader_id': data.get('host', {}).get('slug'),
3512 'thumbnail': data.get('image', {}).get('large_url_2x'),
3513 'duration': data.get('duration'),
# NOTE(review): elided numbered listing -- some source lines are missing.
3518 class YouPornIE(InfoExtractor):
3519 """Information extractor for youporn.com."""
3520 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
def _print_formats(self, formats):
    """Print a table of every available format (ext + format id) to stdout."""
    header_lines = (u'Available formats:',
                    u'ext\t\tformat',
                    u'---------------------------------')
    for line in header_lines:
        print(line)
    for fmt in formats:
        print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))
# Select the entry in `formats` whose 'format' field equals req_format
# (the surrounding loop/return lines are elided in this listing).
3530 def _specific(self, req_format, formats):
3532 if(x["format"]==req_format):
# Fetch the watch page (age-verified via cookie), parse the embedded
# JSON player config, enumerate the download links and return the
# requested format(s).
# NOTE(review): elided numbered listing -- some source lines are missing.
3536 def _real_extract(self, url):
3537 mobj = re.match(self._VALID_URL, url)
3539 raise ExtractorError(u'Invalid URL: %s' % url)
3540 video_id = mobj.group('videoid')
# The age gate is bypassed with a pre-set cookie.
3542 req = compat_urllib_request.Request(url)
3543 req.add_header('Cookie', 'age_verified=1')
3544 webpage = self._download_webpage(req, video_id)
3546 # Get JSON parameters
3547 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3549 params = json.loads(json_params)
3551 raise ExtractorError(u'Invalid JSON')
3553 self.report_extraction(video_id)
3555 video_title = params['title']
3556 upload_date = unified_strdate(params['release_date_f'])
3557 video_description = params['description']
3558 video_uploader = params['submitted_by']
3559 thumbnail = params['thumbnails'][0]['image']
3561 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3563 # Get all of the formats available
3564 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3565 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3566 webpage, u'download list').strip()
3568 # Get all of the links from the page
3569 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3570 links = re.findall(LINK_RE, download_list_html)
3571 if(len(links) == 0):
3572 raise ExtractorError(u'ERROR: no known formats available for video')
3574 self.to_screen(u'Links found: %d' % len(links))
3579 # A link looks like this:
3580 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3581 # A path looks like this:
3582 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Derive ext and a "<resolution>-<bitrate>" format id from the CDN path.
3583 video_url = unescapeHTML( link )
3584 path = compat_urllib_parse_urlparse( video_url ).path
3585 extension = os.path.splitext( path )[1][1:]
3586 format = path.split('/')[4].split('_')[:2]
3589 format = "-".join( format )
3590 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3595 'uploader': video_uploader,
3596 'upload_date': upload_date,
3597 'title': video_title,
3600 'thumbnail': thumbnail,
3601 'description': video_description
# Format selection: list, best (first), worst (last), all, or a
# specific format id via _specific().
3604 if self._downloader.params.get('listformats', None):
3605 self._print_formats(formats)
3608 req_format = self._downloader.params.get('format', None)
3609 self.to_screen(u'Format: %s' % req_format)
3611 if req_format is None or req_format == 'best':
3613 elif req_format == 'worst':
3614 return [formats[-1]]
3615 elif req_format in ('-1', 'all'):
3618 format = self._specific( req_format, formats )
3620 raise ExtractorError(u'Requested format not available')
# NOTE(review): elided numbered listing -- some source lines are missing.
3625 class PornotubeIE(InfoExtractor):
3626 """Information extractor for pornotube.com."""
3627 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3629 def _real_extract(self, url):
3630 mobj = re.match(self._VALID_URL, url)
3632 raise ExtractorError(u'Invalid URL: %s' % url)
3634 video_id = mobj.group('videoid')
# The title is taken from the URL itself, not from the page.
3635 video_title = mobj.group('title')
3637 # Get webpage content
3638 webpage = self._download_webpage(url, video_id)
3641 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3642 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3643 video_url = compat_urllib_parse.unquote(video_url)
3645 #Get the uploaded date
3646 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3647 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3648 if upload_date: upload_date = unified_strdate(upload_date)
3650 info = {'id': video_id,
3653 'upload_date': upload_date,
3654 'title': video_title,
# NOTE(review): elided numbered listing -- some source lines are missing.
3660 class YouJizzIE(InfoExtractor):
3661 """Information extractor for youjizz.com."""
3662 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3664 def _real_extract(self, url):
3665 mobj = re.match(self._VALID_URL, url)
3667 raise ExtractorError(u'Invalid URL: %s' % url)
3669 video_id = mobj.group('videoid')
3671 # Get webpage content
3672 webpage = self._download_webpage(url, video_id)
3674 # Get the video title
3675 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3676 webpage, u'title').strip()
3678 # Get the embed page
3679 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3681 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric id from the embed URL replaces the slug-based id.
3683 embed_page_url = result.group(0).strip()
3684 video_id = result.group('videoid')
3686 webpage = self._download_webpage(embed_page_url, video_id)
# The flv URL is passed to the flash player via addVariable("file",...).
3689 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3690 webpage, u'video URL')
3692 info = {'id': video_id,
3694 'title': video_title,
3697 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: walks the play/next API to collect
# every track of a mix as a separate entry.
# NOTE(review): elided numbered listing -- some source lines are missing;
# in particular `mix_id` used below is presumably bound on an elided
# line (likely from `data`) -- verify against the full source.
3701 class EightTracksIE(InfoExtractor):
3703 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3705 def _real_extract(self, url):
3706 mobj = re.match(self._VALID_URL, url)
3708 raise ExtractorError(u'Invalid URL: %s' % url)
3709 playlist_id = mobj.group('id')
3711 webpage = self._download_webpage(url, playlist_id)
3713 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3714 data = json.loads(json_like)
# A random session id is required by the play/next endpoints.
3716 session = str(random.randint(0, 1000000000))
3718 track_count = data['tracks_count']
3719 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3720 next_url = first_url
# Fetch tracks one at a time until the API flags the last track.
3722 for i in itertools.count():
3723 api_json = self._download_webpage(next_url, playlist_id,
3724 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3725 errnote=u'Failed to download song information')
3726 api_data = json.loads(api_json)
3727 track_data = api_data[u'set']['track']
3729 'id': track_data['id'],
3730 'url': track_data['track_file_stream_url'],
3731 'title': track_data['performer'] + u' - ' + track_data['name'],
3732 'raw_title': track_data['name'],
3733 'uploader_id': data['user']['login'],
3737 if api_data['set']['at_last_track']:
3739 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: video and thumbnail URLs are derived directly
# from the id against the CDN; title/uploader scraped from the page.
# NOTE(review): elided numbered listing -- some source lines are missing.
3742 class KeekIE(InfoExtractor):
3743 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3746 def _real_extract(self, url):
3747 m = re.match(self._VALID_URL, url)
3748 video_id = m.group('videoID')
3750 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3751 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3752 webpage = self._download_webpage(url, video_id)
3754 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3757 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3758 webpage, u'uploader', fatal=False)
3764 'title': video_title,
3765 'thumbnail': thumbnail,
3766 'uploader': uploader
# Extractor for ted.com talks and playlists; the verbose regex below
# distinguishes the two URL shapes.
# NOTE(review): elided numbered listing -- some source lines are missing.
3770 class TEDIE(InfoExtractor):
3771 _VALID_URL=r'''http://www\.ted\.com/
3773 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3775 ((?P<type_talk>talks)) # We have a simple talk
3777 (/lang/(.*?))? # The url may contain the language
3778 /(?P<name>\w+) # Here goes the name and then ".html"
def suitable(cls, url):
    """Return True if this extractor can handle `url` (verbose-mode _VALID_URL)."""
    return bool(re.match(cls._VALID_URL, url, re.VERBOSE))
# Dispatch: a single talk URL yields one talk, a playlist URL yields a
# playlist result.
# NOTE(review): elided numbered listing -- some source lines are missing.
3786 def _real_extract(self, url):
3787 m=re.match(self._VALID_URL, url, re.VERBOSE)
3788 if m.group('type_talk'):
3789 return [self._talk_info(url)]
3791 playlist_id=m.group('playlist_id')
3792 name=m.group('name')
3793 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3794 return [self._playlist_videos_info(url,name,playlist_id)]
def _talk_video_link(self, mediaSlug):
    """Return the direct mp4 download URL for the given media slug."""
    url_template = 'http://download.ted.com/talks/%s.mp4'
    return url_template % mediaSlug
# NOTE(review): elided numbered listing -- some source lines are missing.
3800 def _playlist_videos_info(self,url,name,playlist_id=0):
3801 '''Returns the videos of the playlist'''
# Two parallel scans: talk <li> entries (ids/slugs) and talk-title
# anchors (URLs/names), zipped positionally below.
3803 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3804 ([.\s]*?)data-playlist_item_id="(\d+)"
3805 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3807 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3808 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3809 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3810 m_names=re.finditer(video_name_RE,webpage)
3812 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3813 m_playlist = re.search(playlist_RE, webpage)
3814 playlist_title = m_playlist.group('playlist_title')
# Each talk is delegated back to this extractor via url_result.
3816 playlist_entries = []
3817 for m_video, m_name in zip(m_videos,m_names):
3818 video_id=m_video.group('video_id')
3819 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3820 playlist_entries.append(self.url_result(talk_url, 'TED'))
3821 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
# NOTE(review): elided numbered listing -- some source lines are missing.
3823 def _talk_info(self, url, video_id=0):
3824 """Return the video for the talk in the url"""
3825 m=re.match(self._VALID_URL, url,re.VERBOSE)
3826 videoName=m.group('name')
3827 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3828 # If the url includes the language we get the title translated
3829 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3830 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and the media slug used to
# build the download URL.
3831 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3832 "id":(?P<videoID>[\d]+).*?
3833 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3834 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3835 thumb_match=re.search(thumb_RE,webpage)
3836 info_match=re.search(info_RE,webpage,re.VERBOSE)
3837 video_id=info_match.group('videoID')
3838 mediaSlug=info_match.group('mediaSlug')
3839 video_url=self._talk_video_link(mediaSlug)
3845 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: the video id is taken from the URL path and
# all metadata comes from a per-video XML endpoint.
# NOTE(review): elided numbered listing -- some source lines are missing.
3849 class MySpassIE(InfoExtractor):
3850 _VALID_URL = r'http://www.myspass.de/.*'
3852 def _real_extract(self, url):
3853 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3855 # video id is the last path element of the URL
3856 # usually there is a trailing slash, so also try the second but last
3857 url_path = compat_urllib_parse_urlparse(url).path
3858 url_parent_path, video_id = os.path.split(url_path)
3860 _, video_id = os.path.split(url_parent_path)
3863 metadata_url = META_DATA_URL_TEMPLATE % video_id
3864 metadata_text = self._download_webpage(metadata_url, video_id)
3865 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3867 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are
# optional elements of the metadata XML.
3868 url_flv_el = metadata.find('url_flv')
3869 if url_flv_el is None:
3870 raise ExtractorError(u'Unable to extract download url')
3871 video_url = url_flv_el.text
3872 extension = os.path.splitext(video_url)[1][1:]
3873 title_el = metadata.find('title')
3874 if title_el is None:
3875 raise ExtractorError(u'Unable to extract title')
3876 title = title_el.text
3877 format_id_el = metadata.find('format_id')
3878 if format_id_el is None:
3881 format = format_id_el.text
3882 description_el = metadata.find('description')
3883 if description_el is not None:
3884 description = description_el.text
3887 imagePreview_el = metadata.find('imagePreview')
3888 if imagePreview_el is not None:
3889 thumbnail = imagePreview_el.text
3898 'thumbnail': thumbnail,
3899 'description': description
# Extractor for spiegel.de videos: stream metadata comes from a per-id
# XML file on the flash server.
# NOTE(review): elided numbered listing -- some source lines are missing.
3903 class SpiegelIE(InfoExtractor):
3904 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3906 def _real_extract(self, url):
3907 m = re.match(self._VALID_URL, url)
3908 video_id = m.group('videoID')
3910 webpage = self._download_webpage(url, video_id)
3912 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3915 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3916 xml_code = self._download_webpage(xml_url, video_id,
3917 note=u'Downloading XML', errnote=u'Failed to download XML')
# The last child element of the XML document is the variant used here.
3919 idoc = xml.etree.ElementTree.fromstring(xml_code)
3920 last_type = idoc[-1]
3921 filename = last_type.findall('./filename')[0].text
3922 duration = float(last_type.findall('./duration')[0].text)
3924 video_url = 'http://video2.spiegel.de/flash/' + filename
3925 video_ext = filename.rpartition('.')[2]
3930 'title': video_title,
3931 'duration': duration,
# NOTE(review): elided numbered listing -- some source lines are missing.
3935 class LiveLeakIE(InfoExtractor):
3937 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3938 IE_NAME = u'liveleak'
3940 def _real_extract(self, url):
3941 mobj = re.match(self._VALID_URL, url)
3943 raise ExtractorError(u'Invalid URL: %s' % url)
3945 video_id = mobj.group('video_id')
3947 webpage = self._download_webpage(url, video_id)
# The player config embeds the file URL as `file: "..."`.
3949 video_url = self._search_regex(r'file: "(.*?)",',
3950 webpage, u'video URL')
# og:title carries a "LiveLeak.com -" prefix that is stripped here.
3952 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3953 webpage, u'title').replace('LiveLeak.com -', '').strip()
3955 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3956 webpage, u'description', fatal=False)
3958 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3959 webpage, u'uploader', fatal=False)
3965 'title': video_title,
3966 'description': video_description,
3967 'uploader': video_uploader
# Extractor for ARD Mediathek / daserste.de: picks the highest-quality
# default media stream, which may be an RTMP stream or a plain HTTP mp4.
# NOTE(review): elided numbered listing -- some source lines are missing.
3972 class ARDIE(InfoExtractor):
3973 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3974 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3975 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3977 def _real_extract(self, url):
3978 # determine video id from url
# Prefer the numeric documentId query parameter when present.
3979 m = re.match(self._VALID_URL, url)
3981 numid = re.search(r'documentId=([0-9]+)', url)
3983 video_id = numid.group(1)
3985 video_id = m.group('video_id')
3987 # determine title and media streams from webpage
3988 html = self._download_webpage(url, video_id)
3989 title = re.search(self._TITLE, html).group('title')
3990 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams at all implies the FSK age restriction (evening-only).
3992 assert '"fsk"' in html
3993 raise ExtractorError(u'This video is only available after 8:00 pm')
3995 # choose default media type and highest quality for now
3996 stream = max([s for s in streams if int(s["media_type"]) == 0],
3997 key=lambda s: int(s["quality"]))
3999 # there's two possibilities: RTMP stream or HTTP download
4000 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4001 if stream['rtmp_url']:
4002 self.to_screen(u'RTMP download detected')
4003 assert stream['video_url'].startswith('mp4:')
4004 info["url"] = stream["rtmp_url"]
4005 info["play_path"] = stream['video_url']
4007 assert stream["video_url"].endswith('.mp4')
4008 info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information Extractor for ZDF Mediathek (www.zdf.de)."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # FIX: stream_ must be initialized, otherwise a page with streams but
        # no matching quality raised NameError instead of the error below.
        stream_ = None
        for s in streams:  # find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:  # find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming':  # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The href above is only an intermediate page; fetch it to get the
        # actual mms:// (or rtsp://) media location.
        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
        if mobj is None:
            raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the container extension from the media URL itself.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
class TumblrIE(InfoExtractor):
    """Information Extractor for videos embedded in Tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        # FIX: guard against a failed match instead of crashing with
        # AttributeError on m_url.group below.
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL (the /video/ form redirects).
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded inside escaped javascript (\x22 == ").
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
                                             webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
                                              webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: renamed local `id` -> `track_id`; it shadowed the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # A direct MP4 link is embedded in a <source> tag on the page.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        # The MRSS notice for the id carries both the media URL and the title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        # Normalize to the canonical page for this video id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The mobile MP4 URL appears in a javascript player config.
        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # Everything we need is exposed via twitter/og meta tags.
        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both video XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The stream URL is the APP prefix joined with the (escaped) FULLPATH.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = match.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives in a data attribute of the <article> element.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document lists the media files; pick the "high" one.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        # Canonical page for the movie id; the original path suffix is ignored.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        match = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(match.group('server')) == 0:
            # An empty server means 'file' already holds a complete escaped URL.
            video_url = compat_urllib_parse.unquote(match.group('file'))
        else:
            video_url = match.group('server') + '/key=' + match.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        match = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if match:
            video_upload_date = (match.group('upload_date_Y') +
                                 match.group('upload_date_m') +
                                 match.group('upload_date_d'))
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = match.group(1)

        # A timestamped request is needed to obtain the session cookie.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The track metadata is embedded as JSON in a <script> tag.
        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The serve endpoint requires the cookie collected above.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': 'mp3',
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play: page answers with a javascript redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id,
                                         u'Downloading redirect page')

        title = self._html_search_regex(
            r'<title>(.*)</title>', webpage, u'title').split('/')[0].strip()

        # POST to the flash info endpoint to get the media and thumbnail urls.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id,
                                               u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response has the shape: key=media_url&key=thumb_url
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1],
                                         info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com (MTV-based player)."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid differently from normal videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''
        m_info = re.search(info_re, info_page, re.VERBOSE | re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # FIX: was `raise ExtractError(u'Unable to extrat video url')` —
        # ExtractError is undefined (would raise NameError) and the message
        # had a typo; also simplified the emptiness test.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url': video_url,
                'id': video_id,
                'title': video_title,
                # Videos are actually flv not mp4
                'ext': 'flv',
                'thumbnail': video_thumb,
                'description': video_description,
                }
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): entries below are a subset of the full registry visible
    # in this chunk; each instance is tried in order via IE.suitable(url),
    # so more specific extractors must precede generic ones.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention and live at
    # module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]