2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    match = re.match(cls._VALID_URL, url)
    return match is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # All user-facing output and params are routed through this object.
    self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'."""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the webpage for video_id is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm the user's age."""
    self.to_screen(u'Confirming age')
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res = self._search_regex(pattern, string, name, default, fatal, flags)
231 return clean_html(res).strip()
235 class SearchInfoExtractor(InfoExtractor):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Return True if url matches this search extractor's query syntax."""
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
250 def _real_extract(self, query):
251 mobj = re.match(self._make_valid_url(), query)
253 raise ExtractorError(u'Invalid search query "%s"' % query)
255 prefix = mobj.group('prefix')
256 query = mobj.group('query')
258 return self._get_n_results(query, 1)
259 elif prefix == 'all':
260 return self._get_n_results(query, self._MAX_RESULTS)
264 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
265 elif n > self._MAX_RESULTS:
266 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
267 n = self._MAX_RESULTS
268 return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
317 _video_dimensions = {
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs are claimed by YoutubePlaylistIE instead.
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Announce the attempt to set the interface language."""
    self.to_screen(u'Setting language')
def report_login(self):
    """Announce the attempt to log in."""
    self.to_screen(u'Logging in')
def report_video_webpage_download(self, video_id):
    """Announce the download of the video webpage."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the video info webpage."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report the check for available subtitles.

    The original docstring was copy-pasted from the webpage-download
    reporter and described the wrong action.
    """
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report a subtitle download attempt for the given language and format.

    The original docstring was copy-pasted from another reporter and
    described the wrong action.
    """
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report which subtitle languages are available."""
    langs = ",".join(list(sub_lang_list.keys()))
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, langs))
def report_information_extraction(self, video_id):
    """Announce the start of video information extraction."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available.

    The original docstring said "Report extracted video URL", which
    described a different method.
    """
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Tell the user the download will use the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
382 def _get_available_subtitles(self, video_id):
383 self.report_video_subtitles_download(video_id)
384 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
386 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 return (u'unable to download video subtitles: %s' % compat_str(err), None)
389 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391 if not sub_lang_list:
392 return (u'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
399 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
402 (error_message, sub_lang, sub)
404 self.report_video_subtitles_request(video_id, sub_lang, format)
405 params = compat_urllib_parse.urlencode({
411 url = 'http://www.youtube.com/api/timedtext?' + params
413 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
417 return (u'Did not fetch video subtitles', None, None)
418 return (None, sub_lang, sub)
420 def _request_automatic_caption(self, video_id, webpage):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang = self._downloader.params.get('subtitleslang') or 'en'
424 sub_format = self._downloader.params.get('subtitlesformat')
425 self.to_screen(u'%s: Looking for automatic captions' % video_id)
426 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
429 return [(err_msg, None, None)]
430 player_config = json.loads(mobj.group(1))
432 args = player_config[u'args']
433 caption_url = args[u'ttsurl']
434 timestamp = args[u'timestamp']
435 params = compat_urllib_parse.urlencode({
442 subtitles_url = caption_url + '&' + params
443 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444 return [(None, sub_lang, sub)]
446 return [(err_msg, None, None)]
448 def _extract_subtitle(self, video_id):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list = self._get_available_subtitles(video_id)
454 sub_format = self._downloader.params.get('subtitlesformat')
455 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list[0], None, None)]
457 if self._downloader.params.get('subtitleslang', False):
458 sub_lang = self._downloader.params.get('subtitleslang')
459 elif 'en' in sub_lang_list:
462 sub_lang = list(sub_lang_list.keys())[0]
463 if not sub_lang in sub_lang_list:
464 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
469 def _extract_all_subtitles(self, video_id):
470 sub_lang_list = self._get_available_subtitles(video_id)
471 sub_format = self._downloader.params.get('subtitlesformat')
472 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list[0], None, None)]
475 for sub_lang in sub_lang_list:
476 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477 subtitles.append(subtitle)
480 def _print_formats(self, formats):
481 print('Available formats:')
483 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
485 def _real_initialize(self):
486 if self._downloader is None:
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError) as err:
506 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
510 request = compat_urllib_request.Request(self._LANG_URL)
513 compat_urllib_request.urlopen(request).read()
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
518 # No authentication to be performed
522 request = compat_urllib_request.Request(self._LOGIN_URL)
524 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
531 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
533 galx = match.group(1)
535 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
541 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u'PersistentCookie': u'yes',
547 u'bgresponse': u'js_disabled',
548 u'checkConnection': u'',
549 u'checkedDomains': u'youtube',
555 u'signIn': u'Sign in',
557 u'service': u'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
563 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
564 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
565 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
568 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
569 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username or password')
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
579 'action_confirm': 'Confirm',
581 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
583 self.report_age_confirmation()
584 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
585 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
586 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
588 def _extract_id(self, url):
589 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
591 raise ExtractorError(u'Invalid URL: %s' % url)
592 video_id = mobj.group(2)
595 def _real_extract(self, url):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj = re.search(self._NEXT_URL_RE, url)
599 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600 video_id = self._extract_id(url)
603 self.report_video_webpage_download(video_id)
604 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request = compat_urllib_request.Request(url)
607 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
611 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
616 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
621 self.report_video_info_webpage_download(video_id)
622 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id, el_type))
625 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 errnote='unable to download video info webpage')
628 video_info = compat_parse_qs(video_info_webpage)
629 if 'token' in video_info:
631 if 'token' not in video_info:
632 if 'reason' in video_info:
633 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
635 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639 raise ExtractorError(u'"rental" videos not supported')
641 # Start extracting information
642 self.report_information_extraction(video_id)
645 if 'author' not in video_info:
646 raise ExtractorError(u'Unable to extract uploader name')
647 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
650 video_uploader_id = None
651 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
653 video_uploader_id = mobj.group(1)
655 self._downloader.report_warning(u'unable to extract uploader nickname')
658 if 'title' not in video_info:
659 raise ExtractorError(u'Unable to extract video title')
660 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
663 if 'thumbnail_url' not in video_info:
664 self._downloader.report_warning(u'unable to extract video thumbnail')
666 else: # don't panic if we can't find it
667 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
671 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
673 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674 upload_date = unified_strdate(upload_date)
677 video_description = get_element_by_id("eow-description", video_webpage)
678 if video_description:
679 video_description = clean_html(video_description)
681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
683 video_description = unescapeHTML(fd_mobj.group(1))
685 video_description = u''
688 video_subtitles = None
690 if self._downloader.params.get('writesubtitles', False):
691 video_subtitles = self._extract_subtitle(video_id)
693 (sub_error, sub_lang, sub) = video_subtitles[0]
695 # We try with the automatic captions
696 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
701 # We report the original error
702 self._downloader.report_error(sub_error)
704 if self._downloader.params.get('allsubtitles', False):
705 video_subtitles = self._extract_all_subtitles(video_id)
706 for video_subtitle in video_subtitles:
707 (sub_error, sub_lang, sub) = video_subtitle
709 self._downloader.report_error(sub_error)
711 if self._downloader.params.get('listsubtitles', False):
712 sub_lang_list = self._list_available_subtitles(video_id)
715 if 'length_seconds' not in video_info:
716 self._downloader.report_warning(u'unable to extract video duration')
719 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
722 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
724 # Decide which formats to download
725 req_format = self._downloader.params.get('format', None)
727 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
728 self.report_rtmp_download()
729 video_url_list = [(None, video_info['conn'][0])]
730 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
732 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
733 url_data = compat_parse_qs(url_data_str)
734 if 'itag' in url_data and 'url' in url_data:
735 url = url_data['url'][0]
736 if 'sig' in url_data:
737 url += '&signature=' + url_data['sig'][0]
738 if 'ratebypass' not in url:
739 url += '&ratebypass=yes'
740 url_map[url_data['itag'][0]] = url
742 format_limit = self._downloader.params.get('format_limit', None)
743 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
744 if format_limit is not None and format_limit in available_formats:
745 format_list = available_formats[available_formats.index(format_limit):]
747 format_list = available_formats
748 existing_formats = [x for x in format_list if x in url_map]
749 if len(existing_formats) == 0:
750 raise ExtractorError(u'no known formats available for video')
751 if self._downloader.params.get('listformats', None):
752 self._print_formats(existing_formats)
754 if req_format is None or req_format == 'best':
755 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
756 elif req_format == 'worst':
757 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
758 elif req_format in ('-1', 'all'):
759 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# Specific formats. We pick the first in a slash-delimited sequence.
762 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
763 req_formats = req_format.split('/')
764 video_url_list = None
765 for rf in req_formats:
767 video_url_list = [(rf, url_map[rf])]
769 if video_url_list is None:
770 raise ExtractorError(u'requested format not available')
772 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
775 for format_param, video_real_url in video_url_list:
777 video_extension = self._video_extensions.get(format_param, 'flv')
779 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
780 self._video_dimensions.get(format_param, '???'))
784 'url': video_real_url,
785 'uploader': video_uploader,
786 'uploader_id': video_uploader_id,
787 'upload_date': upload_date,
788 'title': video_title,
789 'ext': video_extension,
790 'format': video_format,
791 'thumbnail': video_thumbnail,
792 'description': video_description,
793 'player_url': player_url,
794 'subtitles': video_subtitles,
795 'duration': video_duration
800 class MetacafeIE(InfoExtractor):
801 """Information Extractor for metacafe.com."""
803 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
804 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
805 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
806 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Announce retrieval of the family-filter disclaimer page."""
    self.to_screen(u'Retrieving disclaimer')
812 def _real_initialize(self):
813 # Retrieve disclaimer
814 request = compat_urllib_request.Request(self._DISCLAIMER)
816 self.report_disclaimer()
817 disclaimer = compat_urllib_request.urlopen(request).read()
818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
819 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
824 'submit': "Continue - I'm over 18",
826 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
828 self.report_age_confirmation()
829 disclaimer = compat_urllib_request.urlopen(request).read()
830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
831 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
833 def _real_extract(self, url):
834 # Extract id and simplified title from URL
835 mobj = re.match(self._VALID_URL, url)
837 raise ExtractorError(u'Invalid URL: %s' % url)
839 video_id = mobj.group(1)
841 # Check if video comes from YouTube
842 mobj2 = re.match(r'^yt-(.*)$', video_id)
843 if mobj2 is not None:
844 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
846 # Retrieve video webpage to extract further information
847 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
849 # Extract URL, uploader and title from webpage
850 self.report_extraction(video_id)
851 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
853 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
854 video_extension = mediaURL[-3:]
856 # Extract gdaKey if available
857 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
861 gdaKey = mobj.group(1)
862 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
864 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
866 raise ExtractorError(u'Unable to extract media URL')
867 vardict = compat_parse_qs(mobj.group(1))
868 if 'mediaData' not in vardict:
869 raise ExtractorError(u'Unable to extract media URL')
870 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
872 raise ExtractorError(u'Unable to extract media URL')
873 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
874 video_extension = mediaURL[-3:]
875 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
877 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
879 raise ExtractorError(u'Unable to extract title')
880 video_title = mobj.group(1).decode('utf-8')
882 mobj = re.search(r'submitter=(.*?);', webpage)
884 raise ExtractorError(u'Unable to extract uploader nickname')
885 video_uploader = mobj.group(1)
888 'id': video_id.decode('utf-8'),
889 'url': video_url.decode('utf-8'),
890 'uploader': video_uploader.decode('utf-8'),
892 'title': video_title,
893 'ext': video_extension.decode('utf-8'),
896 class DailymotionIE(InfoExtractor):
897 """Information Extractor for Dailymotion"""
899 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
900 IE_NAME = u'dailymotion'
902 def _real_extract(self, url):
903 # Extract id and simplified title from URL
904 mobj = re.match(self._VALID_URL, url)
906 raise ExtractorError(u'Invalid URL: %s' % url)
908 video_id = mobj.group(1).split('_')[0].split('?')[0]
910 video_extension = 'mp4'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
914 request.add_header('Cookie', 'family_filter=off')
915 webpage = self._download_webpage(request, video_id)
917 # Extract URL, uploader and title from webpage
918 self.report_extraction(video_id)
919 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
921 raise ExtractorError(u'Unable to extract media URL')
922 flashvars = compat_urllib_parse.unquote(mobj.group(1))
924 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
927 self.to_screen(u'Using %s' % key)
930 raise ExtractorError(u'Unable to extract video URL')
932 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
934 raise ExtractorError(u'Unable to extract video URL')
936 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
938 # TODO: support choosing qualities
940 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
942 raise ExtractorError(u'Unable to extract title')
943 video_title = unescapeHTML(mobj.group('title'))
945 video_uploader = None
946 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
947 # Looking for official user
948 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
949 webpage, 'video uploader')
951 video_upload_date = None
952 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
954 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
959 'uploader': video_uploader,
960 'upload_date': video_upload_date,
961 'title': video_title,
962 'ext': video_extension,
966 class PhotobucketIE(InfoExtractor):
967 """Information extractor for photobucket.com."""
969 # TODO: the original _VALID_URL was:
970 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
# Check if it's necessary to keep the old extraction process
972 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
973 IE_NAME = u'photobucket'
975 def _real_extract(self, url):
976 # Extract id from URL
977 mobj = re.match(self._VALID_URL, url)
979 raise ExtractorError(u'Invalid URL: %s' % url)
981 video_id = mobj.group('id')
983 video_extension = mobj.group('ext')
985 # Retrieve video webpage to extract further information
986 webpage = self._download_webpage(url, video_id)
988 # Extract URL, uploader, and title from webpage
989 self.report_extraction(video_id)
990 # We try first by looking the javascript code:
991 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
993 info = json.loads(mobj.group('json'))
996 'url': info[u'downloadUrl'],
997 'uploader': info[u'username'],
998 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
999 'title': info[u'title'],
1000 'ext': video_extension,
1001 'thumbnail': info[u'thumbUrl'],
1004 # We try looking in other parts of the webpage
1005 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1006 webpage, u'video URL')
1008 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1010 raise ExtractorError(u'Unable to extract title')
1011 video_title = mobj.group(1).decode('utf-8')
1012 video_uploader = mobj.group(2).decode('utf-8')
1015 'id': video_id.decode('utf-8'),
1016 'url': video_url.decode('utf-8'),
1017 'uploader': video_uploader,
1018 'upload_date': None,
1019 'title': video_title,
1020 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    NOTE(review): this block appears truncated — guards, branch headers
    and the closing quotes of the verbose regex look lost; reconcile
    with upstream before relying on it.
    """
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard looks lost.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # Newer pages embed a long content id in a YUI snippet.
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        # NOTE(review): the 'if m_id is None:' header for this first branch
        # looks lost; the 'else:' further below pairs with it.
        # TODO: Check which url parameters are required
        info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
        webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
        # Verbose multi-line pattern over the mRSS info document.
        # NOTE(review): the closing ''' of this raw string looks lost.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                    <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                    <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                    <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
        self.report_extraction(video_id)
        m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
        # NOTE(review): 'if m_info is None:' guard looks lost.
        raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')
        video_date = m_info.group('date')
        # Normalize mm/dd/yyyy into the YYYYMMDD upload_date convention.
        video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

        # TODO: Find a way to get mp4 videos
        rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
        webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
        m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
        video_url = m_rest.group('url')
        video_path = m_rest.group('path')
        # NOTE(review): the 'if m_rest is None:' guard around this raise looks lost.
        raise ExtractorError(u'Unable to extract video url')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP wrapper before parsing.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            # NOTE(review): the assignment of 'meta' (presumably res[u'meta'])
            # looks lost — confirm against upstream.
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        # NOTE(review): the 'return' wrapper for this info dict looks lost.
        'play_path': video_path,
        'title':video_title,
        'description': video_description,
        'thumbnail': video_thumb,
        'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    NOTE(review): this block appears truncated — a try/except around the
    config extraction and the final 'return [{' opener look lost.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard looks lost.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # Player/"pro" URLs are normalized to the canonical video page.
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): this looked like a try/except — the 'try:' header and
        # 'except:' line appear lost, which is why the raises below are orphaned.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
        raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): the 'else:' header for this fallback looks lost.
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality found, preferring hd over sd over anything else.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): the loop 'break' and trailing 'else:' look lost here.
        raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): 'return [{' opener looks lost.
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    NOTE(review): this block appears truncated — several call-argument
    lines and 'return' tails look lost; parens may be unbalanced as
    written. Reconcile with upstream before relying on it.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page contents."""
        request = compat_urllib_request.Request(url)
        # NOTE(review): the 'try:' header for this block looks lost.
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # NOTE(review): the 'return webpage' tail looks lost.

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and map groups to
        an info dict per (group-index, key, error-message) *matchTuples*."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): the 'if mobj is None:' guard and 'info = {}' init look lost.
        raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # NOTE(review): an 'else:' header may be lost before this line.
            info[key] = mobj.group(i)
        # NOTE(review): the 'return info' tail looks lost.

    def extractLiveStream(self, url):
        # Language is encoded in the URL path for live streams.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # NOTE(review): the leading call arguments (url, flags, list
            # bracket) look lost around this regex.
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)
        # NOTE(review): the closing brackets of this call look lost.
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # NOTE(review): leading call arguments look lost here as well.
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            # NOTE(review): a third pattern fragment appears lost here.
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        # NOTE(review): the closing brackets of this call look lost.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): the tail of this method (return/info dict) looks lost.

    def extractPlus7Stream(self, url):
        # Language is one level higher in +7 URLs than in live URLs.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # NOTE(review): leading call arguments look lost.
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        # NOTE(review): closing brackets look lost.
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            # NOTE(review): leading call arguments look lost.
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        # NOTE(review): closing brackets look lost.
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # NOTE(review): leading call arguments look lost.
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)
        # NOTE(review): closing brackets and the 'return {' opener look lost.
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': unified_strdate(info.get('date')),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams use a different page layout than the +7 catch-up pages.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # NOTE(review): a 'return' after the live-stream branch looks lost.
        # NOTE(review): an 'else:' header looks lost before this line.
        info = self.extractPlus7Stream(url)
        # NOTE(review): the 'return [info]' tail looks lost.
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    NOTE(review): this block appears truncated — several small bodies
    ('return "HEAD"', keyword arguments, 'if mobj is None:' cascades)
    look lost; reconcile with upstream before relying on it.
    """

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we fell back to the generic extractor (silenced in tests).
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                # NOTE(review): the 'return "HEAD"' body looks lost.

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers that no longer apply to the new request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # NOTE(review): a 'headers=newheaders,' argument looks lost here.
                                       origin_req_host=req.get_origin_req_host(),
                                       # NOTE(review): the 'unverifiable=True)' tail and an 'else:' look lost.
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): 'fp.read()' / 'fp.close()' lines look lost here.
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        # NOTE(review): 'headers=newheaders,' looks lost here.
                                        origin_req_host=req.get_origin_req_host(),
                                        # NOTE(review): the 'unverifiable=True))' tail looks lost.

        # Build a minimal opener that issues HEAD requests and falls
        # back to GET on 405.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # NOTE(review): an 'if url == new_url: return False' short-circuit looks lost.
        self.report_following_redirect(new_url)
        # NOTE(review): the 'return new_url' tail looks lost.

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # NOTE(review): the 'try:' header looks lost.
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): the 'if mobj is None:' cascade between these
        # successive searches looks lost.
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        # Try to find twitter cards info
        mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        # Video Title - Site Name
        # Site Name | Video Title
        # Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        # NOTE(review): the 'return [{' opener looks lost.
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): initialization of video_ids / pagenum / limit
        # looks lost here — confirm against upstream.
        # Page through the GData API, 50 results per request.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): the 'try:' header looks lost.
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API says exist.
            limit = min(n, api_response['totalItems'])
            # NOTE(review): the 'pagenum += 1' loop increment looks lost.

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries.

    NOTE(review): this block appears truncated — the result-dict opener
    and the final return look lost; reconcile with upstream.
    """
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'

    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the 'res = {' opener and its other keys
        # (id/title/entries) look lost; the literal below is orphaned.
        '_type': 'playlist',

        # 10 results per Google result page.
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # NOTE(review): the 'e = {' opener for each entry dict looks lost.
                'url': mobj.group(1)
                res['entries'].append(e)

            # Stop once we have enough results or there is no "next" link.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # NOTE(review): the 'return res' tail looks lost.
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    NOTE(review): this block appears truncated — the result-dict opener,
    the 'm' assignment and loop breaks look lost; reconcile with upstream.
    """

    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the 'res = {' opener and its other keys look lost;
        # the literal below is orphaned.
        '_type': 'playlist',

        # 30 results per Yahoo result page.
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # NOTE(review): an assignment like "m = info[u'm']" looks lost
            # (m is used in the pagination test below) — confirm.
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    # NOTE(review): a 'break' looks lost here.
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when we have n results or the API reports the last page.
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                # NOTE(review): a 'break' and the 'return res' tail look lost.
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose pattern, matched with re.VERBOSE (see suitable()).
    # NOTE(review): several alternation/group lines of this pattern and its
    # closing quotes look lost; reconcile with upstream.
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'

    IE_NAME = u'youtube:playlist'

    # NOTE(review): a '@classmethod' decorator looks lost here (cls parameter).
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): 'if mobj is None:' guard looks lost.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # NOTE(review): 'videos = []' init and the page loop header look lost.
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

        # NOTE(review): the 'try:' header looks lost.
        response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
            # NOTE(review): a 'break' looks lost here.

        for entry in response['feed']['entry']:
            # Keep the playlist position so we can sort afterwards.
            index = entry['yt$position']['$t']
            if 'media$group' in entry and 'media$player' in entry['media$group']:
                videos.append((index, entry['media$group']['media$player']['url']))

        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # NOTE(review): a 'break' looks lost here.

        # Sort by playlist position, then drop the index.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect unique video ids from the watch links on *page*."""
        # NOTE(review): 'ids_in_page = []' init and 'return ids_in_page'
        # tail look lost here.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard looks lost.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        # NOTE(review): 'video_ids = []' and 'pagenum = 1' inits look lost.
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            # NOTE(review): a 'while True:' loop header looks lost here.
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            # Ajax pages are JSON with the HTML nested in 'content_html'.
            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                # NOTE(review): a 'break' looks lost here.

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard looks lost.
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): 'video_ids = []' init and the pagination loop
        # header look lost here.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1

        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        # NOTE(review): 'ids_in_page = []' init looks lost.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # NOTE(review): a 'break' and the 'pagenum += 1' increment look lost.

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard looks lost.
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # %s is filled with the numeric user id scraped from the page below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): 'video_ids = []' / 'pagenum = 1' inits and the
        # pagination loop header look lost here.
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # NOTE(review): 'ids_in_page = []' init looks lost.
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:
            # NOTE(review): a 'break' and the 'pagenum += 1' increment look lost.

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POSTing this form field simulates pressing the button.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): the 'try:' header looks lost.
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # NOTE(review): an 'else:' header looks lost before this raise.
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # NOTE(review): the 'return [{' opener looks lost. Also, the
        # .decode('utf-8') calls below are Python 2 only — they would fail
        # on Python 3 str; confirm target interpreter.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    NOTE(review): this block appears truncated — several 'return'
    statements, guards and the login_form construction look lost;
    reconcile with upstream before relying on it.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials, if any."""
        if self._downloader is None:
            # NOTE(review): the 'return' body of this guard looks lost.

        # NOTE(review): 'useremail = None' / 'password = None' inits look lost.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the 'try:' header looks lost.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): the useremail/password assignments from
                # 'info' and the matching 'else:' header look lost.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                # NOTE(review): a 'return' looks lost here.

        if useremail is None:
            # NOTE(review): a 'return' and the login_form construction look lost.

        # Log in
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): the 'try:' header looks lost.
        login_results = compat_urllib_request.urlopen(request).read()
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            # Still seeing the login form means the login did not succeed.
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # NOTE(review): a 'return' looks lost here.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # NOTE(review): a 'return' looks lost here.

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard looks lost.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON is wedged between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # NOTE(review): an 'if not m:' guard looks lost.
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, falling back to SD.
        video_url = video_data.get('hd_src')
        # NOTE(review): an 'if not video_url:' guard looks lost.
        video_url = video_data['sd_src']
        # NOTE(review): an 'if not video_url:' guard looks lost.
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            # NOTE(review): the remaining arguments of this call look lost.

        # NOTE(review): an 'info = {' opener looks lost; entries orphaned.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
        # NOTE(review): a 'return [info]' tail looks lost.
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    # Matches show pages, /play/ embed URLs and api.swf fragment references.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Pulls the filename extension off a direct media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # file id; rewrite to the canonical /a/a-<id> form and recurse.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # [listing gap] choice of query separator (``cchar``) elided
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # Pretend to be iTunes so the site answers with its JSON API.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # [listing gap] ``info`` initialisation and ``try:`` header elided
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # [listing gap] direct-download info-dict opener elided
            'upload_date': None,
            # [listing gap] dict closer elided
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # [listing gap] ``try:`` header elided
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
            # [listing gap] ``try:`` header elided
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # [listing gap] else-branch elided
            # Site reports e.g. '10-31-13 04:30PM'; normalised to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # [listing gap] failed-match guard precedes this raise
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)
            # [listing gap] info-dict opener elided; visible entries follow
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
        # [listing gap] return statement elided
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # Looks like standard RC4: key-scheduling below, keystream loop
        # partially elided in this listing — confirm against the full file.
        # [listing gap] initialisation of ``x`` elided
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # [listing gap] keystream loop header and counters elided
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        # [listing gap] return elided

    # [listing gap] enclosing helper ``def`` elided (an MD5-digest helper, judging by the body)
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # [listing gap] assignment header for the base64 key material (``GK``) elided
        b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
        b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
        b'TnpsbA0KTVRkbU1tSTRNdz09'

        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Simple case: a plain <source src='...'> tag on the page.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
            # [listing gap] remaining _html_search_regex arguments elided

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            # [listing gap] info-dict opener elided; visible entries follow
                'upload_date': None,
                'title': video_title,
            # [listing gap] dict closer / return elided

        # Hard case: video data is in an encrypted XML blob.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        # [listing gap] failed-search guard precedes this raise
        raise ExtractorError(u'Unable to extract video')
        # [listing gap] ``params``/``sec`` initialisation elided
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                # [listing gap] ``params[a] = b`` branch and else elided
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            # [listing gap] xmldata_url reassignment header elided
            'http://www.myvideo.de/dynamic/get_player_video_xml.php'
            '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            # [listing gap] format argument elided

        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # [listing gap] RC4 key (``sk``) derivation header elided
        base64.b64decode(base64.b64decode(GK)) +
        # [listing gap]
        str(video_id).encode('utf-8')
        # [listing gap]
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        # [listing gap]
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        # [listing gap] rtmp-branch guard elided
        video_url = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_url:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_url = video_url.replace('rtmpe://', 'rtmpt://')

        # [listing gap]
        # extract non rtmp videos
        mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
        # [listing gap] failed-search guard precedes this raise
        raise ExtractorError(u'unable to extract url')
        video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
        video_hls_playlist = ''
        # [listing gap] else-branch header elided (``video_filepath`` defined in elided lines)
        video_hls_playlist = (
            video_filepath + video_file
        ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
        # [listing gap] remaining _html_search_regex arguments elided

        # [listing gap] return info-dict opener elided; visible entries follow
            'tc_url': video_url,
            'upload_date': None,
            'title': video_title,
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        # [listing gap] dict closer / return elided
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # NOTE(review): this listing is elided (original numbering jumps);
    # missing statements are flagged with ``[listing gap]`` comments.

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    # [listing gap] a ``(?P<clip>`` alternative line and the closing ``$"""``
    # of the raw triple-quoted pattern are elided from this listing.

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # [listing gap] dict bodies of the two tables below are elided.
    _video_extensions = {
    _video_dimensions = {

    # [listing gap] a ``@classmethod`` decorator presumably precedes this
    # (``cls`` first parameter) — confirm against the full file.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a multi-line pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        # [listing gap] loop header over ``formats`` elided
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Shortname form (:tds / :colbert): rewrite to the show's
            # full-episodes page and re-match.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # [listing gap] else-branch header elided
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # [listing gap] else-branch header elided
                epTitle = mobj.group('cntitle')
            # [listing gap] ``dlNewest`` handling partially elided
        dlNewest = not mobj.group('episode')
        # [listing gap] conditional header elided
        epTitle = mobj.group('showname')
        # [listing gap] else-branch elided
        epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        # [listing gap] redirect-follow conditional header elided
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # [listing gap] guard elided
        raise ExtractorError(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            raise ExtractorError(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            # [listing gap] else-branch header elided
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        # [listing gap] ``results`` initialisation elided
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like a colon-separated mediaId (see splits below).
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # [listing gap] ``turls`` initialisation elided
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # [listing gap] ``turls.append(finfo)`` presumably elided

            # [listing gap] empty-``turls`` guard elided
            self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
            # [listing gap] continue elided

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # [listing gap] early return elided

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # [listing gap] loop over ``turls`` matching ``req_format`` elided
            format, rtmp_video_url = f, v

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # [listing gap] failed-match guard precedes this raise
            raise ExtractorError(u'Cannot transform RTMP url')
            # Re-root the stream path onto the progressive-download host.
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # [listing gap] info-dict opener elided; visible entries follow
                'upload_date': officialDate,
                'description': officialTitle,
            # [listing gap] dict closer elided
            results.append(info)
        # [listing gap] ``return results`` elided
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The meta title is "Show : Episode"; keep only the episode part.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'player url').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # [listing gap] ``try:`` header elided
        config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        # [listing gap] info-dict opener elided; visible entries follow
            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        # [listing gap] dict closer / return elided
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # [listing gap] ``info`` dict opener elided; visible entry follows
            'upload_date': None,
        # [listing gap] dict closer elided

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # [listing gap] ``try:`` header elided
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # [listing gap] ``try:`` header elided
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # [listing gap] ``except IndexError:`` handler header elided
        raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # [listing gap] ``try:`` header elided
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # [listing gap] ``try:`` header elided
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        # [listing gap] remaining info assignments and return elided
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (URL-encoded in a flv_url page variable).
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title; drop the trailing " - XVID..." site suffix.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
        # [listing gap] remaining _html_search_regex arguments elided

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        # [listing gap] info-dict opener elided; visible entries follow
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
        # [listing gap] dict closer / return elided
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into an API track resource.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        # [listing gap] return info-dict opener elided; visible entries follow
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'description': info['description'],
        # [listing gap] dict closer / return elided
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into an API playlist resource.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        # [listing gap] results-list initialisation elided
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            # [listing gap] early return elided

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # [listing gap] per-track info-dict opener elided; visible entries follow
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'description': track['description'],
            # [listing gap] dict closer / append / return elided
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        # The page URL doubles as the video id for progress reporting.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Real id is base64-encoded in a page variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        # [listing gap] failed-search guard precedes this raise
        raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
        # [listing gap] remaining _search_regex arguments elided

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # [listing gap] info-dict opener elided; visible entries follow
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
        # [listing gap] dict closer / return elided
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # [listing gap] ``try:`` header elided
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # [listing gap] return elided

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # [listing gap] ``try:`` header elided
            compat_urllib_request.urlopen(url)
            # [listing gap] ``return url`` elided
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # [listing gap] loop-exhausted fallthrough (presumably ``return None``) elided

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # [listing gap] ``try:`` header elided
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # [listing gap] ``try:`` header elided
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # [listing gap] early return elided

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # [listing gap] ``break`` presumably elided
        # [listing gap] else-branch header elided
        if req_format not in formats:
            raise ExtractorError(u'Format is not available')

        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        # [listing gap] return-list opener elided; visible entries follow
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
        # [listing gap] closer / return elided
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # [listing gap] ``info`` dict opener elided; visible entries follow
            'id': course + '_' + video,
            'upload_date': None,
            # [listing gap] dict closer elided

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # [listing gap] ``try:`` header elided
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # [listing gap] ``try:`` header elided
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            # [listing gap] ``except IndexError:`` handler header elided
            raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            # [listing gap] ``return [info]`` elided
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # [listing gap] ``info`` dict opener elided; visible entry follows
            'upload_date': None,
            # [listing gap] dict closer elided

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # [listing gap] list-comprehension opener elided; visible entry body follows
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
            # [listing gap] comprehension closer / results init elided

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # [listing gap] ``return results`` elided
        # [listing gap] root-page branch header and dict opener elided
            'id': 'Stanford OpenClassroom',
            'upload_date': None,
            # [listing gap] dict closer elided

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # [listing gap] ``try:`` header elided
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # [listing gap] list-comprehension opener elided; visible entry body follows
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
            # [listing gap] comprehension closer / results init elided

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # [listing gap] ``return results`` elided
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com

    NOTE(review): this listing is elided (original numbering jumps);
    missing statements are flagged with ``[listing gap]`` comments.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    # [listing gap] IE_NAME assignment elided

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
        # [listing gap] remaining _html_search_regex arguments elided

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # [listing gap] ``try:`` header elided
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # [listing gap] ``try:`` header elided
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        # [listing gap] ``except`` handler header precedes this raise
        raise ExtractorError('Invalid rendition field.')

        # [listing gap] info-dict opener elided; visible entries follow
        # (``performer`` is assigned in elided lines — confirm against full file)
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
        # [listing gap] dict closer / return elided
class YoukuIE(InfoExtractor):
    # NOTE(review): this listing is elided (original numbering jumps);
    # missing statements are flagged with ``[listing gap]`` comments.

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    # [listing gap] the ``def`` header of a sid-generator helper is elided;
    # the body builds a timestamp+random session id string.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic shuffle of the alphabet driven by ``seed``
        # (linear-congruential update per picked character).
        # [listing gap] accumulator initialisation elided
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        # [listing gap] return elided

    def _get_file_id(self, fileId, seed):
        # Map '*'-separated digit tokens through the seeded alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # [listing gap] result-list initialisation and loop header elided
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [listing gap] invalid-URL guard precedes this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        # [listing gap] ``try:`` header elided
        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        if format is None or format == 'best':
            if 'hd2' in supported_format:
            # [listing gap] format/ext selection branches elided
        elif format == 'worst':
            # [listing gap] branches elided

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        # [listing gap]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Per-segment file id: bytes 8-9 carry the segment index (hex).
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # [listing gap] info-dict opener elided; visible entries follow
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,
            # [listing gap] dict closer elided
            files_info.append(info)
        # [listing gap] ``return files_info`` elided
# xnxx.com extractor: the flv URL, title and thumbnail are all scraped from
# the watch page with the three regexes below.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3049 class XNXXIE(InfoExtractor):
3050 """Information extractor for xnxx.com"""
3052 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3054 VIDEO_URL_RE = r'flv_url=(.*?)&'
3055 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3056 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3058 def _real_extract(self, url):
3059 mobj = re.match(self._VALID_URL, url)
3061 raise ExtractorError(u'Invalid URL: %s' % url)
3062 video_id = mobj.group(1)
3064 # Get webpage content
3065 webpage = self._download_webpage(url, video_id)
# flv_url is percent-encoded inside the page, hence the unquote below.
3067 video_url = self._search_regex(self.VIDEO_URL_RE,
3068 webpage, u'video URL')
3069 video_url = compat_urllib_parse.unquote(video_url)
3071 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3074 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3075 webpage, u'thumbnail', fatal=False)
3081 'upload_date': None,
3082 'title': video_title,
3084 'thumbnail': video_thumbnail,
3085 'description': None,
# Google+ extractor: step 1 scrapes the post page for metadata and the photo
# viewer URL, step 2 loads that viewer page and picks the highest-resolution
# googlevideo link.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3089 class GooglePlusIE(InfoExtractor):
3090 """Information extractor for plus.google.com."""
3092 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3093 IE_NAME = u'plus.google'
3095 def _real_extract(self, url):
3096 # Extract id from URL
3097 mobj = re.match(self._VALID_URL, url)
3099 raise ExtractorError(u'Invalid URL: %s' % url)
3101 post_url = mobj.group(0)
3102 video_id = mobj.group(1)
3104 video_extension = 'flv'
3106 # Step 1, Retrieve post webpage to extract further information
3107 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3109 self.report_extraction(video_id)
3111 # Extract update date
3112 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3113 webpage, u'upload date', fatal=False)
3115 # Convert timestring to a format suitable for filename
# NOTE(review): strptime will raise if the scraped date is None/odd-format;
# a guarding branch may exist on an elided line — confirm before relying.
3116 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3117 upload_date = upload_date.strftime('%Y%m%d')
3120 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3121 webpage, u'uploader', fatal=False)
3124 # Get the first line for title
3125 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3126 webpage, 'title', default=u'NA')
3128 # Step 2, Stimulate clicking the image box to launch video
3129 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3130 webpage, u'video page URL')
3131 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3133 # Extract video links on video page
3134 """Extract video links of all sizes"""
3135 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3136 mobj = re.findall(pattern, webpage)
3138 raise ExtractorError(u'Unable to extract video links')
3140 # Sort in resolution
3141 links = sorted(mobj)
3143 # Choose the lowest of the sort, i.e. highest resolution
3144 video_url = links[-1]
3145 # Only get the url. The resolution part in the tuple has no use anymore
3146 video_url = video_url[-1]
3147 # Treat escaped \u0026 style hex
# Py2 str has .decode; on Py3 the AttributeError branch round-trips through
# bytes to apply unicode-escape decoding.
3149 video_url = video_url.decode("unicode_escape")
3150 except AttributeError: # Python 3
3151 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3157 'uploader': uploader,
3158 'upload_date': upload_date,
3159 'title': video_title,
3160 'ext': video_extension,
# NBA.com extractor: the mp4 URL is built directly from the URL path segment
# against the Turner CDN; only title/description are scraped from the page.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3163 class NBAIE(InfoExtractor):
3164 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3167 def _real_extract(self, url):
3168 mobj = re.match(self._VALID_URL, url)
3170 raise ExtractorError(u'Invalid URL: %s' % url)
3172 video_id = mobj.group(1)
3174 webpage = self._download_webpage(url, video_id)
# video_id starts with '/' (captured including the leading slash), so the
# concatenation yields .../big/<path>_nba_1280x720.mp4.
3176 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3178 shortened_video_id = video_id.rpartition('/')[2]
3179 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3180 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3182 # It isn't there in the HTML it returns to us
3183 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3185 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3188 'id': shortened_video_id,
3192 # 'uploader_date': uploader_date,
3193 'description': description,
# justin.tv / twitch.tv extractor.  Three URL shapes: a channel archive
# (paged JSON API), a single broadcast (/b/), or a chapter (/c/) which is
# resolved through an XML broadcast listing plus the kraken JSON API.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3197 class JustinTVIE(InfoExtractor):
3198 """Information extractor for justin.tv and twitch.tv"""
3199 # TODO: One broadcast may be split into multiple videos. The key
3200 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3201 # starts at 1 and increases. Can we treat all parts as one video?
3203 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3205 (?P<channelid>[^/]+)|
3206 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3207 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3211 _JUSTIN_PAGE_LIMIT = 100
3212 IE_NAME = u'justin.tv'
3214 def report_download_page(self, channel, offset):
3215 """Report attempt to download a single page of videos."""
3216 self.to_screen(u'%s: Downloading video information from %d to %d' %
3217 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3219 # Return count of items, list of *valid* items
3220 def _parse_page(self, url, video_id):
3221 webpage = self._download_webpage(url, video_id,
3222 u'Downloading video info JSON',
3223 u'unable to download video info JSON')
3225 response = json.loads(webpage)
# The API returns a dict (not a list) on error; surface its message.
3226 if type(response) != list:
3227 error_text = response.get('error', 'unknown error')
3228 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3230 for clip in response:
3231 video_url = clip['video_file_url']
3233 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from the date part -> YYYYMMDD.
3234 video_date = re.sub('-', '', clip['start_time'][:10])
3235 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3236 video_id = clip['id']
3237 video_title = clip.get('title', video_id)
3241 'title': video_title,
3242 'uploader': clip.get('channel_name', video_uploader_id),
3243 'uploader_id': video_uploader_id,
3244 'upload_date': video_date,
3245 'ext': video_extension,
3247 return (len(response), info)
3249 def _real_extract(self, url):
3250 mobj = re.match(self._VALID_URL, url)
3252 raise ExtractorError(u'invalid URL: %s' % url)
3254 api_base = 'http://api.justin.tv'
3256 if mobj.group('channelid'):
3258 video_id = mobj.group('channelid')
3259 api = api_base + '/channel/archives/%s.json' % video_id
3260 elif mobj.group('chapterid'):
3261 chapter_id = mobj.group('chapterid')
3263 webpage = self._download_webpage(url, chapter_id)
3264 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3266 raise ExtractorError(u'Cannot find archive of a chapter')
3267 archive_id = m.group(1)
3269 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3270 chapter_info_xml = self._download_webpage(api, chapter_id,
3271 note=u'Downloading chapter information',
3272 errnote=u'Chapter information download failed')
3273 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element whose id matches the page's archive_id.
3274 for a in doc.findall('.//archive'):
3275 if archive_id == a.find('./id').text:
3278 raise ExtractorError(u'Could not find chapter in chapter information')
3280 video_url = a.find('./video_file_url').text
3281 video_ext = video_url.rpartition('.')[2] or u'flv'
3283 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3284 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3285 note='Downloading chapter metadata',
3286 errnote='Download of chapter metadata failed')
3287 chapter_info = json.loads(chapter_info_json)
3289 bracket_start = int(doc.find('.//bracket_start').text)
3290 bracket_end = int(doc.find('.//bracket_end').text)
3292 # TODO determine start (and probably fix up file)
3293 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3294 #video_url += u'?start=' + TODO:start_timestamp
3295 # bracket_start is 13290, but we want 51670615
3296 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3297 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3300 'id': u'c' + chapter_id,
3303 'title': chapter_info['title'],
3304 'thumbnail': chapter_info['preview'],
3305 'description': chapter_info['description'],
3306 'uploader': chapter_info['channel']['display_name'],
3307 'uploader_id': chapter_info['channel']['name'],
3311 video_id = mobj.group('videoid')
3312 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3314 self.report_extraction(video_id)
# Page through the archive API until a short page signals the end.
3318 limit = self._JUSTIN_PAGE_LIMIT
3321 self.report_download_page(video_id, offset)
3322 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3323 page_count, page_info = self._parse_page(page_url, video_id)
3324 info.extend(page_info)
3325 if not paged or page_count != limit:
# funnyordie.com extractor: video URL, title and description are scraped from
# the page (title has a primary <h1> pattern with a <title> fallback).
# NOTE(review): elided listing — code kept byte-identical, comments only.
3330 class FunnyOrDieIE(InfoExtractor):
3331 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3333 def _real_extract(self, url):
3334 mobj = re.match(self._VALID_URL, url)
3336 raise ExtractorError(u'invalid URL: %s' % url)
3338 video_id = mobj.group('id')
3339 webpage = self._download_webpage(url, video_id)
3341 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3342 webpage, u'video URL', flags=re.DOTALL)
# Tuple of patterns: tried in order until one matches.
3344 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3345 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3347 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3348 webpage, u'description', fatal=False, flags=re.DOTALL)
3355 'description': video_description,
# Steam store extractor: scrapes all movie entries of a game page (handling
# the age gate) and returns them as a playlist.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3359 class SteamIE(InfoExtractor):
3360 _VALID_URL = r"""http://store\.steampowered\.com/
3362 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3364 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3366 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3367 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3370 def suitable(cls, url):
3371 """Receives a URL and returns True if suitable for this IE."""
3372 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3374 def _real_extract(self, url):
3375 m = re.match(self._VALID_URL, url, re.VERBOSE)
3376 gameID = m.group('gameID')
3378 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3379 webpage = self._download_webpage(videourl, gameID)
# Age gate: re-fetch through the agecheck URL with a fixed birth date.
3381 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3382 videourl = self._AGECHECK_TEMPLATE % gameID
3383 self.report_age_confirmation()
3384 webpage = self._download_webpage(videourl, gameID)
3386 self.report_extraction(gameID)
3387 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3388 webpage, 'game title')
3390 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3391 mweb = re.finditer(urlRE, webpage)
3392 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3393 titles = re.finditer(namesRE, webpage)
3394 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3395 thumbs = re.finditer(thumbsRE, webpage)
# The three iterators are assumed parallel (same order/length) — zipped.
3397 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3398 video_id = vid.group('videoID')
3399 title = vtitle.group('videoName')
3400 video_url = vid.group('videoURL')
3401 video_thumb = thumb.group('thumbnail')
3403 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3408 'title': unescapeHTML(title),
3409 'thumbnail': video_thumb
3412 return [self.playlist_result(videos, gameID, game_title)]
# ustream.tv recorded-video extractor: the flv URL is derived directly from
# the numeric video id; title/uploader/thumbnail are scraped.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3414 class UstreamIE(InfoExtractor):
3415 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3416 IE_NAME = u'ustream'
3418 def _real_extract(self, url):
3419 m = re.match(self._VALID_URL, url)
3420 video_id = m.group('videoID')
3422 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3423 webpage = self._download_webpage(url, video_id)
3425 self.report_extraction(video_id)
3427 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3430 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3431 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3433 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3434 webpage, u'thumbnail', fatal=False)
3440 'title': video_title,
3441 'uploader': uploader,
3442 'thumbnail': thumbnail,
# worldstarhiphop.com extractor: pulls the flash player's "file" variable and
# fixes the title for WSHH-candy pages when a candy-specific span exists.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3446 class WorldStarHipHopIE(InfoExtractor):
3447 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3448 IE_NAME = u'WorldStarHipHop'
3450 def _real_extract(self, url):
3451 m = re.match(self._VALID_URL, url)
3452 video_id = m.group('id')
3454 webpage_src = self._download_webpage(url, video_id)
3456 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3457 webpage_src, u'video URL')
# Extension branch: conditions on 'mp4' appearing in the URL (body elided).
3459 if 'mp4' in video_url:
3464 video_title = self._html_search_regex(r"<title>(.*)</title>",
3465 webpage_src, u'title')
3467 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3468 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3469 webpage_src, u'thumbnail', fatal=False)
3472 _title = r"""candytitles.*>(.*)</span>"""
3473 mobj = re.search(_title, webpage_src)
3474 if mobj is not None:
3475 video_title = mobj.group(1)
3480 'title' : video_title,
3481 'thumbnail' : thumbnail,
# rbmaradio.com extractor: show metadata is embedded as a JSON blob assigned
# to window.gon; the audio URL comes from its 'akamai_url' field.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3486 class RBMARadioIE(InfoExtractor):
3487 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3489 def _real_extract(self, url):
3490 m = re.match(self._VALID_URL, url)
3491 video_id = m.group('videoID')
3493 webpage = self._download_webpage(url, video_id)
3495 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3496 webpage, u'json data', flags=re.MULTILINE)
3499 data = json.loads(json_data)
3500 except ValueError as e:
3501 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' pins the constant bitrate variant of the akamai stream.
3503 video_url = data['akamai_url'] + '&cbr=256'
3504 url_parts = compat_urllib_parse_urlparse(video_url)
3505 video_ext = url_parts.path.rpartition('.')[2]
3510 'title': data['title'],
3511 'description': data.get('teaser_text'),
3512 'location': data.get('country_of_origin'),
3513 'uploader': data.get('host', {}).get('name'),
3514 'uploader_id': data.get('host', {}).get('slug'),
3515 'thumbnail': data.get('image', {}).get('large_url_2x'),
3516 'duration': data.get('duration'),
3521 class YouPornIE(InfoExtractor):
3522 """Information extractor for youporn.com."""
3523 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3525 def _print_formats(self, formats):
3526 """Print all available formats"""
3527 print(u'Available formats:')
3528 print(u'ext\t\tformat')
3529 print(u'---------------------------------')
3530 for format in formats:
3531 print(u'%s\t\t%s' % (format['ext'], format['format']))
3533 def _specific(self, req_format, formats):
3535 if(x["format"]==req_format):
3539 def _real_extract(self, url):
3540 mobj = re.match(self._VALID_URL, url)
3542 raise ExtractorError(u'Invalid URL: %s' % url)
3543 video_id = mobj.group('videoid')
3545 req = compat_urllib_request.Request(url)
3546 req.add_header('Cookie', 'age_verified=1')
3547 webpage = self._download_webpage(req, video_id)
3549 # Get JSON parameters
3550 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3552 params = json.loads(json_params)
3554 raise ExtractorError(u'Invalid JSON')
3556 self.report_extraction(video_id)
3558 video_title = params['title']
3559 upload_date = unified_strdate(params['release_date_f'])
3560 video_description = params['description']
3561 video_uploader = params['submitted_by']
3562 thumbnail = params['thumbnails'][0]['image']
3564 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3566 # Get all of the formats available
3567 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3568 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3569 webpage, u'download list').strip()
3571 # Get all of the links from the page
3572 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3573 links = re.findall(LINK_RE, download_list_html)
3574 if(len(links) == 0):
3575 raise ExtractorError(u'ERROR: no known formats available for video')
3577 self.to_screen(u'Links found: %d' % len(links))
3582 # A link looks like this:
3583 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3584 # A path looks like this:
3585 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3586 video_url = unescapeHTML( link )
3587 path = compat_urllib_parse_urlparse( video_url ).path
3588 extension = os.path.splitext( path )[1][1:]
3589 format = path.split('/')[4].split('_')[:2]
3592 format = "-".join( format )
3593 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3598 'uploader': video_uploader,
3599 'upload_date': upload_date,
3600 'title': video_title,
3603 'thumbnail': thumbnail,
3604 'description': video_description
3607 if self._downloader.params.get('listformats', None):
3608 self._print_formats(formats)
3611 req_format = self._downloader.params.get('format', None)
3612 self.to_screen(u'Format: %s' % req_format)
3614 if req_format is None or req_format == 'best':
3616 elif req_format == 'worst':
3617 return [formats[-1]]
3618 elif req_format in ('-1', 'all'):
3621 format = self._specific( req_format, formats )
3623 raise ExtractorError(u'Requested format not available')
# pornotube.com extractor: flv URL and "Added <date>" stamp are scraped from
# the watch page; the title comes from the URL itself.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3628 class PornotubeIE(InfoExtractor):
3629 """Information extractor for pornotube.com."""
3630 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3632 def _real_extract(self, url):
3633 mobj = re.match(self._VALID_URL, url)
3635 raise ExtractorError(u'Invalid URL: %s' % url)
3637 video_id = mobj.group('videoid')
3638 video_title = mobj.group('title')
3640 # Get webpage content
3641 webpage = self._download_webpage(url, video_id)
3644 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3645 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3646 video_url = compat_urllib_parse.unquote(video_url)
3648 #Get the uploaded date
3649 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3650 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# Non-fatal above, so normalise only when a date was actually found.
3651 if upload_date: upload_date = unified_strdate(upload_date)
3653 info = {'id': video_id,
3656 'upload_date': upload_date,
3657 'title': video_title,
# youjizz.com extractor: resolves the embed page first, then reads the flash
# player's encoded "file" variable from it.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3663 class YouJizzIE(InfoExtractor):
3664 """Information extractor for youjizz.com."""
3665 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3667 def _real_extract(self, url):
3668 mobj = re.match(self._VALID_URL, url)
3670 raise ExtractorError(u'Invalid URL: %s' % url)
3672 video_id = mobj.group('videoid')
3674 # Get webpage content
3675 webpage = self._download_webpage(url, video_id)
3677 # Get the video title
3678 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3679 webpage, u'title').strip()
3681 # Get the embed page
3682 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3684 raise ExtractorError(u'ERROR: unable to extract embed page')
3686 embed_page_url = result.group(0).strip()
# video_id is replaced by the numeric id from the embed URL from here on.
3687 video_id = result.group('videoid')
3689 webpage = self._download_webpage(embed_page_url, video_id)
3692 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3693 webpage, u'video URL')
3695 info = {'id': video_id,
3697 'title': video_title,
3700 'player_url': embed_page_url}
# 8tracks.com extractor: reads the PAGE.mix JSON, then walks the play/next
# API with a random session id until 'at_last_track' is reported.
# NOTE(review): elided listing — `mix_id` is used below but its assignment is
# on a missing line (presumably from `data`); code kept byte-identical.
3704 class EightTracksIE(InfoExtractor):
3706 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3708 def _real_extract(self, url):
3709 mobj = re.match(self._VALID_URL, url)
3711 raise ExtractorError(u'Invalid URL: %s' % url)
3712 playlist_id = mobj.group('id')
3714 webpage = self._download_webpage(url, playlist_id)
3716 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3717 data = json.loads(json_like)
# Random session id, as the web player would generate.
3719 session = str(random.randint(0, 1000000000))
3721 track_count = data['tracks_count']
3722 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3723 next_url = first_url
3725 for i in itertools.count():
3726 api_json = self._download_webpage(next_url, playlist_id,
3727 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3728 errnote=u'Failed to download song information')
3729 api_data = json.loads(api_json)
3730 track_data = api_data[u'set']['track']
3732 'id': track_data['id'],
3733 'url': track_data['track_file_stream_url'],
3734 'title': track_data['performer'] + u' - ' + track_data['name'],
3735 'raw_title': track_data['name'],
3736 'uploader_id': data['user']['login'],
# Stop when the API flags the final track; otherwise request the next one.
3740 if api_data['set']['at_last_track']:
3742 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# keek.com extractor: video and thumbnail URLs are derived directly from the
# id against the CDN; title/uploader are scraped from the page.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3745 class KeekIE(InfoExtractor):
3746 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3749 def _real_extract(self, url):
3750 m = re.match(self._VALID_URL, url)
3751 video_id = m.group('videoID')
3753 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3754 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3755 webpage = self._download_webpage(url, video_id)
3757 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3760 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3761 webpage, u'uploader', fatal=False)
3767 'title': video_title,
3768 'thumbnail': thumbnail,
3769 'uploader': uploader
# ted.com extractor: handles both single talks (talkDetails JSON, last/best
# htmlStream) and playlists (scraped talk list returned as url_results).
# NOTE(review): elided listing — code kept byte-identical, comments only.
3773 class TEDIE(InfoExtractor):
3774 _VALID_URL=r'''http://www\.ted\.com/
3776 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3778 ((?P<type_talk>talks)) # We have a simple talk
3780 (/lang/(.*?))? # The url may contain the language
3781 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3785 def suitable(cls, url):
3786 """Receives a URL and returns True if suitable for this IE."""
3787 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3789 def _real_extract(self, url):
3790 m=re.match(self._VALID_URL, url, re.VERBOSE)
3791 if m.group('type_talk'):
3792 return [self._talk_info(url)]
3794 playlist_id=m.group('playlist_id')
3795 name=m.group('name')
3796 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3797 return [self._playlist_videos_info(url,name,playlist_id)]
3799 def _playlist_videos_info(self,url,name,playlist_id=0):
3800 '''Returns the videos of the playlist'''
3802 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3803 ([.\s]*?)data-playlist_item_id="(\d+)"
3804 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3806 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3807 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3808 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3809 m_names=re.finditer(video_name_RE,webpage)
3811 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3812 webpage, 'playlist title')
# Each playlist entry is delegated back to this IE as a url_result.
3814 playlist_entries = []
3815 for m_video, m_name in zip(m_videos,m_names):
3816 video_id=m_video.group('video_id')
3817 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3818 playlist_entries.append(self.url_result(talk_url, 'TED'))
3819 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3821 def _talk_info(self, url, video_id=0):
3822 """Return the video for the talk in the url"""
3823 m = re.match(self._VALID_URL, url,re.VERBOSE)
3824 video_name = m.group('name')
3825 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3826 self.report_extraction(video_name)
3827 # If the url includes the language we get the title translated
3828 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3830 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3831 webpage, 'json data')
3832 info = json.loads(json_data)
3833 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3834 webpage, 'description', flags = re.DOTALL)
3836 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3837 webpage, 'thumbnail')
# htmlStreams[-1] is taken as the highest-quality stream.
3840 'url': info['htmlStreams'][-1]['file'],
3843 'thumbnail': thumbnail,
3844 'description': desc,
# myspass.de extractor: the video id is the last (or second-to-last, when the
# URL has a trailing slash) path element; everything else comes from an XML
# metadata endpoint.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3848 class MySpassIE(InfoExtractor):
3849 _VALID_URL = r'http://www.myspass.de/.*'
3851 def _real_extract(self, url):
3852 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3854 # video id is the last path element of the URL
3855 # usually there is a trailing slash, so also try the second but last
3856 url_path = compat_urllib_parse_urlparse(url).path
3857 url_parent_path, video_id = os.path.split(url_path)
3859 _, video_id = os.path.split(url_parent_path)
3862 metadata_url = META_DATA_URL_TEMPLATE % video_id
3863 metadata_text = self._download_webpage(metadata_url, video_id)
3864 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3866 # extract values from metadata
# url_flv and title are mandatory; format/description/imagePreview optional.
3867 url_flv_el = metadata.find('url_flv')
3868 if url_flv_el is None:
3869 raise ExtractorError(u'Unable to extract download url')
3870 video_url = url_flv_el.text
3871 extension = os.path.splitext(video_url)[1][1:]
3872 title_el = metadata.find('title')
3873 if title_el is None:
3874 raise ExtractorError(u'Unable to extract title')
3875 title = title_el.text
3876 format_id_el = metadata.find('format_id')
3877 if format_id_el is None:
3880 format = format_id_el.text
3881 description_el = metadata.find('description')
3882 if description_el is not None:
3883 description = description_el.text
3886 imagePreview_el = metadata.find('imagePreview')
3887 if imagePreview_el is not None:
3888 thumbnail = imagePreview_el.text
3897 'thumbnail': thumbnail,
3898 'description': description
# spiegel.de extractor: a per-video XML manifest lists available types; the
# last entry is taken and its filename/duration used.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3902 class SpiegelIE(InfoExtractor):
3903 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3905 def _real_extract(self, url):
3906 m = re.match(self._VALID_URL, url)
3907 video_id = m.group('videoID')
3909 webpage = self._download_webpage(url, video_id)
3911 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3914 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3915 xml_code = self._download_webpage(xml_url, video_id,
3916 note=u'Downloading XML', errnote=u'Failed to download XML')
3918 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: last <type> element is assumed to carry the preferred variant.
3919 last_type = idoc[-1]
3920 filename = last_type.findall('./filename')[0].text
3921 duration = float(last_type.findall('./duration')[0].text)
3923 video_url = 'http://video2.spiegel.de/flash/' + filename
3924 video_ext = filename.rpartition('.')[2]
3929 'title': video_title,
3930 'duration': duration,
# liveleak.com extractor: the file URL and OpenGraph title/description are
# scraped from the view page.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3934 class LiveLeakIE(InfoExtractor):
3936 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3937 IE_NAME = u'liveleak'
3939 def _real_extract(self, url):
3940 mobj = re.match(self._VALID_URL, url)
3942 raise ExtractorError(u'Invalid URL: %s' % url)
3944 video_id = mobj.group('video_id')
3946 webpage = self._download_webpage(url, video_id)
3948 video_url = self._search_regex(r'file: "(.*?)",',
3949 webpage, u'video URL')
# The site prefixes og:title with "LiveLeak.com -"; strip it.
3951 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3952 webpage, u'title').replace('LiveLeak.com -', '').strip()
3954 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3955 webpage, u'description', fatal=False)
3957 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3958 webpage, u'uploader', fatal=False)
3964 'title': video_title,
3965 'description': video_description,
3966 'uploader': video_uploader
# ARD Mediathek extractor: collects all mediaCollection.addMediaStream calls,
# keeps media_type 0 at the highest quality, and distinguishes RTMP streams
# (url + play_path) from direct HTTP mp4 downloads.
# NOTE(review): elided listing — code kept byte-identical, comments only.
3971 class ARDIE(InfoExtractor):
3972 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3973 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3974 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3976 def _real_extract(self, url):
3977 # determine video id from url
3978 m = re.match(self._VALID_URL, url)
# Prefer the numeric documentId query parameter when present.
3980 numid = re.search(r'documentId=([0-9]+)', url)
3982 video_id = numid.group(1)
3984 video_id = m.group('video_id')
3986 # determine title and media streams from webpage
3987 html = self._download_webpage(url, video_id)
3988 title = re.search(self._TITLE, html).group('title')
3989 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker -> age-restricted content, only available late.
3991 assert '"fsk"' in html
3992 raise ExtractorError(u'This video is only available after 8:00 pm')
3994 # choose default media type and highest quality for now
3995 stream = max([s for s in streams if int(s["media_type"]) == 0],
3996 key=lambda s: int(s["quality"]))
3998 # there's two possibilities: RTMP stream or HTTP download
3999 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4000 if stream['rtmp_url']:
4001 self.to_screen(u'RTMP download detected')
4002 assert stream['video_url'].startswith('mp4:')
4003 info["url"] = stream["rtmp_url"]
4004 info["play_path"] = stream['video_url']
4006 assert stream["video_url"].endswith('.mp4')
4007 info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek (www.zdf.de).

    NOTE(review): this chunk appears to have source lines elided (missing
    guard clauses, loop bodies and a truncated return literal); the gaps are
    flagged inline below and should be restored from the full file.
    """
    # Single "beitrag" (contribution) video page; id captured as video_id.
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline used as the video title.
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    # Stream anchor; captures full URL, media type (wstreaming/hstreaming) and quality.
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Extract id/title/stream URL for a ZDF Mediathek page."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided before this
        # raise — as written the raise is unconditional; confirm against full file.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        # One groupdict per stream link found on the page.
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # NOTE(review): the emptiness check guarding this raise appears elided.
        raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                # NOTE(review): loop body (selecting this stream, presumably
                # `stream_ = s; break`) appears elided — confirm.
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                # NOTE(review): loop body appears elided here as well.
        raise ExtractorError(u'No stream found.')

        # Resolve the chosen stream's meta page to obtain the real media link.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer an mms:// URL; fall back to rtsp:// if none is present.
        mobj = re.search(self._MMS_STREAM, media_link)
        # NOTE(review): the None-check / fallback chain between these two
        # searches and the raise appears elided.
        mobj = re.search(self._RTSP_STREAM, media_link)
        raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the filename extension from the stream URL.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        # NOTE(review): the returned info dict is cut off mid-literal here.
        return [{'id': video_id,
class TumblrIE(InfoExtractor):
    """Information Extractor for videos hosted on Tumblr blogs.

    NOTE(review): this chunk appears to have source lines elided (a missing
    None-guard and a truncated return literal); gaps are flagged inline.
    """
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonical post URL (works for both /post/ and /video/ links).
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL is embedded in \x22-escaped (JS-quoted) markup.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        # NOTE(review): an `if video is None:` guard appears elided before this raise.
        raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        # Strip the JS backslash-escaping from the thumbnail URL.
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        # NOTE(review): the returned info dict is cut off mid-literal here
        # (the 'url'/'ext' entries and closing brackets are elided).
        return [{'id': video_id,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information Extractor for free-download Bandcamp tracks.

    NOTE(review): the trailing return of this method is cut off mid-literal
    in this chunk; flagged inline below.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # The numeric track id lives in the inline TralbumData JS object.
        # (NB: local name `id` shadows the builtin of the same name.)
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        # NOTE(review): the track_info literal is cut off here (the 'url'/'ext'
        # entries, closing brace and the final `return [track_info]` are elided).
        track_info = {'id':id,
                      'title' : info[u'title'],
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    # NOTE(review): this chunk has lines elided (a None-guard and the tail of
    # the title extraction / return dict); gaps are flagged inline.
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        # NOTE(review): this call is cut off mid-statement (missing webpage/name
        # args) and the following lines belong to a truncated return dict.
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            'ext': video_extension,
            'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    # NOTE(review): this chunk has lines elided (tail of the title extraction
    # and the return dict); gaps are flagged inline.
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata is fetched from the player's MRSS feed, not the HTML page.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # NOTE(review): this call is cut off mid-statement (missing webpage/name
        # args) and the following lines belong to a truncated return dict.
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            'ext': video_extension,
            'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    # NOTE(review): this chunk has lines elided (the title extraction is cut
    # mid-call and the return dict is truncated); gaps are flagged inline.
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Re-canonicalize so https/mobile variants all fetch the same page.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # NOTE(review): this call is cut off mid-statement (missing webpage/name args).
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the returned info dict is cut off mid-literal here
        # (the opening `return {...` with 'id'/'url'/'ext' entries is elided).
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    # NOTE(review): this chunk has lines elided (the title extraction is cut
    # mid-call and the return dict is truncated); gaps are flagged inline.
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Canonical https page for the vine id.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The raw stream URL is published via the twitter:player:stream meta tag.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        # NOTE(review): this call is cut off mid-statement (missing webpage/name args).
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        # NOTE(review): the returned info dict is cut off mid-literal here
        # (the opening `return [{...` with 'id'/'url'/'ext' entries is elided).
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    # NOTE(review): this chunk has lines elided (a None-guard and a truncated
    # return dict); gaps are flagged inline.
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Per-photo secret required by the two video API requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # Final URL = streaming app prefix + full path from the playlist XML.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        # NOTE(review): an `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the returned info dict is cut off mid-literal here
        # (the opening `return [{...` with 'id'/'url'/'ext' entries is elided).
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos.

    NOTE(review): this chunk has lines elided (a None-guard, a statement cut
    mid-call and a truncated return dict); gaps are flagged inline.
    """
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id is only available inside the page markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        # NOTE(review): this call is cut off mid-statement (missing webpage/name args).
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The real media URLs live in a separate CVP XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # NOTE(review): this call and the returned info dict are cut off
        # mid-statement here.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # NOTE(review): this chunk has lines elided (None-guards, `else:` branch
    # introducers and a truncated return dict); gaps are flagged inline.
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Normalized page URL (the title part of the path is not required).
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        # NOTE(review): an `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # Empty server -> 'file' already holds a (urlencoded) absolute URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        # NOTE(review): the `else:` introducing this branch appears elided.
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        # NOTE(review): this call is cut off mid-statement (missing webpage/name args).
        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        # webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        # NOTE(review): the `if mobj:` / `else:` pair wrapping the next three
        # statements appears elided — as written they run unconditionally.
        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        video_upload_date = None
        self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the returned info dict is cut off mid-literal here
        # (the opening `return [{...` with 'id'/'url' entries is elided).
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    # NOTE(review): this chunk has lines elided (None-guards, try/except
    # wrappers and the assignment of `key`); gaps are flagged inline.
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # 'ax'/'ts' query parameters are expected by hypem's backend.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # Session cookie must be replayed on the /serve/source request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        # NOTE(review): a try/except around this JSON parse (mapping decode
        # errors to the raise below) appears elided.
        track_list = json.loads(html_tracks)
        track = track_list[u'tracks'][0]
        raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE(review): the assignment of `key` (read from the track dict and
        # used in serve_url below) appears elided here.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        # NOTE(review): a try/except around this JSON parse appears elided.
        song_data = json.loads(song_data_json)
        raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    # NOTE(review): this chunk has lines elided (a None-guard and a truncated
    # return dict); gaps are flagged inline.
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play: page only contains a javascript redirect to the real page.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # Media URL and thumbnail come from a POST to the magare.do endpoint.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response body looks like 'key=value&key=value'; keep only the values.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        # NOTE(review): the returned info dict is cut off mid-literal here
        # (the opening `return [{...` with 'id'/'url'/'title' entries is elided).
            'thumbnail': thumbnail_url,
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com (MTV-hosted media).

    NOTE(review): this chunk has lines elided (None-guards, an `else:` branch
    introducer, the closing quotes of the verbose regex and the tail of the
    return dict); gaps are flagged inline.
    """
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # full-episodes pages embed the mgid differently from videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        # NOTE(review): the `else:` introducing this branch appears elided.
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
            video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
            video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        # NOTE(review): this verbose raw string appears truncated — interior
        # lines and its closing quotes are elided in this chunk.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
<description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
<url>(?P<thumb>.*?)</url>.*
        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        # NOTE(review): an `if m_info is None:` guard appears elided before this raise.
        raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if m_urls is None or len(m_urls) == 0:
            # NOTE(review): 'ExtractError' is a typo for ExtractorError — this
            # branch raises NameError if ever reached ('extrat' is also a typo).
            raise ExtractError(u'Unable to extrat video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        # NOTE(review): the returned info dict is cut off mid-literal here
        # (the 'id'/'ext' entries and closing brace are elided).
        return {'url': video_url,
                'title': video_title,
                # Videos are actually flv not mp4
                'thumbnail': video_thumb,
                'description': video_description,
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the returned list is heavily elided in this chunk — the
    # `return [` opener, most entries and the closing `]` are missing; only
    # three entries are visible.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    The class is resolved from this module's namespace by appending the
    'IE' suffix, e.g. 'Youtube' -> YoutubeIE.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]