2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Delegate to the public setter so both code paths stay in sync.
        # NOTE(review): upstream also initialises a "ready" flag here;
        # a line appears to be missing from this chunk -- confirm.
        self.set_downloader(downloader)
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): upstream calls self.initialize() before the real
        # extraction; that line appears to be missing from this chunk.
        return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): the return below belongs to the IE_NAME property
        # in upstream youtube-dl (class name minus the trailing "IE");
        # the intervening lines appear to be missing from this chunk.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # Default progress note when the caller did not pass one.
        # NOTE(review): the "if note is None:" guard appears to be
        # missing from this chunk.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        # NOTE(review): the "try:" opening this network call is missing.
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Generic message when no errnote was supplied.
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of the Content-Type header.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): the "if m:" guard (and a default-encoding else
        # branch) appears to be missing from this chunk.
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # Plain string URLs have no get_full_url().
            # NOTE(review): the "try:" line is missing from this chunk.
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # Base64 keeps arbitrary page bytes console-safe.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
    # Methods for following issue #608.
    # They set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): upstream returns video_info here; the return
        # line appears to be missing from this chunk.
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      # NOTE(review): the 'url' and 'ie' items, the
                      # closing "}" and the final "return video_info"
                      # appear to be missing from this chunk.
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      # NOTE(review): the 'entries' item and the closing
                      # "}" appear to be missing from this chunk.
        # NOTE(review): likely guarded by "if playlist_id:" upstream.
        video_info['id'] = playlist_id
        # NOTE(review): likely guarded by "if playlist_title:" upstream;
        # the final "return video_info" also appears to be missing.
        video_info['title'] = playlist_title
    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        # NOTE(review): the "else:" branch iterating a list of patterns
        # is partially missing; the next line belongs inside that loop.
            mobj = re.search(p, string, flags)
        # Colourise the field name on ANSI-capable terminals.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        # NOTE(review): the "else: _name = name" fallback and the
        # "if mobj:" success guard are missing around the lines below.
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
        # NOTE(review): "return default" and the "elif fatal:" branching
        # appear to be partially missing from this chunk.
        raise ExtractorError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        # NOTE(review): upstream guards this with "if res:" and returns
        # res unchanged otherwise; those lines appear to be missing.
        return clean_html(res).strip()
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        # NOTE(review): guarded upstream by "if mobj is None:" -- the
        # guard line is missing from this chunk.
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix -> one result; 'all' -> the maximum; otherwise a
        # numeric count.  NOTE(review): the "if prefix == '':" header is
        # missing before the next line.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        # NOTE(review): the numeric branch ("else: n = int(prefix)" and
        # its "if n <= 0:" guard) is partially missing from this chunk.
        raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): the "_VALID_URL = r'''..." assignment opener for the
    # verbose multi-line regex that follows appears to be missing from
    # this chunk.
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
    # URL used to switch the interface to English (hl=en / gl=US).
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Age-gate confirmation endpoint.
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of a next_url redirect parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension.
    _video_extensions = {
        # NOTE(review): most entries and the closing "}" are missing
        # from this chunk.
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-dimension string.
    _video_dimensions = {
        # NOTE(review): the entries and the closing "}" are missing from
        # this chunk.
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
357 def report_video_subtitles_download(self, video_id):
358 """Report attempt to download video info webpage."""
359 self.to_screen(u'%s: Checking available subtitles' % video_id)
361 def report_video_subtitles_request(self, video_id, sub_lang, format):
362 """Report attempt to download video info webpage."""
363 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
374 def report_unavailable_format(self, video_id, format):
375 """Report extracted video URL."""
376 self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle language listing for *video_id*.

        On failure, returns an (error_message, None) tuple instead of
        raising -- callers check for a tuple result.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the "try:" opening this network call is missing
        # from this chunk.
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Build a {lang_code: human_readable_name} mapping.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): the final "return sub_lang_list" appears to be
        # missing from this chunk.
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return the subtitle as a tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            # NOTE(review): the query items ('lang', 'name', 'v', 'fmt')
            # and the closing "})" appear to be missing from this chunk.
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): the "try:" line is missing here.
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # NOTE(review): upstream guards the next return with
        # "if not sub:" -- the guard line is missing here.
        return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The embedded player configuration JSON carries the tts URL.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
        # NOTE(review): guarded upstream by "if mobj is None:" -- the
        # guard line is missing from this chunk.
        return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
        # NOTE(review): a "try:" opening the lookups below is missing.
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        params = compat_urllib_parse.urlencode({
            # NOTE(review): the parameter items and the closing "})"
            # appear to be missing from this chunk.
        subtitles_url = caption_url + '&' + params
        sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
        return [(None, sub_lang, sub)]
        # NOTE(review): the "except KeyError:" line is missing before
        # this fallback return.
        return [(err_msg, None, None)]
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language choice: the one requested, else English, else the
        # first available one.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # NOTE(review): "sub_lang = 'en'" and the "else:" header
            # appear to be missing around the next line.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # NOTE(review): the final "return [subtitle]" appears to be
        # missing from this chunk.
    def _extract_all_subtitles(self, video_id):
        """Download subtitles for every available language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): the "subtitles = []" initialiser appears to be
        # missing from this chunk.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # NOTE(review): the final "return subtitles" appears to be
        # missing from this chunk.
    def _print_formats(self, formats):
        """Print itag, extension and dimensions for each listed format."""
        print('Available formats:')
        # NOTE(review): the "for x in formats:" loop header appears to
        # be missing before the next line.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set the interface language, optionally log in, confirm age."""
        if self._downloader is None:
            # NOTE(review): an early "return" appears to be missing here.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the "try:" line is missing here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(review): the username/password unpack and "else:"
            # branch are missing before the next raise.
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force English UI; best effort only (warn, don't fail).
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the "try:" line is missing here.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the "if username is None: return" guard appears
        # to be missing here.

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # NOTE(review): the "try:" line is missing here.
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Google's login form embeds hidden GALX / dsh tokens.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the "dsh = match.group(1)" assignment and the
        # "login_form_strs = {" opener are missing before these items.
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # NOTE(review): the continuation of the comment above is missing.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # NOTE(review): the "try:" line is missing here.
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age by POSTing the verification form.
        # NOTE(review): the "age_form = {" opener is missing before
        # this item.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): the "try:" line is missing here.
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Pull the video id out of a YouTube URL via _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): guarded upstream by "if mobj is None:" -- the
        # guard line is missing from this chunk.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        # NOTE(review): the final "return video_id" appears to be
        # missing from this chunk.
    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): guarded upstream by "if mobj is not None:" --
        # the guard line is missing from this chunk.
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page.
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): the "try:" line is missing here.
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the "if mobj is not None:" guard and the
        # "else: player_url = None" branch are missing here.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several "el" values until get_video_info yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): a "break" appears to be missing here.
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): the "else:" header is missing here.
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader name (mandatory).
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # Uploader id (optional; warn on failure).
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # NOTE(review): the "if mobj is not None:" guard is missing here.
        video_uploader_id = mobj.group(1)
        # NOTE(review): the "else:" header is missing before this warning.
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # Title (mandatory).
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # Thumbnail (optional).
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            # NOTE(review): a fallback assignment for video_thumbnail
            # appears to be missing here.
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date (optional).
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # NOTE(review): "upload_date = None" and the "if mobj is not
        # None:" guard appear to be missing here.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

        # Description: page element first, meta tag as fallback.
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): the "else:" header is missing before the
        # meta-tag fallback below.
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        # NOTE(review): the "if fd_mobj is not None:" guard is missing.
        video_description = unescapeHTML(fd_mobj.group(1))
        # NOTE(review): the "else:" header is missing here.
        video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # NOTE(review): an "if video_subtitles:" guard is missing.
            (sub_error, sub_lang, sub) = video_subtitles[0]
            # NOTE(review): an "if sub_error:" guard is missing here.
            # We try with the automatic captions
            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
            (sub_error_auto, sub_lang, sub) = video_subtitles[0]
            # NOTE(review): the "if sub is not None: ... else:" branch
            # structure is missing around the next lines.
            # We report the original error
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                # NOTE(review): an "if sub_error:" guard is missing here.
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            # NOTE(review): an early "return" appears to be missing here.

        # Duration (optional).
        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            # NOTE(review): a fallback assignment and "else:" header
            # appear to be missing around the next line.
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        # NOTE(review): a "try:" likely wraps the player-config parsing
        # below; the "args = info['args']" assignment is also missing.
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        info = json.loads(mobj.group(1))
        # BUG: "or 'dashmpd'" is a non-empty string, hence always truthy,
        # so this condition is always True -- almost certainly meant
        # "or args.get('dashmpd')".  Cannot be fixed in this doc-only
        # pass on gutted code; flagging for follow-up.
        if args.get('ptk','') == 'vevo' or 'dashmpd':
            # Vevo videos with encrypted signatures
            self.to_screen(u'Vevo video detected.')
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # NOTE(review): a "url_map = {}" initialiser is missing here.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    # NOTE(review): the "elif 's' in url_data:" branch
                    # and the local "def k(s):" decryption helper header
                    # are missing around the snippet below.
                        """Decrypt the key the two subkeys must have a length of 43"""
                        b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
                        s_dec = '.'.join((a,b))[::-1]
                        key = k(url_data['s'][0])
                        url += '&signature=' + key
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): the "else:" header is missing here.
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # NOTE(review): an early "return" appears to be missing.
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): the "else:" header is missing here.
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                # NOTE(review): an "if rf in url_map:" guard and a
                # "break" are missing around the next line.
                video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        # NOTE(review): the "else:" header is missing before this raise.
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        # NOTE(review): a "results = []" initialiser is missing here.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the "results.append({" opener and the 'id'
            # entry are missing before this info-dict literal.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
        # NOTE(review): the closing "})" and "return results" are
        # missing from this chunk.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # group(1) is the video id, group(2) the simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Disclaimer page retrieved during _real_initialize.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint the age/family-filter form is POSTed to.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
829 def report_disclaimer(self):
830 """Report disclaimer retrieval."""
831 self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): the "try:" line is missing here.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the family-filter form.
        # NOTE(review): the "disclaimer_form = {" opener and its other
        # items are missing before this entry.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): the "try:" line is missing here.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): the "if mobj is not None:" guard is missing here.
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # NOTE(review): the "if mobj is None: video_url = mediaURL /
        # else:" structure is missing around the next lines.
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # NOTE(review): the "else:" (flashvars fallback) header is
        # missing before this branch.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # NOTE(review): the "if mobj is None:" guard is missing here.
        raise ExtractorError(u'Unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            raise ExtractorError(u'Unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
        # NOTE(review): the "if mobj is None:" guard is missing here.
        raise ExtractorError(u'Unable to extract media URL')
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): the "if mobj is None:" guard is missing here.
        raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode('utf-8') on a str fails under Python 3;
        # webpage is already text -- looks like a Python 2 leftover.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): the "if mobj is None:" guard is missing here.
        raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the "return [{" opener is missing before this
        # info-dict literal; the same Python 2 .decode() concern applies.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive match on dailymotion video URLs; group(1) holds
    # the raw id segment (may still carry a slug/query suffix).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Strip slug and query-string from the captured id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter to reach age-restricted videos.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): the "if mobj is None:" guard is missing here.
        raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities best-first; keep the first key present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): the "if key in flashvars:" guard and the
            # "max_quality = key" assignment are missing here.
            self.to_screen(u'Using %s' % key)
            # NOTE(review): a "break" and the loop's "else: raise"
            # structure are partially missing around the next raise.
        raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): the "if mobj is None:" guard is missing here.
        raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): the "if mobj is None:" guard is missing here.
        raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                # Looking for official user
                r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # NOTE(review): the "if mobj is not None:" guard is missing here.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the "return [{" opener with its 'id' and 'url'
        # entries is missing before this info-dict literal.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'
996 def _real_extract(self, url):
997 # Extract id from URL
998 mobj = re.match(self._VALID_URL, url)
1000 raise ExtractorError(u'Invalid URL: %s' % url)
1002 video_id = mobj.group('id')
1004 video_extension = mobj.group('ext')
1006 # Retrieve video webpage to extract further information
1007 webpage = self._download_webpage(url, video_id)
1009 # Extract URL, uploader, and title from webpage
1010 self.report_extraction(video_id)
1011 # We try first by looking the javascript code:
1012 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
1013 if mobj is not None:
1014 info = json.loads(mobj.group('json'))
1017 'url': info[u'downloadUrl'],
1018 'uploader': info[u'username'],
1019 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1020 'title': info[u'title'],
1021 'ext': video_extension,
1022 'thumbnail': info[u'thumbUrl'],
1025 # We try looking in other parts of the webpage
1026 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1027 webpage, u'video URL')
1029 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1031 raise ExtractorError(u'Unable to extract title')
1032 video_title = mobj.group(1).decode('utf-8')
1033 video_uploader = mobj.group(2).decode('utf-8')
1036 'id': video_id.decode('utf-8'),
1037 'url': video_url.decode('utf-8'),
1038 'uploader': video_uploader,
1039 'upload_date': None,
1040 'title': video_title,
1041 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    Two code paths: pages that declare ``Media.CONTENT_ID`` use a YQL JSON
    API; older pages fall back to the cosmos.bcst.yahoo.com MRSS feeds.

    NOTE(review): reconstructed from a corrupted listing; additionally fixes
    a real bug — ``m_rest.group(...)`` was called before the ``m_rest is
    None`` check, turning a failed match into an AttributeError instead of
    the intended ExtractorError.
    """
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Normalize the feed date (MM/DD/YYYY) to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: check for a failed match *before* dereferencing groups.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP wrapper before parsing.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title':video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the player ``config`` JSON embedded in the page and picks the
    best available codec/quality pair.

    NOTE(review): reconstructed from a corrupted listing (flattened
    indentation, dropped try/except and return lines) — verify against VCS.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            # Player/pro URLs are normalized to the canonical video page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Chains several regex scrapes (``grep_webpage``) to walk from the page
    to the stream description XML.

    NOTE(review): reconstructed from a corrupted listing; the third pattern
    of ``extractLiveStream``'s second grep and several return lines were
    missing and restored from context — verify against VCS. Note also that
    ``extractLiveStream`` computes ``video_url`` but returns nothing
    (original behavior kept; live URLs yield no info dict).
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body, wrapping network
        errors in ExtractorError."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map the groups listed in
        *matchTuples* (index, key, error-message) into a dict."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Language is encoded in the URL path.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First resolves redirects (URL shorteners) with a HEAD request, then
    scrapes the page with a chain of increasingly broad regexes.

    NOTE(review): reconstructed from a corrupted listing (dropped guard
    lines, flattened nested classes) — verify against VCS history.
    """
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener with only the handlers we need.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries.

    Pages through the gdata v2 JSON-C API 50 results at a time.

    NOTE(review): reconstructed from a corrupted listing (dropped loop
    initializers) — verify against VCS history.
    """
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # 'limit' is tightened once the API reports the true total.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries.

    Scrapes the regular Google web search (videos tab) ten results per page
    until *n* results are collected or no "next" link remains.

    NOTE(review): reconstructed from a corrupted listing (dropped dict
    initializers) — verify against VCS history.
    """
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop once we have enough results or the "next page" link is gone.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Uses the JSON search endpoint, 30 results per page.

    NOTE(review): reconstructed from a corrupted listing; ``m = info[u'm']``
    and the loop break/return lines were missing and restored from the use
    of ``m[u'last']``/``m[u'total']`` at the loop exit — verify against VCS.
    """

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when n results are collected or the API says this was the
            # last page. (Relies on 'i' leaking from the inner for-loop.)
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                break

        return res
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the gdata v2 JSON API and returns entries sorted by their
    playlist position.

    NOTE(review): reconstructed from a corrupted listing; interior lines of
    the verbose _VALID_URL regex and the paging-loop scaffolding were
    missing and restored from context — verify against VCS history.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose regex.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so we can sort by playlist order.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Downloads the first HTML channel page, then keeps fetching the
    JSON-based channel_ajax endpoint while a "load more" widget is present.

    NOTE(review): reconstructed from a corrupted listing (dropped loop
    scaffolding) — verify against VCS history.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids referenced by watch links in *page*,
        preserving first-seen order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the gdata uploads feed _GDATA_PAGE_SIZE entries at a time.

    NOTE(review): reconstructed from a corrupted listing (dropped loop
    scaffolding) — verify against VCS history.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric user id from the profile page, then pages through
    the mobile episode-list AJAX endpoint.

    NOTE(review): reconstructed from a corrupted listing (dropped loop
    scaffolding and _PAGE_SIZE constant) — verify against VCS history.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the 'Free download' button and scrapes the real
    download form action; on failure, surfaces the site's restriction
    message when one is present.

    NOTE(review): reconstructed from a corrupted listing (dropped try/else
    lines) — verify against VCS history.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in with credentials from the downloader params or
    .netrc, then extracts the swf variable blob from the video page.

    NOTE(review): reconstructed from a corrupted listing (dropped login
    form dict, early returns, and guard lines) — verify against VCS history.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video parameters live between these two javascript fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1966 class BlipTVIE(InfoExtractor):
1967 """Information extractor for blip.tv"""
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# some guards, `try:` headers and dict literals are not visible here.
1969 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Regex used to pull the filename extension off a media URL.
1970 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1971 IE_NAME = u'blip.tv'
1973 def report_direct_download(self, title):
1974 """Report information extraction."""
1975 self.to_screen(u'%s: Direct download detected' % title)
1977 def _real_extract(self, url):
1978 mobj = re.match(self._VALID_URL, url)
1980 raise ExtractorError(u'Invalid URL: %s' % url)
1982 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf#<id> URLs are rewritten to the /play/ form first.
1983 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1984 if api_mobj is not None:
1985 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1986 urlp = compat_urllib_parse_urlparse(url)
1987 if urlp.path.startswith('/play/'):
# /play/ pages redirect; the real file id sits in the redirect URL's
# fragment ('file' query key), then we recurse on the canonical URL.
1988 request = compat_urllib_request.Request(url)
1989 response = compat_urllib_request.urlopen(request)
1990 redirecturl = response.geturl()
1991 rurlp = compat_urllib_parse_urlparse(redirecturl)
1992 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1993 url = 'http://blip.tv/a/a-' + file_id
1994 return self._real_extract(url)
2001 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2002 request = compat_urllib_request.Request(json_url)
# The iTunes User-Agent is presumably required by blip.tv's JSON API —
# TODO confirm; it is set deliberately (see 'user_agent' below).
2003 request.add_header('User-Agent', 'iTunes/10.6.1')
2004 self.report_extraction(mobj.group(1))
2007 urlh = compat_urllib_request.urlopen(request)
2008 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2009 basename = url.split('/')[-1]
2010 title,ext = os.path.splitext(basename)
2011 title = title.decode('UTF-8')
2012 ext = ext.replace('.', '')
2013 self.report_direct_download(title)
2018 'upload_date': None,
2023 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2024 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2025 if info is None: # Regular URL
2027 json_code_bytes = urlh.read()
2028 json_code = json_code_bytes.decode('utf-8')
2029 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2030 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2033 json_data = json.loads(json_code)
# The API may wrap the payload in a 'Post' envelope.
2034 if 'Post' in json_data:
2035 data = json_data['Post']
# blip.tv datestamps look like '%m-%d-%y %H:%M%p'; normalized to YYYYMMDD.
2039 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2040 video_url = data['media']['url']
2041 umobj = re.match(self._URL_EXT, video_url)
2043 raise ValueError('Can not determine filename extension')
2044 ext = umobj.group(1)
2047 'id': data['item_id'],
2049 'uploader': data['display_name'],
2050 'upload_date': upload_date,
2051 'title': data['title'],
2053 'format': data['media']['mimeType'],
2054 'thumbnail': data['thumbnailUrl'],
2055 'description': data['description'],
2056 'player_url': data['embedUrl'],
2057 'user_agent': 'iTunes/10.6.1',
2059 except (ValueError,KeyError) as err:
2060 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2065 class MyVideoIE(InfoExtractor):
2066 """Information Extractor for myvideo.de."""
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# loop headers, guards and some assignments are not visible here.
2068 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2069 IE_NAME = u'myvideo'
2071 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2072 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2073 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher: visible here are the key-scheduling swap (box[i]/box[x])
# and part of the keystream generation XOR-ing into `out`.
2074 def __rc4crypt(self,data, key):
2076 box = list(range(256))
2077 for i in list(range(256)):
2078 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2079 box[i], box[x] = box[x], box[i]
2085 y = (y + box[x]) % 256
2086 box[x], box[y] = box[y], box[x]
2087 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# Hex MD5 digest of `s`, returned as bytes (used as the RC4 key below).
2091 return hashlib.md5(s).hexdigest().encode()
2093 def _real_extract(self,url):
2094 mobj = re.match(self._VALID_URL, url)
2096 raise ExtractorError(u'invalid URL: %s' % url)
2098 video_id = mobj.group(1)
# Static double-base64-encoded key material (decoded twice below as GK).
2101 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2102 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2103 b'TnpsbA0KTVRkbU1tSTRNdz09'
2107 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2108 webpage = self._download_webpage(webpage_url, video_id)
# Easy path: a plain <source src='…'> tag in the page → direct .flv URL.
2110 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2111 if mobj is not None:
2112 self.report_extraction(video_id)
2113 video_url = mobj.group(1) + '.flv'
2115 video_title = self._html_search_regex('<title>([^<]+)</title>',
2118 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2124 'upload_date': None,
2125 'title': video_title,
# Hard path: parse `flashvars`, fetch the encrypted XML and RC4-decrypt it.
2130 mobj = re.search('var flashvars={(.+?)}', webpage)
2132 raise ExtractorError(u'Unable to extract video')
# '_encxml' holds the (quoted) XML endpoint; everything else is a param.
2137 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2138 if not a == '_encxml':
2141 encxml = compat_urllib_parse.unquote(b)
2142 if not params.get('domain'):
2143 params['domain'] = 'www.myvideo.de'
2144 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV flash player variant is rerouted to the plain player XML.
2145 if 'flash_playertype=MTV' in xmldata_url:
2146 self._downloader.report_warning(u'avoiding MTV player')
2148 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2149 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Payload arrives hex-encoded after '='; RC4 key is derived from the
# double-base64-decoded GK blob plus the video id (via __md5, above).
2153 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2154 enc_data_b = binascii.unhexlify(enc_data)
2156 base64.b64decode(base64.b64decode(GK)) +
2158 str(video_id).encode('utf-8')
2161 dec_data = self.__rc4crypt(enc_data_b, sk)
2164 self.report_extraction(video_id)
# RTMP case: connectionurl present in the decrypted XML.
2167 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2169 video_url = compat_urllib_parse.unquote(mobj.group(1))
2170 if 'myvideo2flash' in video_url:
2171 self._downloader.report_warning(u'forcing RTMPT ...')
2172 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2175 # extract non rtmp videos
2176 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2178 raise ExtractorError(u'unable to extract url')
2179 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2181 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2182 video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m files become an rtmp play path '<ext>:<path>'; f4m files map
# to an HLS playlist by swapping the .f4m suffix for .m3u8.
2184 if not video_file.endswith('f4m'):
2185 ppath, prefix = video_file.split('.')
2186 video_playpath = '%s:%s' % (prefix, ppath)
2187 video_hls_playlist = ''
2190 video_hls_playlist = (
2191 video_filepath + video_file
2192 ).replace('.f4m', '.m3u8')
2194 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2195 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2197 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2203 'tc_url': video_url,
2205 'upload_date': None,
2206 'title': video_title,
2208 'play_path': video_playpath,
2209 'video_file': video_file,
2210 'video_hls_playlist': video_hls_playlist,
2211 'player_url': video_swfobj,
2215 class ComedyCentralIE(InfoExtractor):
2216 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# several guards, loop headers and dict literals are not visible here.
2218 # urls can be abbreviations like :thedailyshow or :colbert
2219 # urls for episodes like:
2220 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2221 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2222 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2223 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2224 |(https?://)?(www\.)?
2225 (?P<showname>thedailyshow|colbertnation)\.com/
2226 (full-episodes/(?P<episode>.*)|
2228 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2229 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates tried, lowest to highest; turls[-1] below picks the best.
2232 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2234 _video_extensions = {
2242 _video_dimensions = {
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
2252 def suitable(cls, url):
2253 """Receives a URL and returns True if suitable for this IE."""
2254 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2256 def _print_formats(self, formats):
2257 print('Available formats:')
2259 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2262 def _real_extract(self, url):
2263 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2265 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortnames (':tds', ':colbert', …) expand to the full-episodes index.
2267 if mobj.group('shortname'):
2268 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2269 url = u'http://www.thedailyshow.com/full-episodes/'
2271 url = u'http://www.colbertnation.com/full-episodes/'
2272 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2273 assert mobj is not None
2275 if mobj.group('clip'):
2276 if mobj.group('showname') == 'thedailyshow':
2277 epTitle = mobj.group('tdstitle')
2279 epTitle = mobj.group('cntitle')
2282 dlNewest = not mobj.group('episode')
2284 epTitle = mobj.group('showname')
2286 epTitle = mobj.group('episode')
2288 self.report_extraction(epTitle)
2289 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Follow the server redirect and re-match to get the concrete episode.
2291 url = htmlHandle.geturl()
2292 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2294 raise ExtractorError(u'Invalid redirected URL: ' + url)
2295 if mobj.group('episode') == '':
2296 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2297 epTitle = mobj.group('episode')
2299 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2301 if len(mMovieParams) == 0:
2302 # The Colbert Report embeds the information in a without
2303 # a URL prefix; so extract the alternate reference
2304 # and then add the URL prefix manually.
2306 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2307 if len(altMovieParams) == 0:
2308 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2310 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2312 uri = mMovieParams[0][1]
# Episode index (MRSS feed) lists one <item> per show part.
2313 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2314 indexXml = self._download_webpage(indexUrl, epTitle,
2315 u'Downloading show index',
2316 u'unable to download episode index')
2320 idoc = xml.etree.ElementTree.fromstring(indexXml)
2321 itemEls = idoc.findall('.//item')
2322 for partNum,itemEl in enumerate(itemEls):
2323 mediaId = itemEl.findall('./guid')[0].text
2324 shortMediaId = mediaId.split(':')[-1]
2325 showId = mediaId.split(':')[-2].replace('.com', '')
2326 officialTitle = itemEl.findall('./title')[0].text
2327 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Per-part config XML lists one <rendition> per bitrate.
2329 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2330 compat_urllib_parse.urlencode({'uri': mediaId}))
2331 configXml = self._download_webpage(configUrl, epTitle,
2332 u'Downloading configuration for %s' % shortMediaId)
2334 cdoc = xml.etree.ElementTree.fromstring(configXml)
2336 for rendition in cdoc.findall('.//rendition'):
2337 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2341 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2344 if self._downloader.params.get('listformats', None):
2345 self._print_formats([i[0] for i in turls])
2348 # For now, just pick the highest bitrate
2349 format,rtmp_video_url = turls[-1]
2351 # Get the format arg from the arg stream
2352 req_format = self._downloader.params.get('format', None)
2354 # Select format if we can find one
2357 format, rtmp_video_url = f, v
# Rewrite the rtmp(e) URL to the equivalent HTTP mirror on llnwd.net.
2360 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2362 raise ExtractorError(u'Cannot transform RTMP url')
2363 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2364 video_url = base + m.group('finalid')
2366 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2371 'upload_date': officialDate,
2376 'description': officialTitle,
2378 results.append(info)
2383 class EscapistIE(InfoExtractor):
2384 """Information extractor for The Escapist """
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# a few guards and dict lines are not visible here.
2386 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2387 IE_NAME = u'escapist'
2389 def _real_extract(self, url):
2390 mobj = re.match(self._VALID_URL, url)
2392 raise ExtractorError(u'Invalid URL: %s' % url)
2393 showName = mobj.group('showname')
2394 videoId = mobj.group('episode')
2396 self.report_extraction(videoId)
2397 webpage = self._download_webpage(url, videoId)
# Description/thumbnail/player URL come from <meta> tags in the page.
2399 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2400 webpage, u'description', fatal=False)
2402 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2403 webpage, u'thumbnail', fatal=False)
2405 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2406 webpage, u'player url')
# Page titles look like 'Show : Episode'; keep the part after ' : '.
2408 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2409 webpage, u'player url').split(' : ')[-1]
# The player URL carries a quoted config URL in its 'config=' query part.
2411 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2412 configUrl = compat_urllib_parse.unquote(configUrl)
2414 configJSON = self._download_webpage(configUrl, videoId,
2415 u'Downloading configuration',
2416 u'unable to download configuration')
2418 # Technically, it's JavaScript, not JSON
# Single → double quotes so json.loads can parse the JS object literal.
2419 configJSON = configJSON.replace("'", '"')
2422 config = json.loads(configJSON)
2423 except (ValueError,) as err:
2424 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# The actual media URL is the second playlist entry.
2426 playlist = config['playlist']
2427 videoUrl = playlist[1]['url']
2432 'uploader': showName,
2433 'upload_date': None,
2436 'thumbnail': imgUrl,
2437 'description': videoDesc,
2438 'player_url': playerUrl,
2443 class CollegeHumorIE(InfoExtractor):
2444 """Information extractor for collegehumor.com"""
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# some try/except scaffolding and dict lines are not visible here.
2447 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2448 IE_NAME = u'collegehumor'
2450 def report_manifest(self, video_id):
2451 """Report information extraction."""
2452 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2454 def _real_extract(self, url):
2455 mobj = re.match(self._VALID_URL, url)
2457 raise ExtractorError(u'Invalid URL: %s' % url)
2458 video_id = mobj.group('videoid')
2463 'upload_date': None,
2466 self.report_extraction(video_id)
# Step 1: moogaloop metadata XML → title/description/thumbnail/manifest.
2467 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2469 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2470 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2471 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2473 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2475 videoNode = mdoc.findall('./video')[0]
2476 info['description'] = videoNode.findall('./description')[0].text
2477 info['title'] = videoNode.findall('./caption')[0].text
2478 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2479 manifest_url = videoNode.findall('./file')[0].text
2481 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: Adobe HDS (f4m) manifest; hdcore param presumably required by
# the CDN — TODO confirm.
2483 manifest_url += '?hdcore=2.10.3'
2484 self.report_manifest(video_id)
2486 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2487 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2488 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2490 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m namespace-qualified <media>/<id> elements carry the fragment ids.
2492 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2493 node_id = media_node.attrib['url']
2494 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2495 except IndexError as err:
2496 raise ExtractorError(u'Invalid manifest file')
2498 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Compose the direct fragment URL from the manifest host and ids.
2499 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2506 class XVideosIE(InfoExtractor):
2507 """Information extractor for xvideos.com"""
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# some dict lines of the returned info are not visible here.
2509 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2510 IE_NAME = u'xvideos'
2512 def _real_extract(self, url):
2513 mobj = re.match(self._VALID_URL, url)
2515 raise ExtractorError(u'Invalid URL: %s' % url)
2516 video_id = mobj.group(1)
2518 webpage = self._download_webpage(url, video_id)
2520 self.report_extraction(video_id)
# The flash player's flv_url parameter is URL-quoted in the page source.
2523 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2524 webpage, u'video URL'))
# Page title has the form '<title> - XVID…'; keep the leading part.
2527 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2530 # Extract video thumbnail
2531 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2532 webpage, u'thumbnail', fatal=False)
2538 'upload_date': None,
2539 'title': video_title,
2541 'thumbnail': video_thumbnail,
2542 'description': None,
2548 class SoundcloudIE(InfoExtractor):
2549 """Information extractor for soundcloud.com
2550 To access the media, the uid of the song and a stream token
2551 must be extracted from the page source and the script must make
2552 a request to media.soundcloud.com/crossdomain.xml. Then
2553 the media can be grabbed by requesting from an url composed
2554 of the stream token and uid
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# the docstring terminator and some dict lines are not visible here.
2557 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2558 IE_NAME = u'soundcloud'
2560 def report_resolve(self, video_id):
2561 """Report information extraction."""
2562 self.to_screen(u'%s: Resolving id' % video_id)
2564 def _real_extract(self, url):
2565 mobj = re.match(self._VALID_URL, url)
2567 raise ExtractorError(u'Invalid URL: %s' % url)
2569 # extract uploader (which is in the url)
2570 uploader = mobj.group(1)
2571 # extract simple title (uploader + slug of song title)
2572 slug_title = mobj.group(2)
2573 simple_title = uploader + u'-' + slug_title
2574 full_title = '%s/%s' % (uploader, slug_title)
2576 self.report_resolve(full_title)
# resolve.json maps the human URL to the API track object (with its id).
2578 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2579 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2580 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2582 info = json.loads(info_json)
2583 video_id = info['id']
2584 self.report_extraction(full_title)
# The streams endpoint returns concrete media URLs; pick the 128k MP3.
2586 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2587 stream_json = self._download_webpage(streams_url, full_title,
2588 u'Downloading stream definitions',
2589 u'unable to download stream definitions')
2591 streams = json.loads(stream_json)
2592 mediaURL = streams['http_mp3_128_url']
2593 upload_date = unified_strdate(info['created_at'])
2598 'uploader': info['user']['username'],
2599 'upload_date': upload_date,
2600 'title': info['title'],
2602 'description': info['description'],
2605 class SoundcloudSetIE(InfoExtractor):
2606 """Information extractor for soundcloud.com sets
2607 To access the media, the uid of the song and a stream token
2608 must be extracted from the page source and the script must make
2609 a request to media.soundcloud.com/crossdomain.xml. Then
2610 the media can be grabbed by requesting from an url composed
2611 of the stream token and uid
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# the docstring terminator and some dict/return lines are not visible.
2614 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2615 IE_NAME = u'soundcloud:set'
2617 def report_resolve(self, video_id):
2618 """Report information extraction."""
2619 self.to_screen(u'%s: Resolving id' % video_id)
2621 def _real_extract(self, url):
2622 mobj = re.match(self._VALID_URL, url)
2624 raise ExtractorError(u'Invalid URL: %s' % url)
2626 # extract uploader (which is in the url)
2627 uploader = mobj.group(1)
2628 # extract simple title (uploader + slug of song title)
2629 slug_title = mobj.group(2)
2630 simple_title = uploader + u'-' + slug_title
2631 full_title = '%s/sets/%s' % (uploader, slug_title)
2633 self.report_resolve(full_title)
# Resolve the set URL to the API playlist object (with its track list).
2635 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2636 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2637 info_json = self._download_webpage(resolv_url, full_title)
2640 info = json.loads(info_json)
2641 if 'errors' in info:
2642 for err in info['errors']:
2643 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2646 self.report_extraction(full_title)
# One streams lookup per track in the set; collect one info dict each.
2647 for track in info['tracks']:
2648 video_id = track['id']
2650 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2651 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2653 self.report_extraction(video_id)
2654 streams = json.loads(stream_json)
2655 mediaURL = streams['http_mp3_128_url']
2660 'uploader': track['user']['username'],
2661 'upload_date': unified_strdate(track['created_at']),
2662 'title': track['title'],
2664 'description': track['description'],
2669 class InfoQIE(InfoExtractor):
2670 """Information extractor for infoq.com"""
2671 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2673 def _real_extract(self, url):
2674 mobj = re.match(self._VALID_URL, url)
2676 raise ExtractorError(u'Invalid URL: %s' % url)
2678 webpage = self._download_webpage(url, video_id=url)
2679 self.report_extraction(url)
# The page embeds a base64-encoded, URL-quoted stream id in 'jsclassref'.
2682 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2684 raise ExtractorError(u'Unable to extract video url')
2685 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2686 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2689 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2692 # Extract description
2693 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2694 webpage, u'description', fatal=False)
# Video id and extension are taken from the URL's final path component.
2696 video_filename = video_url.split('/')[-1]
2697 video_id, extension = video_filename.split('.')
2703 'upload_date': None,
2704 'title': video_title,
2705 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2707 'description': video_description,
2712 class MixcloudIE(InfoExtractor):
2713 """Information extractor for www.mixcloud.com"""
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# try/except scaffolding and some loop bodies are not visible here.
2715 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2716 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2717 IE_NAME = u'mixcloud'
2719 def report_download_json(self, file_id):
2720 """Report JSON download."""
2721 self.to_screen(u'Downloading json')
# Returns the URL list for a format; 'best' (or an unknown bitrate)
# resolves to the highest available bitrate. Formats without bitrate
# sub-dicts raise TypeError on indexing and fall back to the flat list.
2723 def get_urls(self, jsonData, fmt, bitrate='best'):
2724 """Get urls from 'audio_formats' section in json"""
2727 bitrate_list = jsonData[fmt]
2728 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2729 bitrate = max(bitrate_list) # select highest
2731 url_list = jsonData[fmt][bitrate]
2732 except TypeError: # we have no bitrate info.
2733 url_list = jsonData[fmt]
2736 def check_urls(self, url_list):
2737 """Returns 1st active url from list"""
2738 for url in url_list:
# Probe each candidate; network errors mean "try the next one".
2740 compat_urllib_request.urlopen(url)
2742 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2747 def _print_formats(self, formats):
2748 print('Available formats:')
2749 for fmt in formats.keys():
2750 for b in formats[fmt]:
2752 ext = formats[fmt][b][0]
2753 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2754 except TypeError: # we have no bitrate info
2755 ext = formats[fmt][0]
2756 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2759 def _real_extract(self, url):
2760 mobj = re.match(self._VALID_URL, url)
2762 raise ExtractorError(u'Invalid URL: %s' % url)
2763 # extract uploader & filename from url
2764 uploader = mobj.group(1).decode('utf-8')
2765 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2767 # construct API request
2768 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2769 # retrieve .json file with links to files
2770 request = compat_urllib_request.Request(file_url)
2772 self.report_download_json(file_url)
2773 jsonData = compat_urllib_request.urlopen(request).read()
2774 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2775 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2778 json_data = json.loads(jsonData)
2779 player_url = json_data['player_swf_url']
2780 formats = dict(json_data['audio_formats'])
2782 req_format = self._downloader.params.get('format', None)
2785 if self._downloader.params.get('listformats', None):
2786 self._print_formats(formats)
# No explicit format (or 'best'): take the first format whose URL list
# yields a live URL; otherwise honor the requested format exactly.
2789 if req_format is None or req_format == 'best':
2790 for format_param in formats.keys():
2791 url_list = self.get_urls(formats, format_param)
2793 file_url = self.check_urls(url_list)
2794 if file_url is not None:
2797 if req_format not in formats:
2798 raise ExtractorError(u'Format is not available')
2800 url_list = self.get_urls(formats, req_format)
2801 file_url = self.check_urls(url_list)
2802 format_param = req_format
2805 'id': file_id.decode('utf-8'),
2806 'url': file_url.decode('utf-8'),
2807 'uploader': uploader.decode('utf-8'),
2808 'upload_date': None,
2809 'title': json_data['name'],
2810 'ext': file_url.split('.')[-1].decode('utf-8'),
2811 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2812 'thumbnail': json_data['thumbnail_url'],
2813 'description': json_data['description'],
2814 'player_url': player_url.decode('utf-8'),
2817 class StanfordOpenClassroomIE(InfoExtractor):
2818 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# some try/except scaffolding and dict literals are not visible here.
2820 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2821 IE_NAME = u'stanfordoc'
# Three cases, selected by which URL query groups matched:
# course+video → one video; course only → course page (list of
# references); neither → site root (list of course references).
2823 def _real_extract(self, url):
2824 mobj = re.match(self._VALID_URL, url)
2826 raise ExtractorError(u'Invalid URL: %s' % url)
2828 if mobj.group('course') and mobj.group('video'): # A specific video
2829 course = mobj.group('course')
2830 video = mobj.group('video')
2832 'id': course + '_' + video,
2834 'upload_date': None,
2837 self.report_extraction(info['id'])
2838 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2839 xmlUrl = baseUrl + video + '.xml'
2841 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2842 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2843 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2844 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2846 info['title'] = mdoc.findall('./title')[0].text
2847 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2849 raise ExtractorError(u'Invalid metadata XML file')
2850 info['ext'] = info['url'].rpartition('.')[2]
2852 elif mobj.group('course'): # A course page
2853 course = mobj.group('course')
2858 'upload_date': None,
2861 coursepage = self._download_webpage(url, info['id'],
2862 note='Downloading course info page',
2863 errnote='Unable to download course info page')
2865 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2867 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2868 coursepage, u'description', fatal=False)
# Each VideoPage link becomes a 'reference' entry, re-extracted below.
2870 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2873 'type': 'reference',
2874 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2878 for entry in info['list']:
2879 assert entry['type'] == 'reference'
2880 results += self.extract(entry['url'])
2884 'id': 'Stanford OpenClassroom',
2887 'upload_date': None,
2890 self.report_download_webpage(info['id'])
2891 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2893 rootpage = compat_urllib_request.urlopen(rootURL).read()
2894 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2895 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2897 info['title'] = info['id']
# Each CoursePage link becomes a 'reference' entry, re-extracted below.
2899 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2902 'type': 'reference',
2903 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2908 for entry in info['list']:
2909 assert entry['type'] == 'reference'
2910 results += self.extract(entry['url'])
2913 class MTVIE(InfoExtractor):
2914 """Information extractor for MTV.com"""
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# some guards and dict lines are not visible here.
2916 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2919 def _real_extract(self, url):
2920 mobj = re.match(self._VALID_URL, url)
2922 raise ExtractorError(u'Invalid URL: %s' % url)
# Protocol is optional in _VALID_URL; default to plain http.
2923 if not mobj.group('proto'):
2924 url = 'http://' + url
2925 video_id = mobj.group('videoid')
2927 webpage = self._download_webpage(url, video_id)
# Song/artist/content metadata come from mtv_* <meta> tags on the page.
2929 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2930 webpage, u'song name', fatal=False)
2932 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2935 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2936 webpage, u'mtvn_uri', fatal=False)
2938 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2939 webpage, u'content id', fatal=False)
# mediaGen XML lists one <rendition> per available quality.
2941 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2942 self.report_extraction(video_id)
2943 request = compat_urllib_request.Request(videogen_url)
2945 metadataXml = compat_urllib_request.urlopen(request).read()
2946 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2947 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2949 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2950 renditions = mdoc.findall('.//rendition')
2952 # For now, always pick the highest quality.
2953 rendition = renditions[-1]
# Format string: '<ext>-<width>x<height>_<bitrate>' from rendition attrs.
2956 _,_,ext = rendition.attrib['type'].partition('/')
2957 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2958 video_url = rendition.find('./src').text
2960 raise ExtractorError('Invalid rendition field.')
2965 'uploader': performer,
2966 'upload_date': None,
2967 'title': video_title,
2975 class YoukuIE(InfoExtractor):
2976 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# NOTE(review): sampled chunk — embedded original line numbers jump, so
# some method headers (e.g. for the sid generator) and guards are not
# visible here.
# Session id: current time in ms plus two random components.
2979 nowTime = int(time.time() * 1000)
2980 random1 = random.randint(1000,1998)
2981 random2 = random.randint(1000,9999)
2983 return "%d%d%d" %(nowTime,random1,random2)
# Deterministically shuffles the charset using the server-provided seed
# (linear-congruential step, one draw per remaining character).
2985 def _get_file_ID_mix_string(self, seed):
2987 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2989 for i in range(len(source)):
2990 seed = (seed * 211 + 30031 ) % 65536
2991 index = math.floor(seed / 65536 * len(source) )
2992 mixed.append(source[int(index)])
2993 source.remove(source[int(index)])
2994 #return ''.join(mixed)
# Decode the obfuscated file id: each '*'-separated token indexes into
# the seed-shuffled charset from _get_file_ID_mix_string.
2997 def _get_file_id(self, fileId, seed):
2998 mixed = self._get_file_ID_mix_string(seed)
2999 ids = fileId.split('*')
3003 realId.append(mixed[int(ch)])
3004 return ''.join(realId)
3006 def _real_extract(self, url):
3007 mobj = re.match(self._VALID_URL, url)
3009 raise ExtractorError(u'Invalid URL: %s' % url)
3010 video_id = mobj.group('ID')
3012 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3014 jsondata = self._download_webpage(info_url, video_id)
3016 self.report_extraction(video_id)
3018 config = json.loads(jsondata)
3020 video_title = config['data'][0]['title']
3021 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 when available; 'worst' path is
# only partially visible in this sampled chunk.
3023 format = self._downloader.params.get('format', None)
3024 supported_format = list(config['data'][0]['streamfileids'].keys())
3026 if format is None or format == 'best':
3027 if 'hd2' in supported_format:
3032 elif format == 'worst':
3040 fileid = config['data'][0]['streamfileids'][format]
3041 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3042 except (UnicodeDecodeError, ValueError, KeyError):
3043 raise ExtractorError(u'Unable to extract info section')
3046 sid = self._gen_sid()
3047 fileid = self._get_file_id(fileid, seed)
3049 #column 8,9 of fileid represent the segment number
3050 #fileid[7:9] should be changed
# One download URL per segment; the segment index is spliced into the
# file id as a two-digit hex value.
3051 for index, key in enumerate(keys):
3053 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3054 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3057 'id': '%s_part%02d' % (video_id, index),
3058 'url': download_url,
3060 'upload_date': None,
3061 'title': video_title,
3064 files_info.append(info)
3069 class XNXXIE(InfoExtractor):
3070 """Information extractor for xnxx.com"""
3072 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping regexes: percent-encoded flv URL, <title> text, thumbnail URL.
3074 VIDEO_URL_RE = r'flv_url=(.*?)&'
3075 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3076 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3078 def _real_extract(self, url):
3079 mobj = re.match(self._VALID_URL, url)
3081 raise ExtractorError(u'Invalid URL: %s' % url)
3082 video_id = mobj.group(1)
3084 # Get webpage content
3085 webpage = self._download_webpage(url, video_id)
# The flv URL in the page is percent-encoded; unquote it before use.
3087 video_url = self._search_regex(self.VIDEO_URL_RE,
3088 webpage, u'video URL')
3089 video_url = compat_urllib_parse.unquote(video_url)
3091 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
# Thumbnail is optional (fatal=False): missing thumbnail is not an error.
3094 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3095 webpage, u'thumbnail', fatal=False)
3101 'upload_date': None,
3102 'title': video_title,
3104 'thumbnail': video_thumbnail,
3105 'description': None,
3109 class GooglePlusIE(InfoExtractor):
3110 """Information extractor for plus.google.com."""
3112 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3113 IE_NAME = u'plus.google'
3115 def _real_extract(self, url):
3116 # Extract id from URL
3117 mobj = re.match(self._VALID_URL, url)
3119 raise ExtractorError(u'Invalid URL: %s' % url)
3121 post_url = mobj.group(0)
3122 video_id = mobj.group(1)
3124 video_extension = 'flv'
3126 # Step 1, Retrieve post webpage to extract further information
3127 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3129 self.report_extraction(video_id)
3131 # Extract update date
3132 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3133 webpage, u'upload date', fatal=False)
3135 # Convert timestring to a format suitable for filename
# Re-format the scraped timestamp to YYYYMMDD for the info dict.
# NOTE(review): strptime here presumes the page delivers exactly
# "%Y-%m-%d"; a fatal=False miss above would pass None and raise.
3136 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3137 upload_date = upload_date.strftime('%Y%m%d')
3140 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3141 webpage, u'uploader', fatal=False)
3144 # Get the first line for title
3145 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3146 webpage, 'title', default=u'NA')
3148 # Step 2, Stimulate clicking the image box to launch video
3149 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3150 webpage, u'video page URL')
3151 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3153 # Extract video links on video page
3154 """Extract video links of all sizes"""
3155 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3156 mobj = re.findall(pattern, webpage)
3158 raise ExtractorError(u'Unable to extract video links')
3160 # Sort in resolution
3161 links = sorted(mobj)
3163 # Choose the lowest of the sort, i.e. highest resolution
3164 video_url = links[-1]
3165 # Only get the url. The resolution part in the tuple has no use anymore
3166 video_url = video_url[-1]
3167 # Treat escaped \u0026 style hex
# Py2 strings expose .decode; Py3 str does not, hence the AttributeError
# fallback that round-trips through bytes to apply unicode-escape.
3169 video_url = video_url.decode("unicode_escape")
3170 except AttributeError: # Python 3
3171 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3177 'uploader': uploader,
3178 'upload_date': upload_date,
3179 'title': video_title,
3180 'ext': video_extension,
# Information extractor for nba.com video pages; the mp4 URL is built
# directly from the page path (no stream negotiation needed).
3183 class NBAIE(InfoExtractor):
3184 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3187 def _real_extract(self, url):
3188 mobj = re.match(self._VALID_URL, url)
3190 raise ExtractorError(u'Invalid URL: %s' % url)
3192 video_id = mobj.group(1)
3194 webpage = self._download_webpage(url, video_id)
# CDN URL is derived from the path captured by _VALID_URL (group 1).
3196 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3198 shortened_video_id = video_id.rpartition('/')[2]
3199 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3200 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3202 # It isn't there in the HTML it returns to us
3203 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3205 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3208 'id': shortened_video_id,
3212 # 'uploader_date': uploader_date,
3213 'description': description,
3217 class JustinTVIE(InfoExtractor):
3218 """Information extractor for justin.tv and twitch.tv"""
3219 # TODO: One broadcast may be split into multiple videos. The key
3220 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3221 # starts at 1 and increases. Can we treat all parts as one video?
# URL forms handled: a bare channel, a /b/<id> archived broadcast,
# or a /c/<id> chapter.
3223 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3225 (?P<channelid>[^/]+)|
3226 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3227 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when paging through a channel's archive API.
3231 _JUSTIN_PAGE_LIMIT = 100
3232 IE_NAME = u'justin.tv'
3234 def report_download_page(self, channel, offset):
3235 """Report attempt to download a single page of videos."""
3236 self.to_screen(u'%s: Downloading video information from %d to %d' %
3237 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3239 # Return count of items, list of *valid* items
3240 def _parse_page(self, url, video_id):
3241 webpage = self._download_webpage(url, video_id,
3242 u'Downloading video info JSON',
3243 u'unable to download video info JSON')
# The API returns a JSON list on success; a dict signals an error payload.
3245 response = json.loads(webpage)
3246 if type(response) != list:
3247 error_text = response.get('error', 'unknown error')
3248 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3250 for clip in response:
3251 video_url = clip['video_file_url']
3253 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from the date part -> YYYYMMDD.
3254 video_date = re.sub('-', '', clip['start_time'][:10])
3255 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3256 video_id = clip['id']
3257 video_title = clip.get('title', video_id)
3261 'title': video_title,
3262 'uploader': clip.get('channel_name', video_uploader_id),
3263 'uploader_id': video_uploader_id,
3264 'upload_date': video_date,
3265 'ext': video_extension,
3267 return (len(response), info)
3269 def _real_extract(self, url):
3270 mobj = re.match(self._VALID_URL, url)
3272 raise ExtractorError(u'invalid URL: %s' % url)
3274 api_base = 'http://api.justin.tv'
# Channel URL: page through the channel's archive listing.
3276 if mobj.group('channelid'):
3278 video_id = mobj.group('channelid')
3279 api = api_base + '/channel/archives/%s.json' % video_id
# Chapter URL: resolve the chapter to its parent broadcast archive.
3280 elif mobj.group('chapterid'):
3281 chapter_id = mobj.group('chapterid')
3283 webpage = self._download_webpage(url, chapter_id)
3284 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3286 raise ExtractorError(u'Cannot find archive of a chapter')
3287 archive_id = m.group(1)
3289 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3290 chapter_info_xml = self._download_webpage(api, chapter_id,
3291 note=u'Downloading chapter information',
3292 errnote=u'Chapter information download failed')
3293 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3294 for a in doc.findall('.//archive'):
3295 if archive_id == a.find('./id').text:
3298 raise ExtractorError(u'Could not find chapter in chapter information')
3300 video_url = a.find('./video_file_url').text
3301 video_ext = video_url.rpartition('.')[2] or u'flv'
# Fetch human-readable chapter metadata from the newer Kraken API.
3303 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3304 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3305 note='Downloading chapter metadata',
3306 errnote='Download of chapter metadata failed')
3307 chapter_info = json.loads(chapter_info_json)
3309 bracket_start = int(doc.find('.//bracket_start').text)
3310 bracket_end = int(doc.find('.//bracket_end').text)
3312 # TODO determine start (and probably fix up file)
3313 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3314 #video_url += u'?start=' + TODO:start_timestamp
3315 # bracket_start is 13290, but we want 51670615
3316 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3317 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3320 'id': u'c' + chapter_id,
3323 'title': chapter_info['title'],
3324 'thumbnail': chapter_info['preview'],
3325 'description': chapter_info['description'],
3326 'uploader': chapter_info['channel']['display_name'],
3327 'uploader_id': chapter_info['channel']['name'],
# Plain broadcast URL: single archive lookup by id.
3331 video_id = mobj.group('videoid')
3332 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3334 self.report_extraction(video_id)
# Page through the API until a short page indicates the end.
3338 limit = self._JUSTIN_PAGE_LIMIT
3341 self.report_download_page(video_id, offset)
3342 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3343 page_count, page_info = self._parse_page(page_url, video_id)
3344 info.extend(page_info)
3345 if not paged or page_count != limit:
# Information extractor for funnyordie.com video pages.
3350 class FunnyOrDieIE(InfoExtractor):
3351 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3353 def _real_extract(self, url):
3354 mobj = re.match(self._VALID_URL, url)
3356 raise ExtractorError(u'invalid URL: %s' % url)
3358 video_id = mobj.group('id')
3359 webpage = self._download_webpage(url, video_id)
# The second <source> inside <video> carries the downloadable URL.
3361 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3362 webpage, u'video URL', flags=re.DOTALL)
# Try the player heading first, then fall back to the page <title>.
3364 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3365 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3367 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3368 webpage, u'description', fatal=False, flags=re.DOTALL)
3375 'description': video_description,
# Information extractor for store.steampowered.com game/video pages.
# Returns a playlist of all trailers found on the game's video page.
3379 class SteamIE(InfoExtractor):
3380 _VALID_URL = r"""http://store\.steampowered\.com/
3382 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3384 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3386 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Pre-filled agecheck query string bypasses Steam's birth-date gate.
3387 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# suitable() is overridden because _VALID_URL needs re.VERBOSE.
3390 def suitable(cls, url):
3391 """Receives a URL and returns True if suitable for this IE."""
3392 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3394 def _real_extract(self, url):
3395 m = re.match(self._VALID_URL, url, re.VERBOSE)
3396 gameID = m.group('gameID')
3398 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3399 webpage = self._download_webpage(videourl, gameID)
# If the age gate appears, re-fetch through the agecheck URL.
3401 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3402 videourl = self._AGECHECK_TEMPLATE % gameID
3403 self.report_age_confirmation()
3404 webpage = self._download_webpage(videourl, gameID)
3406 self.report_extraction(gameID)
3407 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3408 webpage, 'game title')
# Three parallel scans over the page: movie JS blobs, titles, thumbnails.
3410 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3411 mweb = re.finditer(urlRE, webpage)
3412 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3413 titles = re.finditer(namesRE, webpage)
3414 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3415 thumbs = re.finditer(thumbsRE, webpage)
# zip() pairs the three iterators positionally; assumes same order/count.
3417 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3418 video_id = vid.group('videoID')
3419 title = vtitle.group('videoName')
3420 video_url = vid.group('videoURL')
3421 video_thumb = thumb.group('thumbnail')
3423 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3428 'title': unescapeHTML(title),
3429 'thumbnail': video_thumb
3432 return [self.playlist_result(videos, gameID, game_title)]
# Information extractor for recorded videos on ustream.tv.
3434 class UstreamIE(InfoExtractor):
3435 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3436 IE_NAME = u'ustream'
3438 def _real_extract(self, url):
3439 m = re.match(self._VALID_URL, url)
3440 video_id = m.group('videoID')
# Direct CDN URL derived from the numeric recording id.
3442 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3443 webpage = self._download_webpage(url, video_id)
3445 self.report_extraction(video_id)
3447 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3450 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3451 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3453 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3454 webpage, u'thumbnail', fatal=False)
3460 'title': video_title,
3461 'uploader': uploader,
3462 'thumbnail': thumbnail,
# Information extractor for worldstarhiphop.com (and the "candy" mirror).
3466 class WorldStarHipHopIE(InfoExtractor):
3467 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3468 IE_NAME = u'WorldStarHipHop'
3470 def _real_extract(self, url):
3471 m = re.match(self._VALID_URL, url)
3472 video_id = m.group('id')
3474 webpage_src = self._download_webpage(url, video_id)
# The flash player is configured via so.addVariable("file", ...).
3476 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3477 webpage_src, u'video URL')
3479 if 'mp4' in video_url:
3484 video_title = self._html_search_regex(r"<title>(.*)</title>",
3485 webpage_src, u'title')
3487 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3488 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3489 webpage_src, u'thumbnail', fatal=False)
# Candy pages keep the real title in a candytitles span instead.
3492 _title = r"""candytitles.*>(.*)</span>"""
3493 mobj = re.search(_title, webpage_src)
3494 if mobj is not None:
3495 video_title = mobj.group(1)
3500 'title' : video_title,
3501 'thumbnail' : thumbnail,
# Information extractor for rbmaradio.com shows; all metadata comes from
# a JSON blob assigned to window.gon in the page source.
3506 class RBMARadioIE(InfoExtractor):
3507 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3509 def _real_extract(self, url):
3510 m = re.match(self._VALID_URL, url)
3511 video_id = m.group('videoID')
3513 webpage = self._download_webpage(url, video_id)
3515 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3516 webpage, u'json data', flags=re.MULTILINE)
3519 data = json.loads(json_data)
3520 except ValueError as e:
3521 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbps variant; extension comes from the URL path.
3523 video_url = data['akamai_url'] + '&cbr=256'
3524 url_parts = compat_urllib_parse_urlparse(video_url)
3525 video_ext = url_parts.path.rpartition('.')[2]
3530 'title': data['title'],
3531 'description': data.get('teaser_text'),
3532 'location': data.get('country_of_origin'),
3533 'uploader': data.get('host', {}).get('name'),
3534 'uploader_id': data.get('host', {}).get('slug'),
3535 'thumbnail': data.get('image', {}).get('large_url_2x'),
3536 'duration': data.get('duration'),
3541 class YouPornIE(InfoExtractor):
3542 """Information extractor for youporn.com."""
3543 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3545 def _print_formats(self, formats):
3546 """Print all available formats"""
3547 print(u'Available formats:')
3548 print(u'ext\t\tformat')
3549 print(u'---------------------------------')
3550 for format in formats:
3551 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single format dict matching the requested format string.
3553 def _specific(self, req_format, formats):
3555 if(x["format"]==req_format):
3559 def _real_extract(self, url):
3560 mobj = re.match(self._VALID_URL, url)
3562 raise ExtractorError(u'Invalid URL: %s' % url)
3563 video_id = mobj.group('videoid')
# The age gate is bypassed by presenting the age_verified cookie.
3565 req = compat_urllib_request.Request(url)
3566 req.add_header('Cookie', 'age_verified=1')
3567 webpage = self._download_webpage(req, video_id)
3569 # Get JSON parameters
3570 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3572 params = json.loads(json_params)
3574 raise ExtractorError(u'Invalid JSON')
3576 self.report_extraction(video_id)
3578 video_title = params['title']
3579 upload_date = unified_strdate(params['release_date_f'])
3580 video_description = params['description']
3581 video_uploader = params['submitted_by']
3582 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): sys.exc_info()[1] is an exception object; concatenating
# it to a str raises TypeError on Python 3 — should be wrapped in str().
3584 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3586 # Get all of the formats available
3587 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3588 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3589 webpage, u'download list').strip()
3591 # Get all of the links from the page
3592 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3593 links = re.findall(LINK_RE, download_list_html)
3594 if(len(links) == 0):
3595 raise ExtractorError(u'ERROR: no known formats available for video')
3597 self.to_screen(u'Links found: %d' % len(links))
3602 # A link looks like this:
3603 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3604 # A path looks like this:
3605 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3606 video_url = unescapeHTML( link )
3607 path = compat_urllib_parse_urlparse( video_url ).path
3608 extension = os.path.splitext( path )[1][1:]
# Format label "<resolution>-<bitrate>" comes from path segment 4.
3609 format = path.split('/')[4].split('_')[:2]
3612 format = "-".join( format )
3613 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3618 'uploader': video_uploader,
3619 'upload_date': upload_date,
3620 'title': video_title,
3623 'thumbnail': thumbnail,
3624 'description': video_description
# --list-formats: print the table and stop.
3627 if self._downloader.params.get('listformats', None):
3628 self._print_formats(formats)
3631 req_format = self._downloader.params.get('format', None)
3632 self.to_screen(u'Format: %s' % req_format)
# Format selection: 'best' (default), 'worst', 'all'/-1, or an exact label.
3634 if req_format is None or req_format == 'best':
3636 elif req_format == 'worst':
3637 return [formats[-1]]
3638 elif req_format in ('-1', 'all'):
3641 format = self._specific( req_format, formats )
3643 raise ExtractorError(u'Requested format not available')
3648 class PornotubeIE(InfoExtractor):
3649 """Information extractor for pornotube.com."""
3650 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3652 def _real_extract(self, url):
3653 mobj = re.match(self._VALID_URL, url)
3655 raise ExtractorError(u'Invalid URL: %s' % url)
# Title comes straight from the URL path, not from the page.
3657 video_id = mobj.group('videoid')
3658 video_title = mobj.group('title')
3660 # Get webpage content
3661 webpage = self._download_webpage(url, video_id)
3664 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3665 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3666 video_url = compat_urllib_parse.unquote(video_url)
3668 #Get the uploaded date
3669 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3670 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# Normalize the scraped date only when one was found (fatal=False above).
3671 if upload_date: upload_date = unified_strdate(upload_date)
3673 info = {'id': video_id,
3676 'upload_date': upload_date,
3677 'title': video_title,
3683 class YouJizzIE(InfoExtractor):
3684 """Information extractor for youjizz.com."""
3685 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3687 def _real_extract(self, url):
3688 mobj = re.match(self._VALID_URL, url)
3690 raise ExtractorError(u'Invalid URL: %s' % url)
3692 video_id = mobj.group('videoid')
3694 # Get webpage content
3695 webpage = self._download_webpage(url, video_id)
3697 # Get the video title
3698 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3699 webpage, u'title').strip()
3701 # Get the embed page
# The real media URL lives on a separate numeric embed page.
3702 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3704 raise ExtractorError(u'ERROR: unable to extract embed page')
3706 embed_page_url = result.group(0).strip()
3707 video_id = result.group('videoid')
3709 webpage = self._download_webpage(embed_page_url, video_id)
3712 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3713 webpage, u'video URL')
3715 info = {'id': video_id,
3717 'title': video_title,
# player_url is needed for rtmpdump-style playback of the embed player.
3720 'player_url': embed_page_url}
# Information extractor for 8tracks.com mixes; walks the play/next API
# track by track until at_last_track is reported.
3724 class EightTracksIE(InfoExtractor):
3726 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3728 def _real_extract(self, url):
3729 mobj = re.match(self._VALID_URL, url)
3731 raise ExtractorError(u'Invalid URL: %s' % url)
3732 playlist_id = mobj.group('id')
3734 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded in the page as "PAGE.mix = {...};".
3736 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3737 data = json.loads(json_like)
# Random session token required by the play API.
3739 session = str(random.randint(0, 1000000000))
3741 track_count = data['tracks_count']
# NOTE(review): mix_id is not assigned in the visible code — presumably
# set from `data` (e.g. data['id']) in an elided line; verify.
3742 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3743 next_url = first_url
3745 for i in itertools.count():
3746 api_json = self._download_webpage(next_url, playlist_id,
3747 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3748 errnote=u'Failed to download song information')
3749 api_data = json.loads(api_json)
3750 track_data = api_data[u'set']['track']
3752 'id': track_data['id'],
3753 'url': track_data['track_file_stream_url'],
3754 'title': track_data['performer'] + u' - ' + track_data['name'],
3755 'raw_title': track_data['name'],
3756 'uploader_id': data['user']['login'],
# Stop once the API flags the final track; otherwise request the next one.
3760 if api_data['set']['at_last_track']:
3762 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Information extractor for keek.com; video and thumbnail URLs are built
# directly from the id on the CDN.
3765 class KeekIE(InfoExtractor):
3766 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3769 def _real_extract(self, url):
3770 m = re.match(self._VALID_URL, url)
3771 video_id = m.group('videoID')
3773 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3774 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3775 webpage = self._download_webpage(url, video_id)
3777 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3780 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3781 webpage, u'uploader', fatal=False)
3787 'title': video_title,
3788 'thumbnail': thumbnail,
3789 'uploader': uploader
# Information extractor for ted.com talks and playlists.
3793 class TEDIE(InfoExtractor):
3794 _VALID_URL=r'''http://www\.ted\.com/
3796 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3798 ((?P<type_talk>talks)) # We have a simple talk
3800 (/lang/(.*?))? # The url may contain the language
3801 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because _VALID_URL needs re.VERBOSE.
3805 def suitable(cls, url):
3806 """Receives a URL and returns True if suitable for this IE."""
3807 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3809 def _real_extract(self, url):
# Dispatch: single talk vs. playlist of talks.
3810 m=re.match(self._VALID_URL, url, re.VERBOSE)
3811 if m.group('type_talk'):
3812 return [self._talk_info(url)]
3814 playlist_id=m.group('playlist_id')
3815 name=m.group('name')
3816 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3817 return [self._playlist_videos_info(url,name,playlist_id)]
3819 def _playlist_videos_info(self,url,name,playlist_id=0):
3820 '''Returns the videos of the playlist'''
# NOTE(review): the ([.\s]*?) groups match only dots/whitespace — likely
# intended as [\s\S]*? ("anything incl. newlines"); confirm against pages.
3822 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3823 ([.\s]*?)data-playlist_item_id="(\d+)"
3824 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3826 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3827 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3828 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3829 m_names=re.finditer(video_name_RE,webpage)
3831 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3832 webpage, 'playlist title')
# Each playlist entry is deferred to the TED extractor via url_result.
3834 playlist_entries = []
3835 for m_video, m_name in zip(m_videos,m_names):
3836 video_id=m_video.group('video_id')
3837 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3838 playlist_entries.append(self.url_result(talk_url, 'TED'))
3839 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3841 def _talk_info(self, url, video_id=0):
3842 """Return the video for the talk in the url"""
3843 m = re.match(self._VALID_URL, url,re.VERBOSE)
3844 video_name = m.group('name')
3845 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3846 self.report_extraction(video_name)
3847 # If the url includes the language we get the title translated
3848 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3850 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3851 webpage, 'json data')
3852 info = json.loads(json_data)
3853 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3854 webpage, 'description', flags = re.DOTALL)
3856 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3857 webpage, 'thumbnail')
# Last htmlStream entry is used; presumably the highest quality — verify.
3860 'url': info['htmlStreams'][-1]['file'],
3863 'thumbnail': thumbnail,
3864 'description': desc,
# Information extractor for myspass.de; all metadata comes from the
# site's XML metadata endpoint keyed by the numeric video id.
3868 class MySpassIE(InfoExtractor):
3869 _VALID_URL = r'http://www.myspass.de/.*'
3871 def _real_extract(self, url):
3872 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3874 # video id is the last path element of the URL
3875 # usually there is a trailing slash, so also try the second but last
3876 url_path = compat_urllib_parse_urlparse(url).path
3877 url_parent_path, video_id = os.path.split(url_path)
3879 _, video_id = os.path.split(url_parent_path)
3882 metadata_url = META_DATA_URL_TEMPLATE % video_id
3883 metadata_text = self._download_webpage(metadata_url, video_id)
# Re-encode to bytes before XML parsing; fromstring expects encoded input
# for documents with an XML declaration.
3884 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3886 # extract values from metadata
3887 url_flv_el = metadata.find('url_flv')
3888 if url_flv_el is None:
3889 raise ExtractorError(u'Unable to extract download url')
3890 video_url = url_flv_el.text
3891 extension = os.path.splitext(video_url)[1][1:]
3892 title_el = metadata.find('title')
3893 if title_el is None:
3894 raise ExtractorError(u'Unable to extract title')
3895 title = title_el.text
# format/description/thumbnail are optional elements in the XML.
3896 format_id_el = metadata.find('format_id')
3897 if format_id_el is None:
3900 format = format_id_el.text
3901 description_el = metadata.find('description')
3902 if description_el is not None:
3903 description = description_el.text
3906 imagePreview_el = metadata.find('imagePreview')
3907 if imagePreview_el is not None:
3908 thumbnail = imagePreview_el.text
3917 'thumbnail': thumbnail,
3918 'description': description
# Information extractor for spiegel.de videos; stream variants come from
# a per-video XML manifest on video2.spiegel.de.
3922 class SpiegelIE(InfoExtractor):
3923 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3925 def _real_extract(self, url):
3926 m = re.match(self._VALID_URL, url)
3927 video_id = m.group('videoID')
3929 webpage = self._download_webpage(url, video_id)
3931 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3934 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3935 xml_code = self._download_webpage(xml_url, video_id,
3936 note=u'Downloading XML', errnote=u'Failed to download XML')
3938 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last <type> child is taken; presumably the best variant — verify.
3939 last_type = idoc[-1]
3940 filename = last_type.findall('./filename')[0].text
3941 duration = float(last_type.findall('./duration')[0].text)
3943 video_url = 'http://video2.spiegel.de/flash/' + filename
3944 video_ext = filename.rpartition('.')[2]
3949 'title': video_title,
3950 'duration': duration,
# Information extractor for liveleak.com view pages.
3954 class LiveLeakIE(InfoExtractor):
3956 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3957 IE_NAME = u'liveleak'
3959 def _real_extract(self, url):
3960 mobj = re.match(self._VALID_URL, url)
3962 raise ExtractorError(u'Invalid URL: %s' % url)
3964 video_id = mobj.group('video_id')
3966 webpage = self._download_webpage(url, video_id)
# Media URL sits in the embedded player config ("file: ...").
3968 video_url = self._search_regex(r'file: "(.*?)",',
3969 webpage, u'video URL')
# Strip the site-branding prefix from the og:title value.
3971 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3972 webpage, u'title').replace('LiveLeak.com -', '').strip()
3974 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3975 webpage, u'description', fatal=False)
3977 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3978 webpage, u'uploader', fatal=False)
3984 'title': video_title,
3985 'description': video_description,
3986 'uploader': video_uploader
# Information extractor for the ARD Mediathek; streams are declared via
# mediaCollection.addMediaStream(...) JS calls in the page.
3991 class ARDIE(InfoExtractor):
3992 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3993 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3994 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3996 def _real_extract(self, url):
3997 # determine video id from url
# Prefer the numeric documentId query parameter when present.
3998 m = re.match(self._VALID_URL, url)
4000 numid = re.search(r'documentId=([0-9]+)', url)
4002 video_id = numid.group(1)
4004 video_id = m.group('video_id')
4006 # determine title and media streams from webpage
4007 html = self._download_webpage(url, video_id)
4008 title = re.search(self._TITLE, html).group('title')
4009 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker means the video is age-restricted.
4011 assert '"fsk"' in html
4012 raise ExtractorError(u'This video is only available after 8:00 pm')
4014 # choose default media type and highest quality for now
4015 stream = max([s for s in streams if int(s["media_type"]) == 0],
4016 key=lambda s: int(s["quality"]))
4018 # there's two possibilities: RTMP stream or HTTP download
4019 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4020 if stream['rtmp_url']:
4021 self.to_screen(u'RTMP download detected')
4022 assert stream['video_url'].startswith('mp4:')
4023 info["url"] = stream["rtmp_url"]
# play_path is the rtmpdump play path within the RTMP application.
4024 info["play_path"] = stream['video_url']
4026 assert stream["video_url"].endswith('.mp4')
4027 info["url"] = stream["video_url"]
# Information extractor for the ZDF Mediathek; picks a wstreaming
# variant, then resolves it to an mms:// or rtsp:// media URL.
4030 class ZDFIE(InfoExtractor):
4031 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4032 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4033 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4034 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4035 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4037 def _real_extract(self, url):
4038 mobj = re.match(self._VALID_URL, url)
4040 raise ExtractorError(u'Invalid URL: %s' % url)
4041 video_id = mobj.group('video_id')
4043 html = self._download_webpage(url, video_id)
4044 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4046 raise ExtractorError(u'No media url found.')
4048 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4049 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4050 # choose first/default media type and highest quality for now
4051 for s in streams: #find 300 - dsl1000mbit
4052 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4055 for s in streams: #find veryhigh - dsl2000mbit
4056 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4060 raise ExtractorError(u'No stream found.')
# The chosen stream URL returns a small playlist/asx document that
# contains the actual mms:// (or rtsp://) media link.
4062 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4064 self.report_extraction(video_id)
4065 mobj = re.search(self._TITLE, html)
4067 raise ExtractorError(u'Cannot extract title')
4068 title = unescapeHTML(mobj.group('title'))
4070 mobj = re.search(self._MMS_STREAM, media_link)
4072 mobj = re.search(self._RTSP_STREAM, media_link)
4074 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4075 mms_url = mobj.group('video_url')
# Extension is taken from the final component of the media URL.
4077 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4079 raise ExtractorError(u'Cannot extract extention')
4080 ext = mobj.group('ext')
4082 return [{'id': video_id,
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr-hosted videos."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # normalize to the canonical post URL before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # the video URL/extension are embedded in escaped (\x22-quoted) markup
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free (downloadable) Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # renamed from `id` so the builtin is not shadowed
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # metadata (including the mp4 URL) lives in the player's MRSS feed
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # the per-photo secret is required by both playlist endpoints below
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # the numeric id is only available inside the page markup
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # no server: 'file' is a full (percent-encoded) URL
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # a timestamped request is required; the Set-Cookie from this
        # response authorizes the /serve/source call below
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    "mp3",
            'title':  title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # the play: page only contains a JS redirect to the real page
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # the media URL must be requested via a form POST
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # response looks like key=value&key=value; split out the two values
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       'flv',
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com (MTV feed based)."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # full episodes embed the mgid differently from regular videos/reviews
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''
        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # re.finditer wrapped in list() is never None; just check emptiness.
        # Also fixed: the original raised the nonexistent name `ExtractError`.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
4568 def gen_extractors():
4569     """ Return a list of an instance of every supported extractor.
4570     The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): this copy of the function appears truncated — the docstring
# terminator, the `return [` line, and most of the extractor instances are
# missing; only three list entries survive below. Restore the full ordered
# list from the complete file before running.
4573     YoutubePlaylistIE(),
4598     StanfordOpenClassroomIE(),
4608     WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an extractor class by its short name (e.g. 'Youtube' -> YoutubeIE)."""
    return globals()['%sIE' % ie_name]