9 from .common import InfoExtractor, SearchInfoExtractor
10 from .subtitles import SubtitlesInfoExtractor
16 compat_urllib_request,
# NOTE(review): this file is a line-numbered extraction with many source lines
# elided (gaps in the embedded numbering) and indentation lost. Code lines are
# preserved byte-for-byte below; only comments are added.
#
# Base class shared by the YouTube extractors: interface-language selection,
# Google account login and age-gate confirmation.
27 class YoutubeBaseInfoExtractor(InfoExtractor):
28 """Provide base functions for Youtube extractors"""
29 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
30 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
31 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
32 _NETRC_MACHINE = 'youtube'
33 # If True it will raise an error if no login info is provided
34 _LOGIN_REQUIRED = False
# Log that a language-setting request is about to be made.
36 def report_lang(self):
37 """Report attempt to set language."""
38 self.to_screen(u'Setting language')
# Force hl=en/gl=US via _LANG_URL so scraped pages are in English.
# Network failures only emit a warning (lines between 41 and 44, presumably
# the try: statement, are elided here).
40 def _set_language(self):
41 request = compat_urllib_request.Request(self._LANG_URL)
44 compat_urllib_request.urlopen(request).read()
45 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
46 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# _login (def line elided): POSTs the Google ServiceLogin form. Credentials
# come from _get_login_info(); GALX/dsh hidden-form tokens are scraped from
# the fetched login page. Only warns on failure unless _LOGIN_REQUIRED.
51 (username, password) = self._get_login_info()
52 # No authentication to be performed
54 if self._LOGIN_REQUIRED:
55 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
58 request = compat_urllib_request.Request(self._LOGIN_URL)
60 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
61 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
62 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# Anti-forgery tokens embedded as hidden <input> fields on the login page.
67 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
70 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Partial login form (several key/value lines elided around these):
76 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
80 u'PersistentCookie': u'yes',
82 u'bgresponse': u'js_disabled',
83 u'checkConnection': u'',
84 u'checkedDomains': u'youtube',
90 u'signIn': u'Sign in',
92 u'service': u'youtube',
96 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
98 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
99 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
100 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
103 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, authentication failed.
104 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
105 self._downloader.report_warning(u'unable to log in: bad username or password')
107 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
108 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# POST the age-verification confirmation form; unlike login, failure here
# raises (age-gated videos cannot be fetched otherwise).
112 def _confirm_age(self):
115 'action_confirm': 'Confirm',
117 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
119 self.report_age_confirmation()
120 compat_urllib_request.urlopen(request).read().decode('utf-8')
121 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
122 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Run language + login setup once before extraction; bails out early if
# either step reports failure (the return statements are elided here).
125 def _real_initialize(self):
126 if self._downloader is None:
128 if not self._set_language():
130 if not self._login():
# Main single-video extractor. Inherits login/age-gate handling from
# YoutubeBaseInfoExtractor and subtitle plumbing from SubtitlesInfoExtractor.
135 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
136 IE_DESC = u'YouTube.com'
# Body of the verbose _VALID_URL regex; the opening assignment line
# (presumably _VALID_URL = r"""...) and the closing quotes are elided.
139 (?:https?://)? # http(s):// (optional)
140 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
141 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
142 (?:.*?\#/)? # handle anchor (#/) redirect urls
143 (?: # the various things that can precede the ID:
144 (?:(?:v|embed|e)/) # v/ or embed/ or e/
145 |(?: # or the v= param in all its forms
146 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
147 (?:\?|\#!?) # the params delimiter ? or # or #!
148 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
152 |youtu\.be/ # just youtu.be/xxxx
154 )? # all until now is optional -> you can pass the naked ID
155 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
156 (?(1).+)? # if we found the ID, everything can follow
158 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
159 # Listed in order of quality
160 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
161 # Apple HTTP Live Streaming
162 '96', '95', '94', '93', '92', '132', '151',
163 # Lines for additional itag groups (3D / DASH) are elided here.
164 '85', '84', '102', '83', '101', '82', '100',
166 '138', '137', '248', '136', '247', '135', '246',
167 '245', '244', '134', '243', '133', '242', '160',
169 '141', '172', '140', '171', '139',
# Same itags, but free (webm) formats ranked ahead of equal-quality mp4.
171 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
172 # Apple HTTP Live Streaming
173 '96', '95', '94', '93', '92', '132', '151',
175 '85', '102', '84', '101', '83', '100', '82',
177 '138', '248', '137', '247', '136', '246', '245',
178 '244', '135', '243', '134', '242', '133', '160',
180 '172', '141', '171', '140', '139',
# Container name -> itags in that container, best quality first.
182 _video_formats_map = {
183 'flv': ['35', '34', '6', '5'],
184 '3gp': ['36', '17', '13'],
185 'mp4': ['38', '37', '22', '18'],
186 'webm': ['46', '45', '44', '43'],
# Bodies of the itag->extension and itag->dimensions tables are elided.
188 _video_extensions = {
210 # Apple HTTP Live Streaming
242 _video_dimensions = {
# _TESTS entries (dict delimiters between entries are elided):
324 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
325 u"file": u"BaW_jenozKc.mp4",
327 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
328 u"uploader": u"Philipp Hagemeister",
329 u"uploader_id": u"phihag",
330 u"upload_date": u"20121002",
331 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
335 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
336 u"file": u"1ltcDfZMA3U.flv",
337 u"note": u"Test VEVO video (#897)",
339 u"upload_date": u"20070518",
340 u"title": u"Maps - It Will Find You",
341 u"description": u"Music video by Maps performing It Will Find You.",
342 u"uploader": u"MuteUSA",
343 u"uploader_id": u"MuteUSA"
347 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
348 u"file": u"UxxajLWwzqY.mp4",
349 u"note": u"Test generic use_cipher_signature video (#897)",
351 u"upload_date": u"20120506",
352 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
353 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
354 u"uploader": u"Icona Pop",
355 u"uploader_id": u"IconaPop"
359 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
360 u"file": u"07FYdnEawAQ.mp4",
361 u"note": u"Test VEVO video with age protection (#956)",
363 u"upload_date": u"20130703",
364 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
365 u"description": u"md5:64249768eec3bc4276236606ea996373",
366 u"uploader": u"justintimberlakeVEVO",
367 u"uploader_id": u"justintimberlakeVEVO"
371 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
372 u'file': u'TGi3HqYrWHE.mp4',
373 u'note': u'm3u8 video',
375 u'title': u'Triathlon - Men - London 2012 Olympic Games',
376 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
377 u'uploader': u'olympic',
378 u'upload_date': u'20120807',
379 u'uploader_id': u'olympic',
# Download of the m3u8 test is skipped (stream content is not stable).
382 u'skip_download': True,
# Presumably decorated with @classmethod in the original (decorator line
# elided). Playlist URLs are ceded to YoutubePlaylistIE before matching.
389 def suitable(cls, url):
390 """Receives a URL and returns True if suitable for this IE."""
391 if YoutubePlaylistIE.suitable(url): return False
392 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_video_webpage_download(self, video_id):
    """Log that the watch-page download for this video is starting."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Log that the get_video_info download for this video is starting."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Log that metadata extraction for this video is starting."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Log that a requested format is not available for this video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Log that the download will go over the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
# Static scramble tables for the 2013-era encrypted "s" parameter: each
# return line reorders/slices the signature string for one specific
# signature length. NOTE(review): the `if len(s) == N:` guard preceding
# each return (and likely an `elif` chain) is elided from this extraction,
# so the length each permutation applies to cannot be confirmed here.
414 def _decrypt_signature(self, s):
415 """Turn the encrypted s field into a working signature"""
418 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
420 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
422 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
424 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
426 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
428 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
430 return s[40] + s[82:43:-1] + s[22] + s[42:40:-1] + s[83] + s[39:22:-1] + s[0] + s[21:2:-1]
432 return s[81:36:-1] + s[0] + s[35:2:-1]
434 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
436 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82]
438 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
440 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
442 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
# Unknown lengths are unsupported; the algorithm tables change server-side.
445 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
# Age-gated videos are served by a different player with its own scramble;
# the `if len(s) == N:` guard before the return is elided here.
447 def _decrypt_signature_age_gate(self, s):
448 # The videos with age protection use another player, so the algorithms
451 return s[2:63] + s[82] + s[64:82] + s[63]
# (sic) "algortihms" typo is in the original comment line below.
453 # Fallback to the other algortihms
454 return self._decrypt_signature(s)
# List available subtitle tracks via the timedtext list endpoint and build
# {language_code: timedtext_url}. Several lines (try:, the dict initializer,
# loop header over lang_list, and the return) are elided in this extraction.
456 def _get_available_subtitles(self, video_id):
458 sub_list = self._download_webpage(
459 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
460 video_id, note=False)
461 except ExtractorError as err:
462 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
# Each <track> element carries a display name and a language code.
464 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
469 params = compat_urllib_parse.urlencode({
472 'fmt': self._downloader.params.get('subtitlesformat'),
474 url = u'http://www.youtube.com/api/timedtext?' + params
475 sub_lang_list[lang] = url
476 if not sub_lang_list:
477 self._downloader.report_warning(u'video doesn\'t have subtitles')
# Fetch auto-generated captions. The ttsurl/timestamp needed to build the
# caption URL only appear in the embedded ytplayer.config JSON, hence the
# webpage parameter. Returns {} on failure (return lines elided here).
481 def _request_automatic_caption(self, video_id, webpage):
482 """We need the webpage for getting the captions url, pass it as an
483 argument to speed up the process."""
# First requested subtitle language, defaulting to English.
484 sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0]
485 sub_format = self._downloader.params.get('subtitlesformat')
486 self.to_screen(u'%s: Looking for automatic captions' % video_id)
487 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
488 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
490 self._downloader.report_warning(err_msg)
492 player_config = json.loads(mobj.group(1))
494 args = player_config[u'args']
495 caption_url = args[u'ttsurl']
496 timestamp = args[u'timestamp']
# Query-parameter lines of this urlencode call are elided.
497 params = compat_urllib_parse.urlencode({
504 subtitles_url = caption_url + '&' + params
505 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
506 return {sub_lang: sub}
507 # An extractor error can be raise by the download process if there are
508 # no automatic captions but there are subtitles
509 except (KeyError, ExtractorError):
510 self._downloader.report_warning(err_msg)
# Print one "itag : extension [dimensions](special)" line per format; the
# `for x in formats:` loop header is elided in this extraction.
513 def _print_formats(self, formats):
514 print('Available formats:')
516 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
517 self._video_dimensions.get(x, '???'),
518 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
# Pull the 11-character video id out of any supported URL form. Group 2 of
# _VALID_URL is the id capture; the `if mobj is None:` guard and the final
# `return video_id` are elided in this extraction.
520 def _extract_id(self, url):
521 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
523 raise ExtractorError(u'Invalid URL: %s' % url)
524 video_id = mobj.group(2)
# Resolve the user's --format request against the formats actually present
# in url_map. Docstring delimiters and some else:/try: lines are elided.
527 def _get_video_url_list(self, url_map):
529 Transform a dictionary in the format {itag:url} to a list of (itag, url)
530 with the requested formats.
532 req_format = self._downloader.params.get('format', None)
533 format_limit = self._downloader.params.get('format_limit', None)
# Quality ranking depends on whether free (webm) formats are preferred.
534 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
535 if format_limit is not None and format_limit in available_formats:
536 format_list = available_formats[available_formats.index(format_limit):]
538 format_list = available_formats
539 existing_formats = [x for x in format_list if x in url_map]
540 if len(existing_formats) == 0:
541 raise ExtractorError(u'no known formats available for video')
542 if self._downloader.params.get('listformats', None):
543 self._print_formats(existing_formats)
545 if req_format is None or req_format == 'best':
546 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
547 elif req_format == 'worst':
548 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
549 elif req_format in ('-1', 'all'):
550 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
552 # Specific formats. We pick the first in a slash-delimited sequence.
553 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
554 # available in the specified format. For example,
555 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
556 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
557 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
558 req_formats = req_format.split('/')
559 video_url_list = None
560 for rf in req_formats:
562 video_url_list = [(rf, url_map[rf])]
# Container names expand to their itag list, best quality first.
564 if rf in self._video_formats_map:
565 for srf in self._video_formats_map[rf]:
567 video_url_list = [(srf, url_map[srf])]
572 if video_url_list is None:
573 raise ExtractorError(u'requested format not available')
574 return video_url_list
# Build {itag: url} from an HLS master manifest: every non-comment line is
# a variant URL whose itag is embedded in its path. The url_map initializer
# and the final return are elided in this extraction.
576 def _extract_from_m3u8(self, manifest_url, video_id):
578 def _get_urls(_manifest):
579 lines = _manifest.split('\n')
# Keep non-empty lines that are not '#'-prefixed manifest tags.
580 urls = filter(lambda l: l and not l.startswith('#'),
583 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
584 formats_urls = _get_urls(manifest)
585 for format_url in formats_urls:
586 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
587 url_map[itag] = format_url
# Main extraction pipeline: resolve redirects, fetch the watch page and
# get_video_info, pull metadata, decrypt signatures if needed, and return
# one result per selected format. Heavily elided in this extraction (try:,
# else:, mobj guards, dict/list delimiters); code lines kept byte-for-byte.
590 def _real_extract(self, url):
# Catch the classic unquoted-shell-URL mistake (& truncated the URL).
591 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
592 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
594 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
595 mobj = re.search(self._NEXT_URL_RE, url)
597 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
598 video_id = self._extract_id(url)
# --- watch page ---
601 self.report_video_webpage_download(video_id)
602 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
603 request = compat_urllib_request.Request(url)
605 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
606 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
607 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
# 'ignore' avoids crashing on stray non-UTF-8 bytes in the page.
609 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
611 # Attempt to extract SWF player URL
612 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON backslash escapes in the matched URL.
614 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# --- get_video_info (two paths: age-gated vs normal) ---
619 self.report_video_info_webpage_download(video_id)
620 if re.search(r'player-age-gate-content">', video_webpage) is not None:
621 self.report_age_confirmation()
623 # We simulate the access to the video from www.youtube.com/v/{video_id}
624 # this can be viewed without login into Youtube
625 data = compat_urllib_parse.urlencode({'video_id': video_id,
629 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
633 video_info_url = 'https://www.youtube.com/get_video_info?' + data
634 video_info_webpage = self._download_webpage(video_info_url, video_id,
636 errnote='unable to download video info webpage')
637 video_info = compat_parse_qs(video_info_webpage)
# Normal path: retry with several 'el' values until a token appears.
640 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
641 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
642 % (video_id, el_type))
643 video_info_webpage = self._download_webpage(video_info_url, video_id,
645 errnote='unable to download video info webpage')
646 video_info = compat_parse_qs(video_info_webpage)
647 if 'token' in video_info:
649 if 'token' not in video_info:
650 if 'reason' in video_info:
651 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
653 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
655 # Check for "rental" videos
656 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
657 raise ExtractorError(u'"rental" videos not supported')
659 # Start extracting information
660 self.report_information_extraction(video_id)
# --- metadata ---
663 if 'author' not in video_info:
664 raise ExtractorError(u'Unable to extract uploader name')
665 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
668 video_uploader_id = None
# NOTE(review): the dots in "www.youtube.com" are unescaped here, so they
# match any character — harmless in practice but worth confirming upstream.
669 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
671 video_uploader_id = mobj.group(1)
673 self._downloader.report_warning(u'unable to extract uploader nickname')
676 if 'title' not in video_info:
677 raise ExtractorError(u'Unable to extract video title')
678 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
681 # We try first to get a high quality image:
682 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
683 video_webpage, re.DOTALL)
684 if m_thumb is not None:
685 video_thumbnail = m_thumb.group(1)
686 elif 'thumbnail_url' not in video_info:
687 self._downloader.report_warning(u'unable to extract video thumbnail')
689 else: # don't panic if we can't find it
690 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the page and normalized to YYYYMMDD.
694 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
696 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
697 upload_date = unified_strdate(upload_date)
700 video_description = get_element_by_id("eow-description", video_webpage)
701 if video_description:
702 video_description = clean_html(video_description)
# Fall back to the meta description tag, then to the empty string.
704 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
706 video_description = unescapeHTML(fd_mobj.group(1))
708 video_description = u''
# --- subtitles / duration ---
711 video_subtitles = self.extract_subtitles(video_id, video_webpage)
713 if self._downloader.params.get('listsubtitles', False):
714 self._list_available_subtitles(video_id)
717 if 'length_seconds' not in video_info:
718 self._downloader.report_warning(u'unable to extract video duration')
721 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
723 # Decide which formats to download
# ytplayer.config is reparsed here to detect encrypted 's' signatures.
726 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
728 raise ValueError('Could not find vevo ID')
729 info = json.loads(mobj.group(1))
731 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
732 # this signatures are encrypted
733 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
735 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
736 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
737 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
739 if 'url_encoded_fmt_stream_map' in video_info:
740 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
742 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
743 elif 'adaptive_fmts' in video_info:
744 if 'url_encoded_fmt_stream_map' in video_info:
745 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
747 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
# --- URL map construction (rtmp / fmt_stream_map / HLS) ---
751 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
752 self.report_rtmp_download()
753 video_url_list = [(None, video_info['conn'][0])]
754 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
755 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
756 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
758 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
759 url_data = compat_parse_qs(url_data_str)
760 if 'itag' in url_data and 'url' in url_data:
761 url = url_data['url'][0]
762 if 'sig' in url_data:
763 url += '&signature=' + url_data['sig'][0]
764 elif 's' in url_data:
# Verbose mode reports which player served the encrypted signature.
765 if self._downloader.params.get('verbose'):
768 player_version = self._search_regex(r'ad3-(.+?)\.swf',
769 video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND',
770 'flash player', fatal=False)
771 player = 'flash player %s' % player_version
773 player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
774 'html5 player', fatal=False)
775 parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
776 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
777 (len(s), parts_sizes, url_data['itag'][0], player))
778 encrypted_sig = url_data['s'][0]
# The age-gate branch condition between these lines is elided.
780 signature = self._decrypt_signature_age_gate(encrypted_sig)
782 signature = self._decrypt_signature(encrypted_sig)
783 url += '&signature=' + signature
784 if 'ratebypass' not in url:
785 url += '&ratebypass=yes'
786 url_map[url_data['itag'][0]] = url
787 video_url_list = self._get_video_url_list(url_map)
788 if not video_url_list:
790 elif video_info.get('hlsvp'):
791 manifest_url = video_info['hlsvp'][0]
792 url_map = self._extract_from_m3u8(manifest_url, video_id)
793 video_url_list = self._get_video_url_list(url_map)
794 if not video_url_list:
798 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# --- result assembly: one info dict per selected format ---
801 for format_param, video_real_url in video_url_list:
803 video_extension = self._video_extensions.get(format_param, 'flv')
805 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
806 self._video_dimensions.get(format_param, '???'),
807 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
811 'url': video_real_url,
812 'uploader': video_uploader,
813 'uploader_id': video_uploader_id,
814 'upload_date': upload_date,
815 'title': video_title,
816 'ext': video_extension,
817 'format': video_format,
818 'thumbnail': video_thumbnail,
819 'description': video_description,
820 'player_url': player_url,
821 'subtitles': video_subtitles,
822 'duration': video_duration
# Playlist extractor backed by the GData v2 JSON API, paging through
# entries _MAX_RESULTS at a time. Several lines (the _VALID_URL opening,
# _MAX_RESULTS, mobj guard, loop breaks) are elided in this extraction.
826 class YoutubePlaylistIE(InfoExtractor):
827 IE_DESC = u'YouTube.com playlists'
# Fragments of the verbose _VALID_URL: group 1 or 2 captures the list id.
833 (?:course|view_play_list|my_playlists|artist|playlist|watch)
834 \? (?:.*?&)*? (?:p|a|list)=
837 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
840 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
842 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
844 IE_NAME = u'youtube:playlist'
# Presumably @classmethod in the original (decorator elided).
847 def suitable(cls, url):
848 """Receives a URL and returns True if suitable for this IE."""
849 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
851 def _real_extract(self, url):
852 # Extract playlist id
853 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
855 raise ExtractorError(u'Invalid URL: %s' % url)
857 # Download playlist videos from API
858 playlist_id = mobj.group(1) or mobj.group(2)
861 for page_num in itertools.count(1):
862 start_index = self._MAX_RESULTS * (page_num - 1) + 1
# The GData API caps start-index; stop paging at 1000 results.
863 if start_index >= 1000:
864 self._downloader.report_warning(u'Max number of results reached')
866 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
867 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
870 response = json.loads(page)
871 except ValueError as err:
872 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
874 if 'feed' not in response:
875 raise ExtractorError(u'Got a malformed response from YouTube API')
876 playlist_title = response['feed']['title']['$t']
877 if 'entry' not in response['feed']:
878 # Number of videos is a multiple of self._MAX_RESULTS
881 for entry in response['feed']['entry']:
882 index = entry['yt$position']['$t']
883 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
886 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
# Videos are sorted by their playlist position before URL extraction.
889 videos = [v[1] for v in sorted(videos)]
891 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
892 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# Channel extractor: scrapes the first channel page, then pages through the
# c4_browse_ajax JSON endpoint while the "load more" indicator is present.
# Some lines (mobj guard, video_ids initializer, break) are elided.
895 class YoutubeChannelIE(InfoExtractor):
896 IE_DESC = u'YouTube.com channels'
897 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
898 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
899 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
900 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
901 IE_NAME = u'youtube:channel'
# Collect unique video ids from watch links in page HTML, preserving order
# (the ids_in_page initializer and return are elided here).
903 def extract_videos_from_page(self, page):
905 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
906 if mobj.group(1) not in ids_in_page:
907 ids_in_page.append(mobj.group(1))
910 def _real_extract(self, url):
912 mobj = re.match(self._VALID_URL, url)
914 raise ExtractorError(u'Invalid URL: %s' % url)
916 # Download channel page
917 channel_id = mobj.group(1)
921 url = self._TEMPLATE_URL % (channel_id, pagenum)
922 page = self._download_webpage(url, channel_id,
923 u'Downloading page #%s' % pagenum)
925 # Extract video identifiers
926 ids_in_page = self.extract_videos_from_page(page)
927 video_ids.extend(ids_in_page)
929 # Download any subsequent channel pages using the json-based channel_ajax query
930 if self._MORE_PAGES_INDICATOR in page:
931 for pagenum in itertools.count(1):
932 url = self._MORE_PAGES_URL % (pagenum, channel_id)
933 page = self._download_webpage(url, channel_id,
934 u'Downloading page #%s' % pagenum)
936 page = json.loads(page)
938 ids_in_page = self.extract_videos_from_page(page['content_html'])
939 video_ids.extend(ids_in_page)
# Stop when the widget HTML no longer offers a "load more" control.
941 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
944 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
946 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
947 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
948 return [self.playlist_result(url_entries, channel_id)]
# User-uploads extractor backed by the GData API, paging _GDATA_PAGE_SIZE
# ids at a time. Some lines (mobj guard, video_ids/ids_in_page initializers,
# the break) are elided in this extraction.
951 class YoutubeUserIE(InfoExtractor):
952 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
953 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
954 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
955 _GDATA_PAGE_SIZE = 50
956 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
957 IE_NAME = u'youtube:user'
# Presumably @classmethod (decorator elided): because _VALID_URL is very
# permissive, every other *IE in this module gets first refusal.
960 def suitable(cls, url):
961 # Don't return True if the url can be extracted with other youtube
962 # extractor, the regex would is too permissive and it would match.
963 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
964 if any(ie.suitable(url) for ie in other_ies): return False
965 else: return super(YoutubeUserIE, cls).suitable(url)
967 def _real_extract(self, url):
969 mobj = re.match(self._VALID_URL, url)
971 raise ExtractorError(u'Invalid URL: %s' % url)
973 username = mobj.group(1)
975 # Download video ids using YouTube Data API. Result size per
976 # query is limited (currently to 50 videos) so we need to query
977 # page by page until there are no video ids - it means we got
982 for pagenum in itertools.count(0):
983 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
985 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
986 page = self._download_webpage(gdata_url, username,
987 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
990 response = json.loads(page)
991 except ValueError as err:
992 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
994 # Extract video identifiers
# Each entry id is a URL whose last path segment is the video id.
996 for entry in response['feed']['entry']:
997 ids_in_page.append(entry['id']['$t'].split('/')[-1])
998 video_ids.extend(ids_in_page)
1000 # A little optimization - if current page is not
1001 # "full", ie. does not contain PAGE_SIZE video ids then
1002 # we can assume that this page is the last one - there
1003 # are no more ids on further pages - no need to query
1006 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1009 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1010 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
1011 return [self.playlist_result(url_results, playlist_title = username)]
# "ytsearchN:query" extractor over the GData search API, 50 results per
# page. Some lines (_MAX_RESULTS, initializers, try:, break) are elided.
1013 class YoutubeSearchIE(SearchInfoExtractor):
1014 IE_DESC = u'YouTube.com searches'
1015 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1017 IE_NAME = u'youtube:search'
1018 _SEARCH_KEY = 'ytsearch'
1020 def report_download_page(self, query, pagenum):
1021 """Report attempt to download search page with given number."""
1022 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1024 def _get_n_results(self, query, n):
1025 """Get a specified number of results for a query"""
# `limit`/`pagenum`/`video_ids` initializers are elided above this loop.
1031 while (50 * pagenum) < limit:
1032 self.report_download_page(query, pagenum+1)
1033 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1034 request = compat_urllib_request.Request(result_url)
1036 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1037 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1038 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1039 api_response = json.loads(data)['data']
# NOTE(review): "'items' not in api_response" would be the idiomatic form.
1041 if not 'items' in api_response:
1042 raise ExtractorError(u'[youtube] No video results')
1044 new_ids = list(video['id'] for video in api_response['items'])
1045 video_ids += new_ids
# The effective limit shrinks to the API's reported total when smaller.
1047 limit = min(n, api_response['totalItems'])
1050 if len(video_ids) > n:
1051 video_ids = video_ids[:n]
1052 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1053 return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season show pages.

    Each season of a show is backed by a regular playlist, so this simply
    hands every discovered season playlist off to YoutubePlaylistIE.
    """
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # One playlist link per season of the show.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            season_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(season_url, 'YoutubePlaylist'))
        return results
# Base class for the authenticated feed extractors (subscriptions,
# recommended, watch later). Pages through the feed_ajax endpoint until the
# server stops returning a 'paging' token. Several lines are elided here:
# the docstring delimiters, _PAGING_STEP, the @property decorators before
# _FEED_TEMPLATE and the IE_NAME getter, the _real_initialize body
# (presumably self._login()), and the feed_entries initializer.
1071 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1073 Base class for extractors that fetch info from
1074 http://www.youtube.com/feed_ajax
1075 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1077 _LOGIN_REQUIRED = True
1079 # use action_load_personal_feed instead of action_load_system_feed
1080 _PERSONAL_FEED = False
1083 def _FEED_TEMPLATE(self):
1084 action = 'action_load_system_feed'
1085 if self._PERSONAL_FEED:
1086 action = 'action_load_personal_feed'
1087 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
# IE_NAME property body (its def line is elided):
1091 return u'youtube:%s' % self._FEED_NAME
1093 def _real_initialize(self):
1096 def _real_extract(self, url):
1098 # The step argument is available only in 2.7 or higher
1099 for i in itertools.count(0):
1100 paging = i*self._PAGING_STEP
1101 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1102 u'%s feed' % self._FEED_NAME,
1103 u'Downloading page %s' % i)
1104 info = json.loads(info)
1105 feed_html = info['feed_html']
# Deduplicate watch ids while keeping first-seen order.
1106 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
1107 ids = orderedSet(m.group(1) for m in m_ids)
1108 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1109 if info['paging'] is None:
1111 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Personal subscriptions feed; requires a logged-in account."""
    # Fix: missing space in the user-visible description ("keyword(requires").
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
# Recommended-videos feed; all paging logic lives in the base class.
1119 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1120 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1121 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1122 _FEED_NAME = 'recommended'
1123 _PLAYLIST_TITLE = u'Youtube Recommended videos'
# Watch-later list; uses the personal (per-account) feed_ajax action.
1125 class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1126 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1127 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1128 _FEED_NAME = 'watch_later'
1129 _PLAYLIST_TITLE = u'Youtube Watch Later'
# Switches the base class to action_load_personal_feed.
1131 _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites to their backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are per-account, so extraction needs an authenticated session.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the playlist that backs it;
        # extraction is then delegated to the playlist extractor.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')