8 import xml.etree.ElementTree
10 from .common import InfoExtractor, SearchInfoExtractor
11 from .subtitles import SubtitlesInfoExtractor
17 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account login endpoint used by the login routine below.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Fetched by _set_language() to pin the site locale to en/US.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Fetched by _confirm_age() to acknowledge the age gate.
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name used when looking up credentials in ~/.netrc.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False
37 def report_lang(self):
38 """Report attempt to set language."""
39 self.to_screen(u'Setting language')
    def _set_language(self):
        """Request _LANG_URL so later requests use the English UI; warn (not fail) on error."""
        request = compat_urllib_request.Request(self._LANG_URL)
            # The response body is discarded; the request is made only for its
            # side effect on the session.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
        # Credentials come from --username/--password or ~/.netrc (_NETRC_MACHINE).
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # GALX and dsh are hidden form fields — presumably anti-forgery tokens
        # that must be echoed back with the credentials (TODO confirm).
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Fields submitted with the login form.
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served again means authentication was rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
    def _confirm_age(self):
        """POST the age-verification form; raise ExtractorError if the request fails."""
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            # Response body is discarded; only the confirmation side effect matters.
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_initialize(self):
        """One-time setup: set the site language, then log in (when possible)."""
        # Without a downloader there is nothing to configure.
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    """Extractor for single YouTube videos (all the URL shapes in _VALID_URL)."""
    IE_DESC = u'YouTube.com'
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )?                                                   # all until now is optional -> you can pass the naked ID
                         ([0-9A-Za-z_-]{11})                                  # here is it! the YouTube video ID
                         (?(1).+)?                                            # if we found the ID, everything can follow
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Recognised itags, used by _get_video_url_list for quality selection.
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      '85', '102', '84', '101', '83', '100', '82',
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      '172', '141', '171', '140', '139',
    _video_formats_map = {
        # container name -> itags, best quality first.
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    _video_extensions = {
        # Apple HTTP Live Streaming
    _video_dimensions = {
        u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file":  u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
        u"file":  u"1ltcDfZMA3U.flv",
        u"note": u"Test VEVO video (#897)",
        u"upload_date": u"20070518",
        u"title": u"Maps - It Will Find You",
        u"description": u"Music video by Maps performing It Will Find You.",
        u"uploader": u"MuteUSA",
        u"uploader_id": u"MuteUSA"
        u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file":  u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file":  u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
        u'file': u'TGi3HqYrWHE.mp4',
        u'note': u'm3u8 video',
        u'title': u'Triathlon - Men - London 2012 Olympic Games',
        u'description': u'- Men -  TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
        u'uploader': u'olympic',
        u'upload_date': u'20120807',
        u'uploader_id': u'olympic',
        u'skip_download': True,
391 def suitable(cls, url):
392 """Receives a URL and returns True if suitable for this IE."""
393 if YoutubePlaylistIE.suitable(url): return False
394 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
396 def report_video_webpage_download(self, video_id):
397 """Report attempt to download video webpage."""
398 self.to_screen(u'%s: Downloading video webpage' % video_id)
400 def report_video_info_webpage_download(self, video_id):
401 """Report attempt to download video info webpage."""
402 self.to_screen(u'%s: Downloading video info webpage' % video_id)
404 def report_information_extraction(self, video_id):
405 """Report attempt to extract video information."""
406 self.to_screen(u'%s: Extracting video information' % video_id)
408 def report_unavailable_format(self, video_id, format):
409 """Report extracted video URL."""
410 self.to_screen(u'%s: Format %s not available' % (video_id, format))
412 def report_rtmp_download(self):
413 """Indicate the download will use the RTMP protocol."""
414 self.to_screen(u'RTMP download detected')
    def _decrypt_signature(self, s):
        """Turn the encrypted s field into a working signature"""
        # Each return below un-scrambles (slices/reorders) a signature of one
        # specific length; the length dispatch raises at the end when the
        # scrambled length is unknown.
            return s[86:29:-1] + s[88] + s[28:5:-1]
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[84:27:-1] + s[86] + s[26:5:-1]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[81:36:-1] + s[0] + s[35:2:-1]
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _decrypt_signature_age_gate(self, s):
        # The videos with age protection use another player, so the algorithms
        # used there can differ from the regular ones.
            return s[2:63] + s[82] + s[64:82] + s[63]

            # Fallback to the other algorithms
            return self._decrypt_signature(s)
    def _get_available_subtitles(self, video_id):
        """Build a {language: subtitle-url} map for *video_id*; warn when none exist."""
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        # Each <track> element carries a human-readable name and a lang_code.
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption endpoint is only advertised inside the player config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
            player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            original_lang_node = caption_list.find('track')
            # Only 'asr' (automatic speech recognition) tracks count as
            # automatic captions.
            if original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']
            # Each <target> is a language the original track can be translated to.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _print_formats(self, formats):
        """Print one line per itag: extension, dimensions and any special tag."""
        print('Available formats:')
            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                        self._video_dimensions.get(x, '???'),
                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
    def _extract_id(self, url):
        """Return the 11-character video id captured by _VALID_URL's second group."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # The prefer-free table ranks WebM itags earlier than the default table.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Drop every format ranked better than the requested limit.
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimited sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if rf in self._video_formats_map:
                    # Container name: take the best available itag of that container.
                    for srf in self._video_formats_map[rf]:
                            video_url_list = [(srf, url_map[srf])]
        if video_url_list is None:
            raise ExtractorError(u'requested format not available')
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Download an m3u8 manifest and map each stream's itag to its URL."""
        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            # Non-empty lines that are not '#EXT...' tags are the stream URLs.
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # The itag is embedded in the stream URL path.
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
    def _real_extract(self, url):
        """Extract metadata and downloadable format URLs for one video."""
        # Catch the common shell-quoting mistake where '&v=...' was eaten.
        if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
            self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like  youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply  youtube-dl BaW_jenozKc  ).')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JSON-escaped slashes in the swf URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            # Try several 'el' variants until one of them yields a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize separators/whitespace before parsing the date string.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
        elif 'adaptive_fmts' in video_info:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
                video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain signature: append as-is.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature: decrypt before appending.
                        if self._downloader.params.get('verbose'):
                                player = 'flash player'
                                player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                            parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(s), parts_sizes, url_data['itag'][0], player))
                        encrypted_sig = url_data['s'][0]
                            signature = self._decrypt_signature_age_gate(encrypted_sig)
                            signature = self._decrypt_signature(encrypted_sig)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
        elif video_info.get('hlsvp'):
            # HTTP Live Streaming: formats come from the m3u8 manifest instead.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result entry per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                                 self._video_dimensions.get(format_param, '???'),
                                                 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

                'url':          video_real_url,
                'uploader':     video_uploader,
                'uploader_id':  video_uploader_id,
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension,
                'format':       video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extractor for playlist pages; resolves them via the gdata API."""
    IE_DESC = u'YouTube.com playlists'
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # gdata API endpoint: playlist id, page size, 1-based start index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'
866 def suitable(cls, url):
867 """Receives a URL and returns True if suitable for this IE."""
868 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def _real_extract(self, url):
        """Page through the gdata playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            # The API stops serving entries past index 1000 — warn and stop.
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            for entry in response['feed']['entry']:
                # Keep the playlist position so the final sort restores order.
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extractor for channel pages; collects every video of the channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present in a page's HTML whenever another page can be fetched.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'
    def extract_videos_from_page(self, page):
        """Collect the unique video ids linked from a channel page."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            # Preserve first-seen order while dropping duplicates.
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
    def _real_extract(self, url):
        """Walk the channel's pages and return a playlist of all its videos."""
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop when the "load more" widget no longer offers another page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads, via the gdata API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The gdata API serves at most 50 uploads per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'
979 def suitable(cls, url):
980 # Don't return True if the url can be extracted with other youtube
981 # extractor, the regex would is too permissive and it would match.
982 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
983 if any(ie.suitable(url) for ie in other_ies): return False
984 else: return super(YoutubeUserIE, cls).suitable(url)
    def _real_extract(self, url):
        """Page through the user's uploads feed and return a playlist result."""
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The video id is the final path segment of the entry id URL.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for 'ytsearch' queries, backed by the gdata search API."""
    IE_DESC = u'YouTube.com searches'
    # %s = URL-quoted query, %i = 1-based start index (50 results per page).
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'
1042 def report_download_page(self, query, pagenum):
1043 """Report attempt to download search page with given number."""
1044 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more results than the API says exist.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; each season resolves to a playlist."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'
1083 def _real_extract(self, url):
1084 mobj = re.match(self._VALID_URL, url)
1085 show_name = mobj.group(1)
1086 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1087 # There's one playlist for each season of the show
1088 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1089 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1090 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    # Feeds cannot be fetched anonymously, so login info is mandatory.
    _LOGIN_REQUIRED = True

    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False
1105 def _FEED_TEMPLATE(self):
1106 action = 'action_load_system_feed'
1107 if self._PERSONAL_FEED:
1108 action = 'action_load_personal_feed'
1109 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
        # IE name is derived from the feed name, e.g. u'youtube:subscriptions'.
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):

    def _real_extract(self, url):
        """Page through the feed_ajax endpoint and return a playlist result."""
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            # The response embeds rendered HTML; video ids are scraped from it.
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the last page.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's Watch Later list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'

    # Watch Later is per-account, so the personal feed action is required.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourites playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True
1161 def _real_extract(self, url):
1162 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1163 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1164 return self.url_result(playlist_id, 'YoutubePlaylist')