8 import xml.etree.ElementTree
10 from .common import InfoExtractor, SearchInfoExtractor
11 from .subtitles import SubtitlesInfoExtractor
17 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google accounts endpoint fetched (and POSTed to) by _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Visited by _set_language(); the hl=en/gl=US parameters pin the UI to
    # English/US so later page scraping sees predictable markup.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # POSTed to by _confirm_age() to acknowledge the age gate.
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name used to look up credentials in the user's .netrc file.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False
37 def report_lang(self):
38 """Report attempt to set language."""
39 self.to_screen(u'Setting language')
    def _set_language(self):
        """Request _LANG_URL so YouTube persists English/US preferences.

        Network failures are reported as a warning rather than raised.
        NOTE(review): the enclosing try: header (and any return statements)
        are not visible in this view of the file.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
            # Response body is discarded; only the cookie side effect matters.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
        # NOTE(review): this span is the interior of _login(); the def line,
        # try: headers and several if/else headers are not visible here.
        (username, password) = self._get_login_info()
        # No authentication to be performed
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        # Fetch the login page to harvest the hidden form tokens below.
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Hidden anti-CSRF tokens embedded in the Google login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

            # Form fields replicated from Google's login form (dict opener not
            # visible in this view).
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
    def _confirm_age(self):
        """POST the age-confirmation form to _AGE_URL.

        Raises ExtractorError on network failure.  NOTE(review): the dict
        opener and try: header are not visible in this view.
        """
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_initialize(self):
        """Prepare the session: set language, then log in.

        NOTE(review): the bodies of the guards below (presumably early
        returns) are not visible in this view; the `if not ...` shape
        implies _set_language() and _login() return success flags.
        """
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # NOTE(review): the lines below are the interior of the verbose
    # _VALID_URL raw string; the assignment and quotes are not visible in
    # this view, so no Python comments are inserted among them.
                         (?:https?://)? # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                            (?:.*?\#/)? # handle anchor (#/) redirect urls
                            (?: # the various things that can precede the ID:
                                (?:(?:v|embed|e)/) # v/ or embed/ or e/
                                |(?: # or the v= param in all its forms
                                    (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                    (?:\?|\#!?) # the params delimiter ? or # or #!
                                    (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                         |youtu\.be/ # just youtu.be/xxxx
                         )? # all until now is optional -> you can pass the naked ID
                         ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                         (?(1).+)? # if we found the ID, everything can follow
    # Captures the target of next_url= redirect wrappers (age gate etc.).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, but free (WebM) formats ranked ahead of equal-quality
    # non-free ones; selected when the prefer_free_formats option is set.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '102', '84', '101', '83', '100', '82',
                          '138', '248', '137', '247', '136', '246', '245',
                          '244', '135', '243', '134', '242', '133', '160',
                          '172', '141', '171', '140', '139',
    # Container name -> itags available in that container, best-first;
    # used by _get_video_url_list() when a format like 'mp4' is requested.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    # itag -> file extension (interior elided in this view).
    _video_extensions = {
        # Apple HTTP Live Streaming
    # itag -> human-readable resolution string (interior elided in this view).
    _video_dimensions = {
        # NOTE(review): the fragments below are _TESTS fixtures; the list
        # opener and per-entry braces are not visible in this view.
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file": u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
                u"upload_date": u"20070518",
                u"title": u"Maps - It Will Find You",
                u"description": u"Music video by Maps performing It Will Find You.",
                u"uploader": u"MuteUSA",
                u"uploader_id": u"MuteUSA"
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
            u'file': u'TGi3HqYrWHE.mp4',
            u'note': u'm3u8 video',
                u'title': u'Triathlon - Men - London 2012 Olympic Games',
                u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
                u'uploader': u'olympic',
                u'upload_date': u'20120807',
                u'uploader_id': u'olympic',
                u'skip_download': True,
390 def suitable(cls, url):
391 """Receives a URL and returns True if suitable for this IE."""
392 if YoutubePlaylistIE.suitable(url): return False
393 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
395 def report_video_webpage_download(self, video_id):
396 """Report attempt to download video webpage."""
397 self.to_screen(u'%s: Downloading video webpage' % video_id)
399 def report_video_info_webpage_download(self, video_id):
400 """Report attempt to download video info webpage."""
401 self.to_screen(u'%s: Downloading video info webpage' % video_id)
403 def report_information_extraction(self, video_id):
404 """Report attempt to extract video information."""
405 self.to_screen(u'%s: Extracting video information' % video_id)
407 def report_unavailable_format(self, video_id, format):
408 """Report extracted video URL."""
409 self.to_screen(u'%s: Format %s not available' % (video_id, format))
411 def report_rtmp_download(self):
412 """Indicate the download will use the RTMP protocol."""
413 self.to_screen(u'RTMP download detected')
    def _decrypt_signature(self, s):
        """Turn the encrypted s field into a working signature"""
        # Each return below is one hard-coded character permutation; the
        # branch is selected by a check on len(s) (the selector lines are
        # not visible in this view — see the "key length" error below).
        # The highest index used in each branch is len(s) - 1 for that case.
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
            return s[40] + s[82:43:-1] + s[22] + s[42:40:-1] + s[83] + s[39:22:-1] + s[0] + s[21:2:-1]
            return s[81:36:-1] + s[0] + s[35:2:-1]
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

            # Unknown signature length: a newer player was deployed.
            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _decrypt_signature_age_gate(self, s):
        # The videos with age protection use another player, so the algorithms
        # differ for one known length; anything else falls through to the
        # regular table.  (The length check line is not visible in this view.)
            return s[2:63] + s[82] + s[64:82] + s[63]

            # Fallback to the other algortihms
            return self._decrypt_signature(s)
    def _get_available_subtitles(self, video_id):
        """Return a dict mapping subtitle language code -> timedtext URL.

        NOTE(review): the try: header, loop header and return statements
        are not visible in this view of the file.
        """
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        # Scrape (name, lang_code) pairs out of the XML track list.
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            # Build one timedtext API URL per language, honoring the
            # user-selected subtitle format.
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives inside the inline ytplayer.config JSON blob.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
        player_config = json.loads(mobj.group(1))
            # Missing keys here raise KeyError, handled by the except below.
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            # The 'track' node carries the language the ASR captions are in;
            # 'target' nodes list the languages they can be translated into.
            original_lang = caption_list.find('track').attrib['lang_code']
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _print_formats(self, formats):
        """Print a table of itag / extension / resolution for --list-formats.

        NOTE(review): the loop header over *formats* is not visible in this
        view; `x` below is the loop variable (an itag string).
        """
        print('Available formats:')
            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                        self._video_dimensions.get(x, '???'),
                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
    def _extract_id(self, url):
        """Extract the 11-character video id from *url* via _VALID_URL.

        NOTE(review): the `mobj is None` guard line and the return are not
        visible in this view; group(2) is the video-id capture group.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Quality-ordered itag table; the free-formats variant is used when
        # the user asked to prefer free containers.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Cap quality at format_limit by slicing the ordered table.
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        # Keep only itags the site actually offered for this video.
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                # Requested token is a container name: try its itags best-first.
                if rf in self._video_formats_map:
                    for srf in self._video_formats_map[rf]:
                            video_url_list = [(srf, url_map[srf])]
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Build an {itag: url} map from an HLS (m3u8) manifest.

        NOTE(review): the url_map initialisation and the return statement
        are not visible in this view.
        """
        def _get_urls(_manifest):
            # Non-comment, non-empty lines of an m3u8 file are stream URLs.
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # The itag is encoded in the per-stream URL path.
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
    def _real_extract(self, url):
        """Extract metadata and downloadable format URLs for one video.

        Overall flow: resolve redirect wrappers -> fetch watch page ->
        fetch get_video_info -> pick formats (decrypting signatures where
        needed) -> build one result dict per selected format.
        NOTE(review): numerous enclosing lines (try:/else: headers, dict
        openers, returns, break statements) are not visible in this view.
        """
        # A bare ?feature=... URL usually means the shell ate the &v=... part.
        if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
            self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JSON-escaped slashes in the SWF URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            # Non-age-gated path: try several 'el' variants until one yields
            # a token (VEVO and embedded-only videos need different values).
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize separators before handing off to the shared parser.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download
            # Inspect the inline player config to detect encrypted ('s')
            # signatures; when present, the stream map from the page is
            # authoritative and overrides get_video_info's copy.
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
            elif 'adaptive_fmts' in video_info:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            # Each comma-separated entry is one urlencoded format descriptor.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (already decrypted) signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        if self._downloader.params.get('verbose'):
                            # Report which player produced the encrypted sig
                            # to help diagnose decryption failures.
                                player_version = self._search_regex(r'ad3-(.+?)\.swf',
                                    video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND',
                                    'flash player', fatal=False)
                                player = 'flash player %s' % player_version
                                player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                            parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(s), parts_sizes, url_data['itag'][0], player))
                        encrypted_sig = url_data['s'][0]
                            signature = self._decrypt_signature_age_gate(encrypted_sig)
                            signature = self._decrypt_signature(encrypted_sig)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
        elif video_info.get('hlsvp'):
            # HLS-only video: derive the itag->url map from the manifest.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # One result dict per selected (itag, url) pair.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                                 self._video_dimensions.get(format_param, '???'),
                                                 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    # NOTE(review): the lines below are the interior of the verbose
    # _VALID_URL raw string (assignment and quotes not visible here).
    # The two capture groups both match a playlist id.
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # GData API endpoint; filled with (playlist_id, max_results, start_index).
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'
860 def suitable(cls, url):
861 """Receives a URL and returns True if suitable for this IE."""
862 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def _real_extract(self, url):
        """Resolve a playlist URL into a playlist of video url_results.

        Pages through the GData API until exhausted (or the API's hard
        1000-result ceiling).  NOTE(review): several lines (guards, breaks,
        the videos list initialisation) are not visible in this view.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # GData refuses start-index >= 1000.
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']

        # Keep API ordering by position, then drop the index.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page of the channel's video list (filled with channel_id, page).
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker string whose presence means more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # AJAX endpoint for subsequent pages (filled with paging, channel_id).
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'
    def extract_videos_from_page(self, page):
        """Collect unique video ids from watch links in *page* HTML.

        NOTE(review): the ids_in_page initialisation and the return are not
        visible in this view.
        """
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
    def _real_extract(self, url):
        """Return a playlist of all videos on a channel page.

        The first page is plain HTML; subsequent pages come from a JSON
        AJAX endpoint.  NOTE(review): some initialisations and break
        statements are not visible in this view.
        """
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop when the load-more widget disappears.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    # User profile endpoint (filled with the username).
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request, so uploads are fetched page by page.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'
973 def suitable(cls, url):
974 # Don't return True if the url can be extracted with other youtube
975 # extractor, the regex would is too permissive and it would match.
976 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
977 if any(ie.suitable(url) for ie in other_ies): return False
978 else: return super(YoutubeUserIE, cls).suitable(url)
    def _real_extract(self, url):
        """Return a playlist of every upload of a YouTube user.

        NOTE(review): some initialisations and the pagination break are not
        visible in this view.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The GData entry id ends in .../<video_id>.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    # GData search endpoint; returns up to 50 results per request.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    # Prefix that triggers this extractor, e.g. "ytsearch5:query".
    _SEARCH_KEY = 'ytsearch'
1033 def report_download_page(self, query, pagenum):
1034 """Report attempt to download search page with given number."""
1035 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the initialisations of video_ids / pagenum / limit
        # and the pagenum increment are not visible in this view.

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API says exist.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube multi-season show pages.

    A show page links one playlist per season; each season is delegated
    to the YoutubePlaylist extractor.
    """
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            season_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(season_url, 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    # Feeds are per-account, so credentials are mandatory.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Build the feed_ajax URL; a %s placeholder for paging remains
        # (the %% escape), to be filled in _real_extract.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        # NOTE(review): the line below is the body of the IE_NAME property;
        # its def line is not visible in this view.
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):

    def _real_extract(self, url):
        # NOTE(review): the feed_entries initialisation and the loop's
        # break are not visible in this view.
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            # Collect watch links from the rendered feed HTML, dedup while
            # preserving order.
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed-specific configuration; all behavior lives in the base class.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed-specific configuration; all behavior lives in the base class.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Feed-specific configuration; all behavior lives in the base class.
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is account-private, so use the personal-feed AJAX action.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos.

    Resolves the favourites page to its backing playlist and delegates to
    the YoutubePlaylist extractor.
    """
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page embeds the id of the playlist that backs the favourites.
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')