8 import xml.etree.ElementTree
10 from .common import InfoExtractor, SearchInfoExtractor
11 from .subtitles import SubtitlesInfoExtractor
17 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google/YouTube endpoints used for login, UI-language pinning and
    # age verification.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in the user's .netrc for credentials.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        # Pin the site language to English so later regex scraping sees
        # predictable markup; a failure here only warns (best effort).
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        (username, password) = self._get_login_info()
        # No authentication to be performed
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Hidden anti-forgery tokens ("GALX", "dsh") scraped from the login
        # form markup; they must be echoed back in the POST below.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Fields of the sign-in form POSTed to _LOGIN_URL.
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the response still contains the login form, authentication
            # failed (bad credentials) — warn, don't abort.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _confirm_age(self):
        # Submit the age-verification confirmation form.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))

        self.report_age_confirmation()
        compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Unlike language/login above, age confirmation failure is fatal.
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_initialize(self):
        # Runs once before extraction: set language, then log in.
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                     |youtu\.be/                                          # just youtu.be/xxxx
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                  # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, reordered so free (webm/ogg-family) formats win ties.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      '85', '102', '84', '101', '83', '100', '82',
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      '172', '141', '171', '140', '139',
    # Container name -> itags in that container, best quality first.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    # itag -> file extension lookup (contents not visible in this excerpt).
    _video_extensions = {
        # Apple HTTP Live Streaming
    # itag -> human-readable resolution lookup.
    _video_dimensions = {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file": u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
                u"upload_date": u"20070518",
                u"title": u"Maps - It Will Find You",
                u"description": u"Music video by Maps performing It Will Find You.",
                u"uploader": u"MuteUSA",
                u"uploader_id": u"MuteUSA"
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
            u'file': u'TGi3HqYrWHE.mp4',
            u'note': u'm3u8 video',
                u'title': u'Triathlon - Men - London 2012 Olympic Games',
                u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
                u'uploader': u'olympic',
                u'upload_date': u'20120807',
                u'uploader_id': u'olympic',
                u'skip_download': True,

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist-looking URLs are deferred to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _decrypt_signature(self, s):
        """Turn the encrypted s field into a working signature"""
        # Each return applies one hard-coded character permutation; which one
        # runs is dispatched on the signature length (dispatch lines not
        # visible in this excerpt — see the raise below for the mechanism).
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[81:36:-1] + s[0] + s[35:2:-1]
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))

    def _decrypt_signature_age_gate(self, s):
        # The videos with age protection use another player, so the algorithms
            return s[2:63] + s[82] + s[64:82] + s[63]

            # Fallback to the other algorithms
            return self._decrypt_signature(s)

    def _get_available_subtitles(self, video_id):
        # Query Google's timedtext listing for manually-added subtitle tracks.
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        # Each track is advertised as name="..." lang_code="..." in the XML.
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')

    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption (ttsurl) endpoint lives inside the inline player config.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            # kind="asr" marks the auto-generated (speech recognition) track.
            original_lang_node = caption_list.find('track')
            if original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available target language.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)

    def _print_formats(self, formats):
        # Pretty-print "itag : ext [dimensions] (note)" for --list-formats.
        print('Available formats:')
            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                        self._video_dimensions.get(x, '???'),
                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))

    def _extract_id(self, url):
        # Pull the 11-character video id out of any supported URL shape.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _get_video_url_list(self, url_map):
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        # --format-limit caps quality: keep only formats at or below the limit.
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                # A container name expands to its itags, best quality first.
                if rf in self._video_formats_map:
                    for srf in self._video_formats_map[rf]:
                            video_url_list = [(srf, url_map[srf])]
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list

    def _extract_from_m3u8(self, manifest_url, video_id):
        # Build an {itag: url} map from an HLS master manifest.
        def _get_urls(_manifest):
            # Non-comment, non-empty manifest lines are the variant URLs.
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # The itag is embedded in the variant URL path as "itag/<n>/".
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url

    def _real_extract(self, url):
        # Common shell-quoting mistake: URL truncated at the first '&'.
        if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
            self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            # Try successive 'el' variants until one response carries a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (nickname) — scraped from page markup, may be missing.
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date — normalize separators before unified_strdate parses it.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description — page element first, <meta> tag as fallback.
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
                if 'url_encoded_fmt_stream_map' in video_info:
                    # Merge DASH formats into the regular stream map.
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
            elif 'adaptive_fmts' in video_info:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            # The stream map is a comma-separated list of query-string blobs,
            # one per format; each carries at least itag and url.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (unencrypted) signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        if self._downloader.params.get('verbose'):
                                player = 'flash player'
                                player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                            parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(s), parts_sizes, url_data['itag'][0], player))
                        encrypted_sig = url_data['s'][0]
                            signature = self._decrypt_signature_age_gate(encrypted_sig)
                            signature = self._decrypt_signature(encrypted_sig)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
        elif video_info.get('hlsvp'):
            # HLS-only video: derive the format map from the m3u8 manifest.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Emit one result dict per selected (itag, url) pair.
        for format_param, video_real_url in video_url_list:
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                                 self._video_dimensions.get(format_param, '???'),
                                                 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = u'YouTube.com playlists'
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # gdata v2 playlist feed; paged via max-results / start-index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The API stops serving results at this index — take what we have.
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']

        # Sort by the feed's position index, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker string present in the HTML while further pages exist.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        # Collect distinct video ids linked from a channel page, keeping
        # first-seen order (list membership check, not a set).
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                # Ajax responses are JSON with the HTML under 'content_html'.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # gdata caps each response at this many entries, hence the paging below.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractor, the regex would is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The entry id has the video id as its last path component.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    # jsonc flavour of the gdata search API; 50 results per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more results than actually exist for the query.
            limit = min(n, api_response['totalItems'])

        # Trim any overshoot from the last (50-sized) page.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Resolve a show page into one playlist result per season."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of a show is published as a separate playlist; the
        # show page links each one via an href="/playlist?list=..." anchor.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches
        ]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    # Feeds are personal, so login is mandatory (see YoutubeBaseInfoExtractor).
    _LOGIN_REQUIRED = True

    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # URL template with one remaining %s slot for the paging offset.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):

    def _real_extract(self, url):
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # Deduplicate while preserving the feed's ordering.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # NOTE(review): a null 'paging' value appears to mark the last
            # page — confirm against the feed_ajax response format.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Feed name substituted into the base class's feed_ajax URL template.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Feed name substituted into the base class's feed_ajax URL template.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    # Feed name substituted into the base class's feed_ajax URL template.
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is per-user, so request the personal feed action.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Resolve the logged-in user's favourites page to its playlist."""
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page embeds the id of the backing "favorites" playlist; hand
        # the actual extraction off to YoutubePlaylistIE.
        favourites_playlist = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_playlist, 'YoutubePlaylist')