9 from .common import InfoExtractor, SearchInfoExtractor
10 from .subtitles import SubtitlesIE
16 compat_urllib_request,
# NOTE(review): non-contiguous excerpt -- the leading integer on each line is the
# original line number; gaps in the numbering mark lines missing from this view.
27 class YoutubeBaseInfoExtractor(SubtitlesIE):
28 """Provide base functions for Youtube extractors"""
# Endpoints for Google account login, forcing the English UI, and age verification.
29 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
30 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
31 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name looked up in ~/.netrc for stored credentials.
32 _NETRC_MACHINE = 'youtube'
33 # If True it will raise an error if no login info is provided
34 _LOGIN_REQUIRED = False
36 def report_lang(self):
37 """Report attempt to set language."""
38 self.to_screen(u'Setting language')
# Fetch _LANG_URL so later pages are served in English; failure is only a
# warning (best effort).  The enclosing try: line (orig. ~42-43) is missing here.
40 def _set_language(self):
41 request = compat_urllib_request.Request(self._LANG_URL)
44 compat_urllib_request.urlopen(request).read()
45 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
46 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# _login (its def line, orig. ~48-50, is missing here): scrape the GALX/dsh
# hidden fields from the login form, POST the credentials, and warn -- without
# raising -- when login fails; only raise when _LOGIN_REQUIRED and no creds.
51 (username, password) = self._get_login_info()
52 # No authentication to be performed
54 if self._LOGIN_REQUIRED:
55 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
58 request = compat_urllib_request.Request(self._LOGIN_URL)
60 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
61 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
62 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# Anti-forgery tokens embedded as hidden inputs in the login form.
67 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
70 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Form fields posted back to ServiceLogin (dict literal only partially visible).
76 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
80 u'PersistentCookie': u'yes',
82 u'bgresponse': u'js_disabled',
83 u'checkConnection': u'',
84 u'checkedDomains': u'youtube',
90 u'signIn': u'Sign in',
92 u'service': u'youtube',
96 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
98 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
99 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
100 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
103 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, authentication failed.
104 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
105 self._downloader.report_warning(u'unable to log in: bad username or password')
107 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
108 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# POST the age-verification form; unlike login, a failure here is fatal.
112 def _confirm_age(self):
115 'action_confirm': 'Confirm',
117 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
119 self.report_age_confirmation()
120 compat_urllib_request.urlopen(request).read().decode('utf-8')
121 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
122 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# InfoExtractor hook: set language, log in, confirm age before any extraction.
# The bodies of the early-return branches (orig. 127/129/131) are missing here.
125 def _real_initialize(self):
126 if self._downloader is None:
128 if not self._set_language():
130 if not self._login():
# Query the timedtext list endpoint and build {language name: subtitle url}.
134 def _get_available_subtitles(self, video_id):
135 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
137 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
138 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
139 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
# Each track advertises a display name and a language code.
141 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
146 params = compat_urllib_parse.urlencode({
149 'fmt': self._downloader.params.get('subtitlesformat'),
151 url = u'http://www.youtube.com/api/timedtext?' + params
152 sub_lang_list[lang] = url
153 if not sub_lang_list:
154 self._downloader.report_warning(u'video doesn\'t have subtitles')
158 def _request_automatic_caption(self, video_id, webpage):
159 """We need the webpage for getting the captions url, pass it as an
160 argument to speed up the process."""
# Only the first requested subtitle language is honoured (default 'en').
161 sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0]
162 sub_format = self._downloader.params.get('subtitlesformat')
163 self.to_screen(u'%s: Looking for automatic captions' % video_id)
# The caption URL lives in the inline ytplayer.config JSON blob of the page.
164 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
165 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
167 self._downloader.report_warning(err_msg)
169 player_config = json.loads(mobj.group(1))
171 args = player_config[u'args']
172 caption_url = args[u'ttsurl']
173 timestamp = args[u'timestamp']
174 params = compat_urllib_parse.urlencode({
181 subtitles_url = caption_url + '&' + params
182 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
183 return {sub_lang: sub}
184 # An extractor error can be raise by the download process if there are
185 # no automatic captions but there are subtitles
186 except (KeyError, ExtractorError):
187 self._downloader.report_warning(err_msg)
# NOTE(review): non-contiguous excerpt -- the leading integer on each line is the
# original line number; gaps in the numbering mark lines missing from this view.
190 class YoutubeIE(YoutubeBaseInfoExtractor):
191 IE_DESC = u'YouTube.com'
# Body of the verbose _VALID_URL pattern; its opening assignment line
# (orig. 192-193) is missing from this excerpt.
194 (?:https?://)? # http(s):// (optional)
195 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
196 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
197 (?:.*?\#/)? # handle anchor (#/) redirect urls
198 (?: # the various things that can precede the ID:
199 (?:(?:v|embed|e)/) # v/ or embed/ or e/
200 |(?: # or the v= param in all its forms
201 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
202 (?:\?|\#!?) # the params delimiter ? or # or #!
203 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
207 |youtu\.be/ # just youtu.be/xxxx
209 )? # all until now is optional -> you can pass the naked ID
210 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
211 (?(1).+)? # if we found the ID, everything can follow
# Pulls the original target out of a next_url redirect parameter.
213 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
214 # Listed in order of quality
# itag preference lists, best first; the second variant ranks free (WebM)
# itags above their non-free counterparts at the same quality level.
215 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
216 # Apple HTTP Live Streaming
217 '96', '95', '94', '93', '92', '132', '151',
219 '85', '84', '102', '83', '101', '82', '100',
221 '138', '137', '248', '136', '247', '135', '246',
222 '245', '244', '134', '243', '133', '242', '160',
224 '141', '172', '140', '171', '139',
226 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
227 # Apple HTTP Live Streaming
228 '96', '95', '94', '93', '92', '132', '151',
230 '85', '102', '84', '101', '83', '100', '82',
232 '138', '248', '137', '247', '136', '246', '245',
233 '244', '135', '243', '134', '242', '133', '160',
235 '172', '141', '171', '140', '139',
# Container name -> itags available in that container (best first).
237 _video_formats_map = {
238 'flv': ['35', '34', '6', '5'],
239 '3gp': ['36', '17', '13'],
240 'mp4': ['38', '37', '22', '18'],
241 'webm': ['46', '45', '44', '43'],
# itag -> file extension and itag -> dimensions tables; their contents are
# almost entirely elided in this excerpt.
243 _video_extensions = {
265 # Apple HTTP Live Streaming
297 _video_dimensions = {
# _TESTS fixtures (list/dict delimiters partially elided in this excerpt).
379 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
380 u"file": u"BaW_jenozKc.mp4",
382 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
383 u"uploader": u"Philipp Hagemeister",
384 u"uploader_id": u"phihag",
385 u"upload_date": u"20121002",
386 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
390 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
391 u"file": u"1ltcDfZMA3U.flv",
392 u"note": u"Test VEVO video (#897)",
394 u"upload_date": u"20070518",
395 u"title": u"Maps - It Will Find You",
396 u"description": u"Music video by Maps performing It Will Find You.",
397 u"uploader": u"MuteUSA",
398 u"uploader_id": u"MuteUSA"
402 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
403 u"file": u"UxxajLWwzqY.mp4",
404 u"note": u"Test generic use_cipher_signature video (#897)",
406 u"upload_date": u"20120506",
407 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
408 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
409 u"uploader": u"Icona Pop",
410 u"uploader_id": u"IconaPop"
414 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
415 u"file": u"07FYdnEawAQ.mp4",
416 u"note": u"Test VEVO video with age protection (#956)",
418 u"upload_date": u"20130703",
419 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
420 u"description": u"md5:64249768eec3bc4276236606ea996373",
421 u"uploader": u"justintimberlakeVEVO",
422 u"uploader_id": u"justintimberlakeVEVO"
426 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
427 u'file': u'TGi3HqYrWHE.mp4',
428 u'note': u'm3u8 video',
430 u'title': u'Triathlon - Men - London 2012 Olympic Games',
431 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
432 u'uploader': u'olympic',
433 u'upload_date': u'20120807',
434 u'uploader_id': u'olympic',
437 u'skip_download': True,
# Defer playlist URLs to YoutubePlaylistIE; the @classmethod decorator line
# (orig. ~443) is missing from this excerpt.
444 def suitable(cls, url):
445 """Receives a URL and returns True if suitable for this IE."""
446 if YoutubePlaylistIE.suitable(url): return False
447 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Progress-reporting helpers (thin wrappers over to_screen).
449 def report_video_webpage_download(self, video_id):
450 """Report attempt to download video webpage."""
451 self.to_screen(u'%s: Downloading video webpage' % video_id)
453 def report_video_info_webpage_download(self, video_id):
454 """Report attempt to download video info webpage."""
455 self.to_screen(u'%s: Downloading video info webpage' % video_id)
457 def report_information_extraction(self, video_id):
458 """Report attempt to extract video information."""
459 self.to_screen(u'%s: Extracting video information' % video_id)
461 def report_unavailable_format(self, video_id, format):
462 """Report extracted video URL."""
463 self.to_screen(u'%s: Format %s not available' % (video_id, format))
465 def report_rtmp_download(self):
466 """Indicate the download will use the RTMP protocol."""
467 self.to_screen(u'RTMP download detected')
# Signature descrambling: each return below handles one encrypted-signature
# length via slicing/reordering.  The dispatching guards (presumably
# "if len(s) == N:" lines, orig. even-numbered lines 472-498) are missing
# from this excerpt -- TODO confirm against the full file.
469 def _decrypt_signature(self, s):
470 """Turn the encrypted s field into a working signature"""
473 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
475 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
477 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
479 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
481 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
483 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
485 return s[40] + s[82:43:-1] + s[22] + s[42:40:-1] + s[83] + s[39:22:-1] + s[0] + s[21:2:-1]
487 return s[81:36:-1] + s[0] + s[35:2:-1]
489 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
491 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82]
493 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
495 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
497 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
# Unknown lengths are unsupported; the error is marked retryable by wording.
500 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
502 def _decrypt_signature_age_gate(self, s):
503 # The videos with age protection use another player, so the algorithms
# One length-specific scramble (its guard line, orig. ~505, is missing),
# then fall back to the generic table above.
506 return s[2:63] + s[82] + s[64:82] + s[63]
508 # Fallback to the other algortihms
509 return self._decrypt_signature(s)
# Print a human-readable table of the available itags (used by --list-formats).
511 def _print_formats(self, formats):
512 print('Available formats:')
514 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
515 self._video_dimensions.get(x, '???'),
516 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
# Return the 11-character video id (capture group 2 of _VALID_URL); the
# None-check and return lines (orig. 520/523-524) are missing here.
518 def _extract_id(self, url):
519 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
521 raise ExtractorError(u'Invalid URL: %s' % url)
522 video_id = mobj.group(2)
525 def _get_video_url_list(self, url_map):
527 Transform a dictionary in the format {itag:url} to a list of (itag, url)
528 with the requested formats.
# Honours -f/--format: 'best' (default), 'worst', 'all'/'-1', or a
# slash-separated list of itags and/or container names.
530 req_format = self._downloader.params.get('format', None)
531 format_limit = self._downloader.params.get('format_limit', None)
532 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
533 if format_limit is not None and format_limit in available_formats:
534 format_list = available_formats[available_formats.index(format_limit):]
536 format_list = available_formats
537 existing_formats = [x for x in format_list if x in url_map]
538 if len(existing_formats) == 0:
539 raise ExtractorError(u'no known formats available for video')
540 if self._downloader.params.get('listformats', None):
541 self._print_formats(existing_formats)
543 if req_format is None or req_format == 'best':
544 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
545 elif req_format == 'worst':
546 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
547 elif req_format in ('-1', 'all'):
548 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
550 # Specific formats. We pick the first in a slash-delimeted sequence.
551 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
552 # available in the specified format. For example,
553 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
554 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
555 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
556 req_formats = req_format.split('/')
557 video_url_list = None
558 for rf in req_formats:
560 video_url_list = [(rf, url_map[rf])]
562 if rf in self._video_formats_map:
563 for srf in self._video_formats_map[rf]:
565 video_url_list = [(srf, url_map[srf])]
570 if video_url_list is None:
571 raise ExtractorError(u'requested format not available')
572 return video_url_list
# Parse an HLS (m3u8) manifest into an {itag: url} map; non-comment lines of
# the manifest are the variant URLs, each embedding its itag in the path.
574 def _extract_from_m3u8(self, manifest_url, video_id):
576 def _get_urls(_manifest):
577 lines = _manifest.split('\n')
578 urls = filter(lambda l: l and not l.startswith('#'),
581 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
582 formats_urls = _get_urls(manifest)
583 for format_url in formats_urls:
584 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
585 url_map[itag] = format_url
# Main extraction pipeline: fetch the watch page, obtain video_info (with an
# age-gate fallback path), decrypt stream signatures, and build the result
# dict(s).  Many connective lines are missing from this excerpt.
588 def _real_extract(self, url):
589 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
590 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
592 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
593 mobj = re.search(self._NEXT_URL_RE, url)
595 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
596 video_id = self._extract_id(url)
599 self.report_video_webpage_download(video_id)
600 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
601 request = compat_urllib_request.Request(url)
603 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
604 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
605 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
607 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
609 # Attempt to extract SWF player URL
610 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
612 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
617 self.report_video_info_webpage_download(video_id)
618 if re.search(r'player-age-gate-content">', video_webpage) is not None:
619 self.report_age_confirmation()
621 # We simulate the access to the video from www.youtube.com/v/{video_id}
622 # this can be viewed without login into Youtube
623 data = compat_urllib_parse.urlencode({'video_id': video_id,
627 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
631 video_info_url = 'https://www.youtube.com/get_video_info?' + data
632 video_info_webpage = self._download_webpage(video_info_url, video_id,
634 errnote='unable to download video info webpage')
635 video_info = compat_parse_qs(video_info_webpage)
# Non-age-gated path: try several 'el' variants until one yields a token.
638 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
639 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
640 % (video_id, el_type))
641 video_info_webpage = self._download_webpage(video_info_url, video_id,
643 errnote='unable to download video info webpage')
644 video_info = compat_parse_qs(video_info_webpage)
645 if 'token' in video_info:
647 if 'token' not in video_info:
648 if 'reason' in video_info:
649 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
651 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
653 # Check for "rental" videos
654 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
655 raise ExtractorError(u'"rental" videos not supported')
657 # Start extracting information
658 self.report_information_extraction(video_id)
661 if 'author' not in video_info:
662 raise ExtractorError(u'Unable to extract uploader name')
663 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# Uploader nickname is optional -- only a warning when it cannot be scraped.
666 video_uploader_id = None
667 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
669 video_uploader_id = mobj.group(1)
671 self._downloader.report_warning(u'unable to extract uploader nickname')
674 if 'title' not in video_info:
675 raise ExtractorError(u'Unable to extract video title')
676 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
679 # We try first to get a high quality image:
680 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
681 video_webpage, re.DOTALL)
682 if m_thumb is not None:
683 video_thumbnail = m_thumb.group(1)
684 elif 'thumbnail_url' not in video_info:
685 self._downloader.report_warning(u'unable to extract video thumbnail')
687 else: # don't panic if we can't find it
688 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the page and normalised to YYYYMMDD.
692 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
694 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
695 upload_date = unified_strdate(upload_date)
698 video_description = get_element_by_id("eow-description", video_webpage)
699 if video_description:
700 video_description = clean_html(video_description)
702 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
704 video_description = unescapeHTML(fd_mobj.group(1))
706 video_description = u''
709 video_subtitles = None
711 if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
712 video_subtitles = self._extract_subtitles(video_id)
713 elif self._downloader.params.get('writeautomaticsub', False):
714 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
716 if self._downloader.params.get('listsubtitles', False):
717 self._list_available_subtitles(video_id)
720 if 'length_seconds' not in video_info:
721 self._downloader.report_warning(u'unable to extract video duration')
724 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
726 # Decide which formats to download
729 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
731 raise ValueError('Could not find vevo ID')
732 info = json.loads(mobj.group(1))
734 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
735 # this signatures are encrypted
736 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
738 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
739 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
740 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
742 if 'url_encoded_fmt_stream_map' in video_info:
743 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
745 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
746 elif 'adaptive_fmts' in video_info:
747 if 'url_encoded_fmt_stream_map' in video_info:
748 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
750 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
# Three sources for stream URLs: a raw rtmp 'conn', the encoded fmt stream
# map (with optional signature decryption), or an HLS manifest ('hlsvp').
754 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
755 self.report_rtmp_download()
756 video_url_list = [(None, video_info['conn'][0])]
757 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
758 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
759 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
761 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
762 url_data = compat_parse_qs(url_data_str)
763 if 'itag' in url_data and 'url' in url_data:
764 url = url_data['url'][0]
765 if 'sig' in url_data:
766 url += '&signature=' + url_data['sig'][0]
767 elif 's' in url_data:
768 if self._downloader.params.get('verbose'):
# Verbose mode: report which player served the page, to help debug
# signature-descrambling failures.
771 player_version = self._search_regex(r'ad3-(.+?)\.swf',
772 video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND',
773 'flash player', fatal=False)
774 player = 'flash player %s' % player_version
776 player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
777 'html5 player', fatal=False)
778 parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
779 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
780 (len(s), parts_sizes, url_data['itag'][0], player))
781 encrypted_sig = url_data['s'][0]
783 signature = self._decrypt_signature_age_gate(encrypted_sig)
785 signature = self._decrypt_signature(encrypted_sig)
786 url += '&signature=' + signature
787 if 'ratebypass' not in url:
788 url += '&ratebypass=yes'
789 url_map[url_data['itag'][0]] = url
790 video_url_list = self._get_video_url_list(url_map)
791 if not video_url_list:
793 elif video_info.get('hlsvp'):
794 manifest_url = video_info['hlsvp'][0]
795 url_map = self._extract_from_m3u8(manifest_url, video_id)
796 video_url_list = self._get_video_url_list(url_map)
797 if not video_url_list:
801 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# One result dict per selected format; shared metadata is repeated for each.
804 for format_param, video_real_url in video_url_list:
806 video_extension = self._video_extensions.get(format_param, 'flv')
808 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
809 self._video_dimensions.get(format_param, '???'),
810 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
814 'url': video_real_url,
815 'uploader': video_uploader,
816 'uploader_id': video_uploader_id,
817 'upload_date': upload_date,
818 'title': video_title,
819 'ext': video_extension,
820 'format': video_format,
821 'thumbnail': video_thumbnail,
822 'description': video_description,
823 'player_url': player_url,
824 'subtitles': video_subtitles,
825 'duration': video_duration
# NOTE(review): non-contiguous excerpt -- the leading integer on each line is the
# original line number; gaps in the numbering mark lines missing from this view.
829 class YoutubePlaylistIE(InfoExtractor):
830 IE_DESC = u'YouTube.com playlists'
# Fragments of the verbose _VALID_URL pattern (opening/closing lines elided):
# it accepts full playlist/course/watch URLs or a bare PL/EC/UU/FL id.
836 (?:course|view_play_list|my_playlists|artist|playlist|watch)
837 \? (?:.*?&)*? (?:p|a|list)=
840 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
843 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
# GData v2 playlist feed, paginated via max-results/start-index.
845 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
847 IE_NAME = u'youtube:playlist'
# The @classmethod decorator line (orig. ~849) is missing from this excerpt.
850 def suitable(cls, url):
851 """Receives a URL and returns True if suitable for this IE."""
852 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
854 def _real_extract(self, url):
855 # Extract playlist id
856 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
858 raise ExtractorError(u'Invalid URL: %s' % url)
860 # Download playlist videos from API
# Either capture group may have matched, depending on the URL form.
861 playlist_id = mobj.group(1) or mobj.group(2)
864 for page_num in itertools.count(1):
865 start_index = self._MAX_RESULTS * (page_num - 1) + 1
# The GData API caps start-index at 1000; warn and stop there.
866 if start_index >= 1000:
867 self._downloader.report_warning(u'Max number of results reached')
869 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
870 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
873 response = json.loads(page)
874 except ValueError as err:
875 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
877 if 'feed' not in response:
878 raise ExtractorError(u'Got a malformed response from YouTube API')
879 playlist_title = response['feed']['title']['$t']
880 if 'entry' not in response['feed']:
881 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch-url) pairs; entries without a video id (e.g.
# deleted videos) are skipped by the 'media$group' guard.
884 for entry in response['feed']['entry']:
885 index = entry['yt$position']['$t']
886 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
889 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
# Restore playlist order by sorting on the position index.
892 videos = [v[1] for v in sorted(videos)]
894 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
895 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# NOTE(review): non-contiguous excerpt -- the leading integer on each line is the
# original line number; gaps in the numbering mark lines missing from this view.
898 class YoutubeChannelIE(InfoExtractor):
899 IE_DESC = u'YouTube.com channels'
900 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
# First page is plain HTML; subsequent pages come from the JSON c4_browse_ajax
# endpoint.  _MORE_PAGES_INDICATOR is a CSS-class sentinel meaning "more pages".
901 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
902 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
903 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
904 IE_NAME = u'youtube:channel'
# Scrape video ids from /watch?v= hrefs, de-duplicated in page order.
# The ids_in_page initialiser and return lines (orig. 907/911) are missing here.
906 def extract_videos_from_page(self, page):
908 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
909 if mobj.group(1) not in ids_in_page:
910 ids_in_page.append(mobj.group(1))
913 def _real_extract(self, url):
915 mobj = re.match(self._VALID_URL, url)
917 raise ExtractorError(u'Invalid URL: %s' % url)
919 # Download channel page
920 channel_id = mobj.group(1)
924 url = self._TEMPLATE_URL % (channel_id, pagenum)
925 page = self._download_webpage(url, channel_id,
926 u'Downloading page #%s' % pagenum)
928 # Extract video identifiers
929 ids_in_page = self.extract_videos_from_page(page)
930 video_ids.extend(ids_in_page)
932 # Download any subsequent channel pages using the json-based channel_ajax query
933 if self._MORE_PAGES_INDICATOR in page:
934 for pagenum in itertools.count(1):
935 url = self._MORE_PAGES_URL % (pagenum, channel_id)
936 page = self._download_webpage(url, channel_id,
937 u'Downloading page #%s' % pagenum)
939 page = json.loads(page)
941 ids_in_page = self.extract_videos_from_page(page['content_html'])
942 video_ids.extend(ids_in_page)
# Stop when the load-more widget no longer advertises further pages.
944 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
947 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
949 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
950 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
951 return [self.playlist_result(url_entries, channel_id)]
# NOTE(review): non-contiguous excerpt -- the leading integer on each line is the
# original line number; gaps in the numbering mark lines missing from this view.
954 class YoutubeUserIE(InfoExtractor):
955 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
956 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
# GData uploads feed, paginated in _GDATA_PAGE_SIZE chunks.
957 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
958 _GDATA_PAGE_SIZE = 50
959 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
960 IE_NAME = u'youtube:user'
# The @classmethod decorator line (orig. ~962) is missing from this excerpt.
963 def suitable(cls, url):
964 # Don't return True if the url can be extracted with other youtube
965 # extractor, the regex would is too permissive and it would match.
# Scans module globals for every other *IE class and defers to any that claims
# the URL; relies on all sibling extractors being defined in this module.
966 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
967 if any(ie.suitable(url) for ie in other_ies): return False
968 else: return super(YoutubeUserIE, cls).suitable(url)
970 def _real_extract(self, url):
972 mobj = re.match(self._VALID_URL, url)
974 raise ExtractorError(u'Invalid URL: %s' % url)
976 username = mobj.group(1)
978 # Download video ids using YouTube Data API. Result size per
979 # query is limited (currently to 50 videos) so we need to query
980 # page by page until there are no video ids - it means we got
# (rest of this comment and the video_ids initialiser, orig. 981-984, missing)
985 for pagenum in itertools.count(0):
986 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
988 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
989 page = self._download_webpage(gdata_url, username,
990 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
993 response = json.loads(page)
994 except ValueError as err:
995 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
997 # Extract video identifiers
# Each entry id is a URL whose last path component is the video id.
999 for entry in response['feed']['entry']:
1000 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1001 video_ids.extend(ids_in_page)
1003 # A little optimization - if current page is not
1004 # "full", ie. does not contain PAGE_SIZE video ids then
1005 # we can assume that this page is the last one - there
1006 # are no more ids on further pages - no need to query
1009 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1012 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1013 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
1014 return [self.playlist_result(url_results, playlist_title = username)]
# NOTE(review): non-contiguous excerpt -- the leading integer on each line is the
# original line number; gaps in the numbering mark lines missing from this view.
1016 class YoutubeSearchIE(SearchInfoExtractor):
1017 IE_DESC = u'YouTube.com searches'
# GData v2 search endpoint, 50 results per page.
1018 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1020 IE_NAME = u'youtube:search'
1021 _SEARCH_KEY = 'ytsearch'
1023 def report_download_page(self, query, pagenum):
1024 """Report attempt to download search page with given number."""
1025 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1027 def _get_n_results(self, query, n):
1028 """Get a specified number of results for a query"""
# The video_ids/pagenum/limit initialisers (orig. 1029-1033) are missing here;
# `limit` presumably starts at n and is clamped by totalItems below.
1034 while (50 * pagenum) < limit:
1035 self.report_download_page(query, pagenum+1)
1036 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1037 request = compat_urllib_request.Request(result_url)
1039 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1040 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1041 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1042 api_response = json.loads(data)['data']
1044 if not 'items' in api_response:
1045 raise ExtractorError(u'[youtube] No video results')
1047 new_ids = list(video['id'] for video in api_response['items'])
1048 video_ids += new_ids
# The service may report fewer total results than requested.
1050 limit = min(n, api_response['totalItems'])
1053 if len(video_ids) > n:
1054 video_ids = video_ids[:n]
1055 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1056 return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season YouTube shows (youtube.com/show/...)."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        # The show name is everything after /show/ in the URL.
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist; delegate
        # every one of them to the playlist extractor.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches
        ]
# NOTE(review): non-contiguous excerpt -- the leading integer on each line is the
# original line number; gaps in the numbering mark lines missing from this view.
1074 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1076 Base class for extractors that fetch info from
1077 http://www.youtube.com/feed_ajax
1078 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
# Personal feeds require a logged-in session.
1080 _LOGIN_REQUIRED = True
1082 # use action_load_personal_feed instead of action_load_system_feed
1083 _PERSONAL_FEED = False
# Presumably a @property (its decorator line, orig. ~1085, is missing here):
# builds the paginated feed_ajax URL, leaving %s for the paging value.
1086 def _FEED_TEMPLATE(self):
1087 action = 'action_load_system_feed'
1088 if self._PERSONAL_FEED:
1089 action = 'action_load_personal_feed'
1090 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
# IE_NAME accessor (its def/decorator lines, orig. 1091-1093, are missing).
1094 return u'youtube:%s' % self._FEED_NAME
# Body (orig. 1097-1098) missing -- presumably performs the login; verify.
1096 def _real_initialize(self):
1099 def _real_extract(self, url):
1101 # The step argument is available only in 2.7 or higher
# Page through the feed, scraping watch?v= ids from the returned HTML
# fragment until the service reports no further paging value.
1102 for i in itertools.count(0):
1103 paging = i*self._PAGING_STEP
1104 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1105 u'%s feed' % self._FEED_NAME,
1106 u'Downloading page %s' % i)
1107 info = json.loads(info)
1108 feed_html = info['feed_html']
1109 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
1110 ids = orderedSet(m.group(1) for m in m_ids)
1111 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1112 if info['paging'] is None:
1114 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Personal subscriptions feed, also reachable via the ":ytsubs" keyword."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Recommended-videos feed, also reachable via the ":ytrec" keyword."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Watch-later list; a personal feed, so it hits the personal endpoint."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Favourite-videos collection, resolved by delegating to the playlist extractor."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of an ordinary playlist; scrape it
        # and hand the actual extraction over to YoutubePlaylistIE.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')