8 import xml.etree.ElementTree
10 from .common import InfoExtractor, SearchInfoExtractor
11 from .subtitles import SubtitlesInfoExtractor
17 compat_urllib_request,
28 class YoutubeBaseInfoExtractor(InfoExtractor):
29 """Provide base functions for Youtube extractors"""
# Endpoints shared by all YouTube extractors: Google account login page,
# the language-selection URL (forces English UI) and the age-gate form.
30 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
31 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
32 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name used to look up credentials in the user's ~/.netrc file.
33 _NETRC_MACHINE = 'youtube'
34 # If True it will raise an error if no login info is provided
35 _LOGIN_REQUIRED = False
def report_lang(self):
    """Announce on screen that the extractor is switching YouTube to English."""
    self.to_screen(u'Setting language')
41 def _set_language(self):
# Best-effort: fetch _LANG_URL so YouTube serves English pages; a failure
# only produces a warning, it does not abort extraction.
42 request = compat_urllib_request.Request(self._LANG_URL)
# NOTE(review): the `try:` line is missing from this excerpt; lines below
# are the guarded request and its handler.
45 compat_urllib_request.urlopen(request).read()
46 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
47 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# NOTE(review): fragment of _login(); the `def` line and several statements
# (try/return/else) are missing from this excerpt.
52 (username, password) = self._get_login_info()
53 # No authentication to be performed
55 if self._LOGIN_REQUIRED:
56 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
# Fetch the login page to harvest the hidden GALX / dsh form tokens.
59 request = compat_urllib_request.Request(self._LOGIN_URL)
61 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
62 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
63 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
68 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
71 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Partial view of the login form dict POSTed back to Google (other keys are
# missing from this excerpt).
77 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
81 u'PersistentCookie': u'yes',
83 u'bgresponse': u'js_disabled',
84 u'checkConnection': u'',
85 u'checkedDomains': u'youtube',
91 u'signIn': u'Sign in',
93 u'service': u'youtube',
97 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
99 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
100 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
101 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
104 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, the credentials
# were rejected.
105 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
106 self._downloader.report_warning(u'unable to log in: bad username or password')
108 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
109 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
113 def _confirm_age(self):
# NOTE(review): the start of the age_form dict is missing from this excerpt.
116 'action_confirm': 'Confirm',
# POST the confirmation form; unlike login, a failure here is fatal.
118 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
120 self.report_age_confirmation()
121 compat_urllib_request.urlopen(request).read().decode('utf-8')
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
123 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
126 def _real_initialize(self):
# Skip all network setup when no downloader is attached.
127 if self._downloader is None:
# NOTE(review): the bodies of these guards (presumably `return`) are
# missing from this excerpt.
129 if not self._set_language():
131 if not self._login():
136 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
137 IE_DESC = u'YouTube.com'
# NOTE(review): interior of the verbose _VALID_URL regex; its opening and
# closing lines are missing from this excerpt.
140 (?:https?://)? # http(s):// (optional)
141 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
142 tube\.majestyc\.net/|
143 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
144 (?:.*?\#/)? # handle anchor (#/) redirect urls
145 (?: # the various things that can precede the ID:
146 (?:(?:v|embed|e)/) # v/ or embed/ or e/
147 |(?: # or the v= param in all its forms
148 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
149 (?:\?|\#!?) # the params delimiter ? or # or #!
150 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
154 |youtu\.be/ # just youtu.be/xxxx
156 )? # all until now is optional -> you can pass the naked ID
157 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
158 (?(1).+)? # if we found the ID, everything can follow
160 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
161 # Listed in order of quality
162 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
163 # Apple HTTP Live Streaming
164 '96', '95', '94', '93', '92', '132', '151',
166 '85', '84', '102', '83', '101', '82', '100',
168 '138', '137', '248', '136', '247', '135', '246',
169 '245', '244', '134', '243', '133', '242', '160',
171 '141', '172', '140', '171', '139',
# Same itags but with free (webm) formats ranked ahead of their mp4 peers.
173 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
174 # Apple HTTP Live Streaming
175 '96', '95', '94', '93', '92', '132', '151',
177 '85', '102', '84', '101', '83', '100', '82',
179 '138', '248', '137', '247', '136', '246', '245',
180 '244', '135', '243', '134', '242', '133', '160',
182 '172', '141', '171', '140', '139',
# Container name -> itags in descending quality order.
184 _video_formats_map = {
185 'flv': ['35', '34', '6', '5'],
186 '3gp': ['36', '17', '13'],
187 'mp4': ['38', '37', '22', '18'],
188 'webm': ['46', '45', '44', '43'],
# NOTE(review): the bodies of _video_extensions and _video_dimensions, and
# the start of the _TESTS list, are missing from this excerpt.
190 _video_extensions = {
212 # Apple HTTP Live Streaming
244 _video_dimensions = {
326 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
327 u"file": u"BaW_jenozKc.mp4",
329 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
330 u"uploader": u"Philipp Hagemeister",
331 u"uploader_id": u"phihag",
332 u"upload_date": u"20121002",
333 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
337 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
338 u"file": u"1ltcDfZMA3U.flv",
339 u"note": u"Test VEVO video (#897)",
341 u"upload_date": u"20070518",
342 u"title": u"Maps - It Will Find You",
343 u"description": u"Music video by Maps performing It Will Find You.",
344 u"uploader": u"MuteUSA",
345 u"uploader_id": u"MuteUSA"
349 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
350 u"file": u"UxxajLWwzqY.mp4",
351 u"note": u"Test generic use_cipher_signature video (#897)",
353 u"upload_date": u"20120506",
354 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
355 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
356 u"uploader": u"Icona Pop",
357 u"uploader_id": u"IconaPop"
361 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
362 u"file": u"07FYdnEawAQ.mp4",
363 u"note": u"Test VEVO video with age protection (#956)",
365 u"upload_date": u"20130703",
366 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
367 u"description": u"md5:64249768eec3bc4276236606ea996373",
368 u"uploader": u"justintimberlakeVEVO",
369 u"uploader_id": u"justintimberlakeVEVO"
373 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
374 u'file': u'TGi3HqYrWHE.mp4',
375 u'note': u'm3u8 video',
377 u'title': u'Triathlon - Men - London 2012 Olympic Games',
378 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
379 u'uploader': u'olympic',
380 u'upload_date': u'20120807',
381 u'uploader_id': u'olympic',
384 u'skip_download': True,
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs are claimed by YoutubePlaylistIE, so refuse them here.
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_video_webpage_download(self, video_id):
    """Log that the watch page for *video_id* is being fetched."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Log that the get_video_info page for *video_id* is being fetched."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Log that metadata extraction for *video_id* has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Log that the requested *format* is not offered for *video_id*."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Log that the download will go over the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
416 def _decrypt_signature(self, s):
417 """Turn the encrypted s field into a working signature"""
# Each return below is a hard-coded character permutation selected by the
# length of the scrambled signature. NOTE(review): the `if len(s) == NN:`
# guard lines are missing from this excerpt, so the lengths each branch
# handles cannot be confirmed here.
420 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
422 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
424 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
426 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
428 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
430 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
432 return s[40] + s[82:43:-1] + s[22] + s[42:40:-1] + s[83] + s[39:22:-1] + s[0] + s[21:2:-1]
434 return s[81:36:-1] + s[0] + s[35:2:-1]
436 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
438 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
440 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
442 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
444 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
# Unknown signature length: fail with a retryable error.
447 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
449 def _decrypt_signature_age_gate(self, s):
450 # The videos with age protection use another player, so the algorithms
# NOTE(review): the length guard for the dedicated age-gate permutation is
# missing from this excerpt.
453 return s[2:63] + s[82] + s[64:82] + s[63]
455 # Fallback to the other algortihms
456 return self._decrypt_signature(s)
458 def _get_available_subtitles(self, video_id):
# Query Google's timedtext listing service for the subtitle tracks of
# this video. NOTE(review): the `try:` line is missing from this excerpt.
460 sub_list = self._download_webpage(
461 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
462 video_id, note=False)
463 except ExtractorError as err:
464 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
# Scrape (name, lang_code) pairs out of the XML track list.
466 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
471 params = compat_urllib_parse.urlencode({
472 'fmt': self._downloader.params.get('subtitlesformat'),
474 'fmt': self._downloader.params.get('subtitlesformat'),
476 url = u'http://www.youtube.com/api/timedtext?' + params
477 sub_lang_list[lang] = url
478 if not sub_lang_list:
479 self._downloader.report_warning(u'video doesn\'t have subtitles')
483 def _get_available_automatic_caption(self, video_id, webpage):
484 """We need the webpage for getting the captions url, pass it as an
485 argument to speed up the process."""
486 sub_format = self._downloader.params.get('subtitlesformat')
487 self.to_screen(u'%s: Looking for automatic captions' % video_id)
# The ttsurl/timestamp needed for ASR captions live in the inline
# ytplayer.config JSON blob of the watch page.
488 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
489 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
491 self._downloader.report_warning(err_msg)
493 player_config = json.loads(mobj.group(1))
495 args = player_config[u'args']
496 caption_url = args[u'ttsurl']
497 timestamp = args[u'timestamp']
498 # We get the available subtitles
499 list_params = compat_urllib_parse.urlencode({
504 list_url = caption_url + '&' + list_params
505 list_page = self._download_webpage(list_url, video_id)
506 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
# Only ASR (automatic speech recognition) tracks count as automatic
# captions; anything else means the video has none.
507 original_lang_node = caption_list.find('track')
508 if original_lang_node.attrib.get('kind') != 'asr' :
509 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
511 original_lang = original_lang_node.attrib['lang_code']
# Build one translated-caption URL per available target language.
514 for lang_node in caption_list.findall('target'):
515 sub_lang = lang_node.attrib['lang_code']
516 params = compat_urllib_parse.urlencode({
517 'lang': original_lang,
523 sub_lang_list[sub_lang] = caption_url + '&' + params
525 # An extractor error can be raise by the download process if there are
526 # no automatic captions but there are subtitles
527 except (KeyError, ExtractorError):
528 self._downloader.report_warning(err_msg)
531 def _print_formats(self, formats):
532 print('Available formats:')
# NOTE(review): the `for x in formats:` line is missing from this excerpt;
# each row prints itag, extension, dimensions and any special-itag note.
534 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
535 self._video_dimensions.get(x, '???'),
536 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
538 def _extract_id(self, url):
# Pull the 11-character video id out of group 2 of _VALID_URL.
539 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# NOTE(review): the `if mobj is None:` guard and the final `return` are
# missing from this excerpt.
541 raise ExtractorError(u'Invalid URL: %s' % url)
542 video_id = mobj.group(2)
545 def _get_video_url_list(self, url_map):
547 Transform a dictionary in the format {itag:url} to a list of (itag, url)
548 with the requested formats.
550 req_format = self._downloader.params.get('format', None)
551 format_limit = self._downloader.params.get('format_limit', None)
# Quality-ordered itag table; free (webm) codecs are preferred when the
# user asked for prefer_free_formats.
552 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
553 if format_limit is not None and format_limit in available_formats:
554 format_list = available_formats[available_formats.index(format_limit):]
556 format_list = available_formats
557 existing_formats = [x for x in format_list if x in url_map]
558 if len(existing_formats) == 0:
559 raise ExtractorError(u'no known formats available for video')
560 if self._downloader.params.get('listformats', None):
561 self._print_formats(existing_formats)
563 if req_format is None or req_format == 'best':
564 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
565 elif req_format == 'worst':
566 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
567 elif req_format in ('-1', 'all'):
568 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
570 # Specific formats. We pick the first in a slash-delimeted sequence.
571 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
572 # available in the specified format. For example,
573 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
574 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
575 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
576 req_formats = req_format.split('/')
577 video_url_list = None
578 for rf in req_formats:
# NOTE(review): the `if rf in url_map:` guard and loop `break`s are
# missing from this excerpt.
580 video_url_list = [(rf, url_map[rf])]
582 if rf in self._video_formats_map:
583 for srf in self._video_formats_map[rf]:
585 video_url_list = [(srf, url_map[srf])]
590 if video_url_list is None:
591 raise ExtractorError(u'requested format not available')
592 return video_url_list
594 def _extract_from_m3u8(self, manifest_url, video_id):
# Build an {itag: url} map from an HLS master manifest: every
# non-comment line of the m3u8 is a variant URL carrying itag/NN/.
596 def _get_urls(_manifest):
597 lines = _manifest.split('\n')
598 urls = filter(lambda l: l and not l.startswith('#'),
601 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
602 formats_urls = _get_urls(manifest)
603 for format_url in formats_urls:
604 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
605 url_map[itag] = format_url
608 def _real_extract(self, url):
# Friendly hint for shell users who passed an unquoted URL (the &v=...
# part got eaten by the shell).
609 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
610 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
612 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
613 mobj = re.search(self._NEXT_URL_RE, url)
615 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
616 video_id = self._extract_id(url)
# Fetch the watch page (forced to US English).
619 self.report_video_webpage_download(video_id)
620 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
621 request = compat_urllib_request.Request(url)
623 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
624 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
625 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
627 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
629 # Attempt to extract SWF player URL
630 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
632 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
637 self.report_video_info_webpage_download(video_id)
# Age-gated videos: hit get_video_info via the embedded-player entry
# points, which works without being logged in.
638 if re.search(r'player-age-gate-content">', video_webpage) is not None:
639 self.report_age_confirmation()
641 # We simulate the access to the video from www.youtube.com/v/{video_id}
642 # this can be viewed without login into Youtube
643 data = compat_urllib_parse.urlencode({'video_id': video_id,
647 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
651 video_info_url = 'https://www.youtube.com/get_video_info?' + data
652 video_info_webpage = self._download_webpage(video_info_url, video_id,
654 errnote='unable to download video info webpage')
655 video_info = compat_parse_qs(video_info_webpage)
# Non-age-gated path: try several `el=` variants until one response
# contains a token.
658 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
659 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
660 % (video_id, el_type))
661 video_info_webpage = self._download_webpage(video_info_url, video_id,
663 errnote='unable to download video info webpage')
664 video_info = compat_parse_qs(video_info_webpage)
665 if 'token' in video_info:
667 if 'token' not in video_info:
668 if 'reason' in video_info:
669 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
671 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
673 # Check for "rental" videos
674 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
675 raise ExtractorError(u'"rental" videos not supported')
677 # Start extracting information
678 self.report_information_extraction(video_id)
# uploader
681 if 'author' not in video_info:
682 raise ExtractorError(u'Unable to extract uploader name')
683 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id (nickname) is optional; only warn when absent.
686 video_uploader_id = None
687 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
689 video_uploader_id = mobj.group(1)
691 self._downloader.report_warning(u'unable to extract uploader nickname')
# title (mandatory)
694 if 'title' not in video_info:
695 raise ExtractorError(u'Unable to extract video title')
696 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
699 # We try first to get a high quality image:
700 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
701 video_webpage, re.DOTALL)
702 if m_thumb is not None:
703 video_thumbnail = m_thumb.group(1)
704 elif 'thumbnail_url' not in video_info:
705 self._downloader.report_warning(u'unable to extract video thumbnail')
707 else: # don't panic if we can't find it
708 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date, scraped from the watch page and normalized to YYYYMMDD.
712 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
714 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
715 upload_date = unified_strdate(upload_date)
# description: prefer the full page element, fall back to the meta tag.
718 video_description = get_element_by_id("eow-description", video_webpage)
719 if video_description:
720 video_description = clean_html(video_description)
722 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
724 video_description = unescapeHTML(fd_mobj.group(1))
726 video_description = u''
# subtitles
729 video_subtitles = self.extract_subtitles(video_id, video_webpage)
731 if self._downloader.params.get('listsubtitles', False):
732 self._list_available_subtitles(video_id, video_webpage)
735 if 'length_seconds' not in video_info:
736 self._downloader.report_warning(u'unable to extract video duration')
739 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
741 # Decide which formats to download
744 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
746 raise ValueError('Could not find vevo ID')
747 info = json.loads(mobj.group(1))
749 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
750 # this signatures are encrypted
751 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
753 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
754 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
755 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
757 if 'url_encoded_fmt_stream_map' in video_info:
758 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
760 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
761 elif 'adaptive_fmts' in video_info:
762 if 'url_encoded_fmt_stream_map' in video_info:
763 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
765 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
# Three sources for the actual media URLs: a raw rtmp conn, the
# url_encoded_fmt_stream_map, or an HLS (hlsvp) manifest.
769 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
770 self.report_rtmp_download()
771 video_url_list = [(None, video_info['conn'][0])]
772 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
773 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
774 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
776 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
777 url_data = compat_parse_qs(url_data_str)
778 if 'itag' in url_data and 'url' in url_data:
779 url = url_data['url'][0]
780 if 'sig' in url_data:
781 url += '&signature=' + url_data['sig'][0]
782 elif 's' in url_data:
# Verbose mode: report which player version produced the
# encrypted signature, to help debugging new scrambles.
783 if self._downloader.params.get('verbose'):
786 player_version = self._search_regex(r'ad3-(.+?)\.swf',
787 video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND',
788 'flash player', fatal=False)
789 player = 'flash player %s' % player_version
791 player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
792 'html5 player', fatal=False)
793 parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
794 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
795 (len(s), parts_sizes, url_data['itag'][0], player))
796 encrypted_sig = url_data['s'][0]
798 signature = self._decrypt_signature_age_gate(encrypted_sig)
800 signature = self._decrypt_signature(encrypted_sig)
801 url += '&signature=' + signature
802 if 'ratebypass' not in url:
803 url += '&ratebypass=yes'
804 url_map[url_data['itag'][0]] = url
805 video_url_list = self._get_video_url_list(url_map)
806 if not video_url_list:
808 elif video_info.get('hlsvp'):
809 manifest_url = video_info['hlsvp'][0]
810 url_map = self._extract_from_m3u8(manifest_url, video_id)
811 video_url_list = self._get_video_url_list(url_map)
812 if not video_url_list:
816 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected (itag, url) pair.
819 for format_param, video_real_url in video_url_list:
821 video_extension = self._video_extensions.get(format_param, 'flv')
823 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
824 self._video_dimensions.get(format_param, '???'),
825 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
829 'url': video_real_url,
830 'uploader': video_uploader,
831 'uploader_id': video_uploader_id,
832 'upload_date': upload_date,
833 'title': video_title,
834 'ext': video_extension,
835 'format': video_format,
836 'thumbnail': video_thumbnail,
837 'description': video_description,
838 'player_url': player_url,
839 'subtitles': video_subtitles,
840 'duration': video_duration
844 class YoutubePlaylistIE(InfoExtractor):
845 IE_DESC = u'YouTube.com playlists'
# NOTE(review): interior of the verbose _VALID_URL regex; it matches both
# full playlist page URLs (group 1) and naked playlist ids (group 2).
851 (?:course|view_play_list|my_playlists|artist|playlist|watch)
852 \? (?:.*?&)*? (?:p|a|list)=
855 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
858 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
# GData v2 JSON feed used to page through the playlist entries.
860 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
862 IE_NAME = u'youtube:playlist'
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
869 def _real_extract(self, url):
870 # Extract playlist id
871 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
873 raise ExtractorError(u'Invalid URL: %s' % url)
875 # Download playlist videos from API
876 playlist_id = mobj.group(1) or mobj.group(2)
# Page through the GData feed; the API caps start-index below 1000.
879 for page_num in itertools.count(1):
880 start_index = self._MAX_RESULTS * (page_num - 1) + 1
881 if start_index >= 1000:
882 self._downloader.report_warning(u'Max number of results reached')
884 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
885 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
888 response = json.loads(page)
889 except ValueError as err:
890 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
892 if 'feed' not in response:
893 raise ExtractorError(u'Got a malformed response from YouTube API')
894 playlist_title = response['feed']['title']['$t']
895 if 'entry' not in response['feed']:
896 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch-URL) pairs so videos can be re-sorted by
# their playlist position below.
899 for entry in response['feed']['entry']:
900 index = entry['yt$position']['$t']
901 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
904 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
907 videos = [v[1] for v in sorted(videos)]
909 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
910 return [self.playlist_result(url_results, playlist_id, playlist_title)]
913 class YoutubeChannelIE(InfoExtractor):
914 IE_DESC = u'YouTube.com channels'
915 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
916 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Marker string whose presence in a page means more pages exist.
917 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
918 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
919 IE_NAME = u'youtube:channel'
921 def extract_videos_from_page(self, page):
# Scrape watch-URL video ids, de-duplicated in first-seen order.
# NOTE(review): the list initialisation and `return` lines are missing
# from this excerpt.
923 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
924 if mobj.group(1) not in ids_in_page:
925 ids_in_page.append(mobj.group(1))
928 def _real_extract(self, url):
930 mobj = re.match(self._VALID_URL, url)
932 raise ExtractorError(u'Invalid URL: %s' % url)
934 # Download channel page
935 channel_id = mobj.group(1)
939 url = self._TEMPLATE_URL % (channel_id, pagenum)
940 page = self._download_webpage(url, channel_id,
941 u'Downloading page #%s' % pagenum)
943 # Extract video identifiers
944 ids_in_page = self.extract_videos_from_page(page)
945 video_ids.extend(ids_in_page)
947 # Download any subsequent channel pages using the json-based channel_ajax query
948 if self._MORE_PAGES_INDICATOR in page:
949 for pagenum in itertools.count(1):
950 url = self._MORE_PAGES_URL % (pagenum, channel_id)
951 page = self._download_webpage(url, channel_id,
952 u'Downloading page #%s' % pagenum)
954 page = json.loads(page)
956 ids_in_page = self.extract_videos_from_page(page['content_html'])
957 video_ids.extend(ids_in_page)
# Stop paging once the load-more widget disappears.
959 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
962 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
964 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
965 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
966 return [self.playlist_result(url_entries, channel_id)]
969 class YoutubeUserIE(InfoExtractor):
970 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
971 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
972 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData uploads feed, paged _GDATA_PAGE_SIZE entries at a time.
973 _GDATA_PAGE_SIZE = 50
974 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
975 IE_NAME = u'youtube:user'
def suitable(cls, url):
    """Claim *url* only when no other *IE class in this module wants it.

    The user regex is very permissive, so every sibling extractor gets
    first refusal before we fall back to the base-class check.
    """
    for name, klass in globals().items():
        if name.endswith('IE') and klass is not cls and klass.suitable(url):
            return False
    return super(YoutubeUserIE, cls).suitable(url)
985 def _real_extract(self, url):
987 mobj = re.match(self._VALID_URL, url)
989 raise ExtractorError(u'Invalid URL: %s' % url)
991 username = mobj.group(1)
993 # Download video ids using YouTube Data API. Result size per
994 # query is limited (currently to 50 videos) so we need to query
995 # page by page until there are no video ids - it means we got
1000 for pagenum in itertools.count(0):
1001 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1003 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1004 page = self._download_webpage(gdata_url, username,
1005 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1008 response = json.loads(page)
1009 except ValueError as err:
1010 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1012 # Extract video identifiers
# Each entry id has the form .../videos/<id>; take the last path part.
1014 for entry in response['feed']['entry']:
1015 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1016 video_ids.extend(ids_in_page)
1018 # A little optimization - if current page is not
1019 # "full", ie. does not contain PAGE_SIZE video ids then
1020 # we can assume that this page is the last one - there
1021 # are no more ids on further pages - no need to query
1024 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1027 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1028 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
1029 return [self.playlist_result(url_results, playlist_title = username)]
1031 class YoutubeSearchIE(SearchInfoExtractor):
1032 IE_DESC = u'YouTube.com searches'
# GData search endpoint, 50 results per page (jsonc format).
1033 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1035 IE_NAME = u'youtube:search'
# Prefix that triggers this extractor: e.g. "ytsearch10:query".
1036 _SEARCH_KEY = 'ytsearch'
def report_download_page(self, query, pagenum):
    """Announce the download of one page of search results."""
    message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
    self._downloader.to_screen(message)
1042 def _get_n_results(self, query, n):
1043 """Get a specified number of results for a query"""
# Page through the search API 50 ids at a time until n results (or the
# API's totalItems) are collected.
1049 while (50 * pagenum) < limit:
1050 self.report_download_page(query, pagenum+1)
1051 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1052 request = compat_urllib_request.Request(result_url)
1054 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1055 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1056 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1057 api_response = json.loads(data)['data']
1059 if not 'items' in api_response:
1060 raise ExtractorError(u'[youtube] No video results')
1062 new_ids = list(video['id'] for video in api_response['items'])
1063 video_ids += new_ids
# Shrink the limit if the API reports fewer total matches than n.
1065 limit = min(n, api_response['totalItems'])
1068 if len(video_ids) > n:
1069 video_ids = video_ids[:n]
1070 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1071 return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Resolve a show page into one playlist result per season."""
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist.
        season_links = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_links)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in season_links]
1089 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1091 Base class for extractors that fetch info from
1092 http://www.youtube.com/feed_ajax
1093 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
# Feeds are per-account, so login is mandatory for all subclasses.
1095 _LOGIN_REQUIRED = True
1097 # use action_load_personal_feed instead of action_load_system_feed
1098 _PERSONAL_FEED = False
def _FEED_TEMPLATE(self):
    """Build the feed_ajax URL template, leaving one %s slot for paging."""
    # Personal feeds (e.g. watch-later) require a different ajax action.
    if self._PERSONAL_FEED:
        action = 'action_load_personal_feed'
    else:
        action = 'action_load_system_feed'
    template = 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s'
    return template % (action, self._FEED_NAME)
# NOTE(review): this line belongs to an IE_NAME property whose `def` line
# is missing from this excerpt.
1109 return u'youtube:%s' % self._FEED_NAME
1111 def _real_initialize(self):
# NOTE(review): the body of _real_initialize is missing from this excerpt.
1114 def _real_extract(self, url):
1116 # The step argument is available only in 2.7 or higher
# Page through the feed_ajax endpoint until 'paging' comes back null.
1117 for i in itertools.count(0):
1118 paging = i*self._PAGING_STEP
1119 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1120 u'%s feed' % self._FEED_NAME,
1121 u'Downloading page %s' % i)
1122 info = json.loads(info)
1123 feed_html = info['feed_html']
1124 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
1125 ids = orderedSet(m.group(1) for m in m_ids)
1126 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1127 if info['paging'] is None:
1129 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
# Concrete feed extractors: each one only configures the base class.
1131 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1132 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1133 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1134 _FEED_NAME = 'subscriptions'
1135 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1137 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1138 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1139 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1140 _FEED_NAME = 'recommended'
1141 _PLAYLIST_TITLE = u'Youtube Recommended videos'
1143 class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1144 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1145 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1146 _FEED_NAME = 'watch_later'
1147 _PLAYLIST_TITLE = u'Youtube Watch Later'
# Watch-later is tied to the account, so it uses the personal-feed action.
1149 _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Locate the favourites playlist id and hand off to the playlist IE."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')