8 import xml.etree.ElementTree
10 from .common import InfoExtractor, SearchInfoExtractor
11 from .subtitles import SubtitlesInfoExtractor
17 compat_urllib_request,
28 class YoutubeBaseInfoExtractor(InfoExtractor):
29 """Provide base functions for Youtube extractors"""
30 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
31 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
32 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
33 _NETRC_MACHINE = 'youtube'
34 # If True it will raise an error if no login info is provided
35 _LOGIN_REQUIRED = False
37 def report_lang(self):
38 """Report attempt to set language."""
39 self.to_screen(u'Setting language')
41 def _set_language(self):
42 request = compat_urllib_request.Request(self._LANG_URL)
45 compat_urllib_request.urlopen(request).read()
46 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
47 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
52 (username, password) = self._get_login_info()
53 # No authentication to be performed
55 if self._LOGIN_REQUIRED:
56 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
59 request = compat_urllib_request.Request(self._LOGIN_URL)
61 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
62 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
63 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
68 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
71 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
77 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
81 u'PersistentCookie': u'yes',
83 u'bgresponse': u'js_disabled',
84 u'checkConnection': u'',
85 u'checkedDomains': u'youtube',
91 u'signIn': u'Sign in',
93 u'service': u'youtube',
97 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
99 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
100 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
101 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
104 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
105 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
106 self._downloader.report_warning(u'unable to log in: bad username or password')
108 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
109 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
113 def _confirm_age(self):
116 'action_confirm': 'Confirm',
118 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
120 self.report_age_confirmation()
121 compat_urllib_request.urlopen(request).read().decode('utf-8')
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
123 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
126 def _real_initialize(self):
127 if self._downloader is None:
129 if not self._set_language():
131 if not self._login():
136 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
137 IE_DESC = u'YouTube.com'
140 (?:https?://)? # http(s):// (optional)
141 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
142 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
143 (?:.*?\#/)? # handle anchor (#/) redirect urls
144 (?: # the various things that can precede the ID:
145 (?:(?:v|embed|e)/) # v/ or embed/ or e/
146 |(?: # or the v= param in all its forms
147 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
148 (?:\?|\#!?) # the params delimiter ? or # or #!
149 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
153 |youtu\.be/ # just youtu.be/xxxx
155 )? # all until now is optional -> you can pass the naked ID
156 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
157 (?(1).+)? # if we found the ID, everything can follow
159 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
160 # Listed in order of quality
161 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
162 # Apple HTTP Live Streaming
163 '96', '95', '94', '93', '92', '132', '151',
165 '85', '84', '102', '83', '101', '82', '100',
167 '138', '137', '248', '136', '247', '135', '246',
168 '245', '244', '134', '243', '133', '242', '160',
170 '141', '172', '140', '171', '139',
172 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
173 # Apple HTTP Live Streaming
174 '96', '95', '94', '93', '92', '132', '151',
176 '85', '102', '84', '101', '83', '100', '82',
178 '138', '248', '137', '247', '136', '246', '245',
179 '244', '135', '243', '134', '242', '133', '160',
181 '172', '141', '171', '140', '139',
183 _video_formats_map = {
184 'flv': ['35', '34', '6', '5'],
185 '3gp': ['36', '17', '13'],
186 'mp4': ['38', '37', '22', '18'],
187 'webm': ['46', '45', '44', '43'],
189 _video_extensions = {
211 # Apple HTTP Live Streaming
243 _video_dimensions = {
325 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
326 u"file": u"BaW_jenozKc.mp4",
328 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
329 u"uploader": u"Philipp Hagemeister",
330 u"uploader_id": u"phihag",
331 u"upload_date": u"20121002",
332 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
336 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
337 u"file": u"1ltcDfZMA3U.flv",
338 u"note": u"Test VEVO video (#897)",
340 u"upload_date": u"20070518",
341 u"title": u"Maps - It Will Find You",
342 u"description": u"Music video by Maps performing It Will Find You.",
343 u"uploader": u"MuteUSA",
344 u"uploader_id": u"MuteUSA"
348 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
349 u"file": u"UxxajLWwzqY.mp4",
350 u"note": u"Test generic use_cipher_signature video (#897)",
352 u"upload_date": u"20120506",
353 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
354 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
355 u"uploader": u"Icona Pop",
356 u"uploader_id": u"IconaPop"
360 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
361 u"file": u"07FYdnEawAQ.mp4",
362 u"note": u"Test VEVO video with age protection (#956)",
364 u"upload_date": u"20130703",
365 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
366 u"description": u"md5:64249768eec3bc4276236606ea996373",
367 u"uploader": u"justintimberlakeVEVO",
368 u"uploader_id": u"justintimberlakeVEVO"
372 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
373 u'file': u'TGi3HqYrWHE.mp4',
374 u'note': u'm3u8 video',
376 u'title': u'Triathlon - Men - London 2012 Olympic Games',
377 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
378 u'uploader': u'olympic',
379 u'upload_date': u'20120807',
380 u'uploader_id': u'olympic',
383 u'skip_download': True,
390 def suitable(cls, url):
391 """Receives a URL and returns True if suitable for this IE."""
392 if YoutubePlaylistIE.suitable(url): return False
393 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
395 def report_video_webpage_download(self, video_id):
396 """Report attempt to download video webpage."""
397 self.to_screen(u'%s: Downloading video webpage' % video_id)
399 def report_video_info_webpage_download(self, video_id):
400 """Report attempt to download video info webpage."""
401 self.to_screen(u'%s: Downloading video info webpage' % video_id)
403 def report_information_extraction(self, video_id):
404 """Report attempt to extract video information."""
405 self.to_screen(u'%s: Extracting video information' % video_id)
407 def report_unavailable_format(self, video_id, format):
408 """Report extracted video URL."""
409 self.to_screen(u'%s: Format %s not available' % (video_id, format))
411 def report_rtmp_download(self):
412 """Indicate the download will use the RTMP protocol."""
413 self.to_screen(u'RTMP download detected')
415 def _decrypt_signature(self, s):
416 """Turn the encrypted s field into a working signature"""
419 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
421 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
423 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
425 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
427 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
429 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
431 return s[40] + s[82:43:-1] + s[22] + s[42:40:-1] + s[83] + s[39:22:-1] + s[0] + s[21:2:-1]
433 return s[81:36:-1] + s[0] + s[35:2:-1]
435 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
437 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
439 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
441 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
443 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
446 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
448 def _decrypt_signature_age_gate(self, s):
449 # The videos with age protection use another player, so the algorithms
452 return s[2:63] + s[82] + s[64:82] + s[63]
454 # Fallback to the other algortihms
455 return self._decrypt_signature(s)
457 def _get_available_subtitles(self, video_id):
459 sub_list = self._download_webpage(
460 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
461 video_id, note=False)
462 except ExtractorError as err:
463 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
465 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
470 params = compat_urllib_parse.urlencode({
473 'fmt': self._downloader.params.get('subtitlesformat'),
475 url = u'http://www.youtube.com/api/timedtext?' + params
476 sub_lang_list[lang] = url
477 if not sub_lang_list:
478 self._downloader.report_warning(u'video doesn\'t have subtitles')
482 def _get_available_automatic_caption(self, video_id, webpage):
483 """We need the webpage for getting the captions url, pass it as an
484 argument to speed up the process."""
485 sub_format = self._downloader.params.get('subtitlesformat')
486 self.to_screen(u'%s: Looking for automatic captions' % video_id)
487 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
488 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
490 self._downloader.report_warning(err_msg)
492 player_config = json.loads(mobj.group(1))
494 args = player_config[u'args']
495 caption_url = args[u'ttsurl']
496 timestamp = args[u'timestamp']
497 # We get the available subtitles
498 list_params = compat_urllib_parse.urlencode({
503 list_url = caption_url + '&' + list_params
504 list_page = self._download_webpage(list_url, video_id)
505 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
506 original_lang_node = caption_list.find('track')
507 if original_lang_node.attrib.get('kind') != 'asr' :
508 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
510 original_lang = original_lang_node.attrib['lang_code']
513 for lang_node in caption_list.findall('target'):
514 sub_lang = lang_node.attrib['lang_code']
515 params = compat_urllib_parse.urlencode({
516 'lang': original_lang,
522 sub_lang_list[sub_lang] = caption_url + '&' + params
524 # An extractor error can be raise by the download process if there are
525 # no automatic captions but there are subtitles
526 except (KeyError, ExtractorError):
527 self._downloader.report_warning(err_msg)
530 def _print_formats(self, formats):
531 print('Available formats:')
533 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
534 self._video_dimensions.get(x, '???'),
535 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
537 def _extract_id(self, url):
538 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
540 raise ExtractorError(u'Invalid URL: %s' % url)
541 video_id = mobj.group(2)
544 def _get_video_url_list(self, url_map):
546 Transform a dictionary in the format {itag:url} to a list of (itag, url)
547 with the requested formats.
549 req_format = self._downloader.params.get('format', None)
550 format_limit = self._downloader.params.get('format_limit', None)
551 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
552 if format_limit is not None and format_limit in available_formats:
553 format_list = available_formats[available_formats.index(format_limit):]
555 format_list = available_formats
556 existing_formats = [x for x in format_list if x in url_map]
557 if len(existing_formats) == 0:
558 raise ExtractorError(u'no known formats available for video')
559 if self._downloader.params.get('listformats', None):
560 self._print_formats(existing_formats)
562 if req_format is None or req_format == 'best':
563 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
564 elif req_format == 'worst':
565 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
566 elif req_format in ('-1', 'all'):
567 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
569 # Specific formats. We pick the first in a slash-delimeted sequence.
570 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
571 # available in the specified format. For example,
572 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
573 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
574 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
575 req_formats = req_format.split('/')
576 video_url_list = None
577 for rf in req_formats:
579 video_url_list = [(rf, url_map[rf])]
581 if rf in self._video_formats_map:
582 for srf in self._video_formats_map[rf]:
584 video_url_list = [(srf, url_map[srf])]
589 if video_url_list is None:
590 raise ExtractorError(u'requested format not available')
591 return video_url_list
593 def _extract_from_m3u8(self, manifest_url, video_id):
595 def _get_urls(_manifest):
596 lines = _manifest.split('\n')
597 urls = filter(lambda l: l and not l.startswith('#'),
600 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
601 formats_urls = _get_urls(manifest)
602 for format_url in formats_urls:
603 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
604 url_map[itag] = format_url
607 def _real_extract(self, url):
608 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
609 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
611 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
612 mobj = re.search(self._NEXT_URL_RE, url)
614 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
615 video_id = self._extract_id(url)
618 self.report_video_webpage_download(video_id)
619 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
620 request = compat_urllib_request.Request(url)
622 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
623 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
624 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
626 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
628 # Attempt to extract SWF player URL
629 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
631 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
636 self.report_video_info_webpage_download(video_id)
637 if re.search(r'player-age-gate-content">', video_webpage) is not None:
638 self.report_age_confirmation()
640 # We simulate the access to the video from www.youtube.com/v/{video_id}
641 # this can be viewed without login into Youtube
642 data = compat_urllib_parse.urlencode({'video_id': video_id,
646 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
650 video_info_url = 'https://www.youtube.com/get_video_info?' + data
651 video_info_webpage = self._download_webpage(video_info_url, video_id,
653 errnote='unable to download video info webpage')
654 video_info = compat_parse_qs(video_info_webpage)
657 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
658 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
659 % (video_id, el_type))
660 video_info_webpage = self._download_webpage(video_info_url, video_id,
662 errnote='unable to download video info webpage')
663 video_info = compat_parse_qs(video_info_webpage)
664 if 'token' in video_info:
666 if 'token' not in video_info:
667 if 'reason' in video_info:
668 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
670 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
672 # Check for "rental" videos
673 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
674 raise ExtractorError(u'"rental" videos not supported')
676 # Start extracting information
677 self.report_information_extraction(video_id)
680 if 'author' not in video_info:
681 raise ExtractorError(u'Unable to extract uploader name')
682 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
685 video_uploader_id = None
686 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
688 video_uploader_id = mobj.group(1)
690 self._downloader.report_warning(u'unable to extract uploader nickname')
693 if 'title' not in video_info:
694 raise ExtractorError(u'Unable to extract video title')
695 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
698 # We try first to get a high quality image:
699 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
700 video_webpage, re.DOTALL)
701 if m_thumb is not None:
702 video_thumbnail = m_thumb.group(1)
703 elif 'thumbnail_url' not in video_info:
704 self._downloader.report_warning(u'unable to extract video thumbnail')
706 else: # don't panic if we can't find it
707 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
711 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
713 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
714 upload_date = unified_strdate(upload_date)
717 video_description = get_element_by_id("eow-description", video_webpage)
718 if video_description:
719 video_description = clean_html(video_description)
721 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
723 video_description = unescapeHTML(fd_mobj.group(1))
725 video_description = u''
728 video_subtitles = self.extract_subtitles(video_id, video_webpage)
730 if self._downloader.params.get('listsubtitles', False):
731 self._list_available_subtitles(video_id, video_webpage)
734 if 'length_seconds' not in video_info:
735 self._downloader.report_warning(u'unable to extract video duration')
738 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
740 # Decide which formats to download
743 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
745 raise ValueError('Could not find vevo ID')
746 info = json.loads(mobj.group(1))
748 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
749 # this signatures are encrypted
750 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
752 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
753 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
754 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
756 if 'url_encoded_fmt_stream_map' in video_info:
757 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
759 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
760 elif 'adaptive_fmts' in video_info:
761 if 'url_encoded_fmt_stream_map' in video_info:
762 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
764 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
768 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
769 self.report_rtmp_download()
770 video_url_list = [(None, video_info['conn'][0])]
771 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
772 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
773 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
775 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
776 url_data = compat_parse_qs(url_data_str)
777 if 'itag' in url_data and 'url' in url_data:
778 url = url_data['url'][0]
779 if 'sig' in url_data:
780 url += '&signature=' + url_data['sig'][0]
781 elif 's' in url_data:
782 if self._downloader.params.get('verbose'):
785 player_version = self._search_regex(r'ad3-(.+?)\.swf',
786 video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND',
787 'flash player', fatal=False)
788 player = 'flash player %s' % player_version
790 player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
791 'html5 player', fatal=False)
792 parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
793 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
794 (len(s), parts_sizes, url_data['itag'][0], player))
795 encrypted_sig = url_data['s'][0]
797 signature = self._decrypt_signature_age_gate(encrypted_sig)
799 signature = self._decrypt_signature(encrypted_sig)
800 url += '&signature=' + signature
801 if 'ratebypass' not in url:
802 url += '&ratebypass=yes'
803 url_map[url_data['itag'][0]] = url
804 video_url_list = self._get_video_url_list(url_map)
805 if not video_url_list:
807 elif video_info.get('hlsvp'):
808 manifest_url = video_info['hlsvp'][0]
809 url_map = self._extract_from_m3u8(manifest_url, video_id)
810 video_url_list = self._get_video_url_list(url_map)
811 if not video_url_list:
815 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
818 for format_param, video_real_url in video_url_list:
820 video_extension = self._video_extensions.get(format_param, 'flv')
822 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
823 self._video_dimensions.get(format_param, '???'),
824 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
828 'url': video_real_url,
829 'uploader': video_uploader,
830 'uploader_id': video_uploader_id,
831 'upload_date': upload_date,
832 'title': video_title,
833 'ext': video_extension,
834 'format': video_format,
835 'thumbnail': video_thumbnail,
836 'description': video_description,
837 'player_url': player_url,
838 'subtitles': video_subtitles,
839 'duration': video_duration
843 class YoutubePlaylistIE(InfoExtractor):
844 IE_DESC = u'YouTube.com playlists'
850 (?:course|view_play_list|my_playlists|artist|playlist|watch)
851 \? (?:.*?&)*? (?:p|a|list)=
854 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
857 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
859 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
861 IE_NAME = u'youtube:playlist'
864 def suitable(cls, url):
865 """Receives a URL and returns True if suitable for this IE."""
866 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
868 def _real_extract(self, url):
869 # Extract playlist id
870 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
872 raise ExtractorError(u'Invalid URL: %s' % url)
874 # Download playlist videos from API
875 playlist_id = mobj.group(1) or mobj.group(2)
878 for page_num in itertools.count(1):
879 start_index = self._MAX_RESULTS * (page_num - 1) + 1
880 if start_index >= 1000:
881 self._downloader.report_warning(u'Max number of results reached')
883 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
884 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
887 response = json.loads(page)
888 except ValueError as err:
889 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
891 if 'feed' not in response:
892 raise ExtractorError(u'Got a malformed response from YouTube API')
893 playlist_title = response['feed']['title']['$t']
894 if 'entry' not in response['feed']:
895 # Number of videos is a multiple of self._MAX_RESULTS
898 for entry in response['feed']['entry']:
899 index = entry['yt$position']['$t']
900 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
903 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
906 videos = [v[1] for v in sorted(videos)]
908 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
909 return [self.playlist_result(url_results, playlist_id, playlist_title)]
912 class YoutubeChannelIE(InfoExtractor):
913 IE_DESC = u'YouTube.com channels'
914 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
915 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
916 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
917 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
918 IE_NAME = u'youtube:channel'
920 def extract_videos_from_page(self, page):
922 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
923 if mobj.group(1) not in ids_in_page:
924 ids_in_page.append(mobj.group(1))
927 def _real_extract(self, url):
929 mobj = re.match(self._VALID_URL, url)
931 raise ExtractorError(u'Invalid URL: %s' % url)
933 # Download channel page
934 channel_id = mobj.group(1)
938 url = self._TEMPLATE_URL % (channel_id, pagenum)
939 page = self._download_webpage(url, channel_id,
940 u'Downloading page #%s' % pagenum)
942 # Extract video identifiers
943 ids_in_page = self.extract_videos_from_page(page)
944 video_ids.extend(ids_in_page)
946 # Download any subsequent channel pages using the json-based channel_ajax query
947 if self._MORE_PAGES_INDICATOR in page:
948 for pagenum in itertools.count(1):
949 url = self._MORE_PAGES_URL % (pagenum, channel_id)
950 page = self._download_webpage(url, channel_id,
951 u'Downloading page #%s' % pagenum)
953 page = json.loads(page)
955 ids_in_page = self.extract_videos_from_page(page['content_html'])
956 video_ids.extend(ids_in_page)
958 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
961 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
963 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
964 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
965 return [self.playlist_result(url_entries, channel_id)]
968 class YoutubeUserIE(InfoExtractor):
969 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
970 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
971 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
972 _GDATA_PAGE_SIZE = 50
973 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
974 IE_NAME = u'youtube:user'
977 def suitable(cls, url):
978 # Don't return True if the url can be extracted with other youtube
979 # extractor, the regex would is too permissive and it would match.
980 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
981 if any(ie.suitable(url) for ie in other_ies): return False
982 else: return super(YoutubeUserIE, cls).suitable(url)
984 def _real_extract(self, url):
986 mobj = re.match(self._VALID_URL, url)
988 raise ExtractorError(u'Invalid URL: %s' % url)
990 username = mobj.group(1)
992 # Download video ids using YouTube Data API. Result size per
993 # query is limited (currently to 50 videos) so we need to query
994 # page by page until there are no video ids - it means we got
999 for pagenum in itertools.count(0):
1000 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1002 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1003 page = self._download_webpage(gdata_url, username,
1004 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1007 response = json.loads(page)
1008 except ValueError as err:
1009 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1011 # Extract video identifiers
1013 for entry in response['feed']['entry']:
1014 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1015 video_ids.extend(ids_in_page)
1017 # A little optimization - if current page is not
1018 # "full", ie. does not contain PAGE_SIZE video ids then
1019 # we can assume that this page is the last one - there
1020 # are no more ids on further pages - no need to query
1023 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1026 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1027 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
1028 return [self.playlist_result(url_results, playlist_title = username)]
1030 class YoutubeSearchIE(SearchInfoExtractor):
1031 IE_DESC = u'YouTube.com searches'
1032 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1034 IE_NAME = u'youtube:search'
1035 _SEARCH_KEY = 'ytsearch'
1037 def report_download_page(self, query, pagenum):
1038 """Report attempt to download search page with given number."""
1039 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1041 def _get_n_results(self, query, n):
1042 """Get a specified number of results for a query"""
1048 while (50 * pagenum) < limit:
1049 self.report_download_page(query, pagenum+1)
1050 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1051 request = compat_urllib_request.Request(result_url)
1053 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1054 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1055 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1056 api_response = json.loads(data)['data']
1058 if not 'items' in api_response:
1059 raise ExtractorError(u'[youtube] No video results')
1061 new_ids = list(video['id'] for video in api_response['items'])
1062 video_ids += new_ids
1064 limit = min(n, api_response['totalItems'])
1067 if len(video_ids) > n:
1068 video_ids = video_ids[:n]
1069 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1070 return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Resolve a show page into one playlist result per season."""
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            results.append(
                self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
1088 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1090 Base class for extractors that fetch info from
1091 http://www.youtube.com/feed_ajax
1092 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1094 _LOGIN_REQUIRED = True
1096 # use action_load_personal_feed instead of action_load_system_feed
1097 _PERSONAL_FEED = False
1100 def _FEED_TEMPLATE(self):
1101 action = 'action_load_system_feed'
1102 if self._PERSONAL_FEED:
1103 action = 'action_load_personal_feed'
1104 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
1108 return u'youtube:%s' % self._FEED_NAME
1110 def _real_initialize(self):
1113 def _real_extract(self, url):
1115 # The step argument is available only in 2.7 or higher
1116 for i in itertools.count(0):
1117 paging = i*self._PAGING_STEP
1118 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1119 u'%s feed' % self._FEED_NAME,
1120 u'Downloading page %s' % i)
1121 info = json.loads(info)
1122 feed_html = info['feed_html']
1123 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
1124 ids = orderedSet(m.group(1) for m in m_ids)
1125 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1126 if info['paging'] is None:
1128 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Fix: add the missing space in the user-facing description
    # ('keyword(requires' -> 'keyword (requires') so the help text matches
    # the sibling feed extractors (recommended / watch later).
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Recommended-videos feed; inherits _PERSONAL_FEED = False from the base
    # class, so it is fetched via the action_load_system_feed endpoint.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'

    # Watch Later is tied to the account, so the base class must use the
    # action_load_personal_feed endpoint instead of the system-feed one.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Fetch the favourites page and delegate to the playlist extractor."""
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds the id of the playlist backing the list.
        playlist_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')