9 from .common import InfoExtractor, SearchInfoExtractor
10 from .subtitles import SubtitlesIE
16 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account sign-in endpoint used by the login routine below.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Forces English/US pages so later regex-based scraping sees stable markup.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Age-verification confirmation endpoint used by _confirm_age.
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request the English-language cookie; failure is only a warning."""
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the enclosing try: line is missing from this listing.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Non-fatal: extraction may still work without the language cookie.
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

    # NOTE(review): the `def _login(self):` header is missing from this
    # listing; the lines below are the body of the login routine.
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Google's sign-in form embeds anti-forgery tokens (GALX, dsh) that
        # must be echoed back together with the credentials.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Fields of the sign-in form that get POSTed back to _LOGIN_URL
        # (surrounding dict syntax elided in this listing):
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A response still containing the login form means auth failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _confirm_age(self):
        """POST the age-verification form; raises ExtractorError on failure."""
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Unlike language/login, failing here is fatal for the download.
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_initialize(self):
        # Nothing to initialize when no downloader is attached.
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeSubtitlesIE(SubtitlesIE):
    # Subtitle handling shared by the YouTube extractors.

    def _get_available_subtitles(self, video_id):
        """Build a dict mapping subtitle language code -> timedtext URL."""
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        # Each <track> element carries a human-readable name and a lang code.
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        params = compat_urllib_parse.urlencode({
            # Requested subtitle format, taken from the user's options.
            'fmt': self._downloader.params.get('subtitlesformat'),
        url = u'http://www.youtube.com/api/timedtext?' + params
        sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')

    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_lang = self._downloader.params.get('subtitleslang') or 'en'
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives in the inline player configuration JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
            self._downloader.report_warning(err_msg)
        player_config = json.loads(mobj.group(1))
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        params = compat_urllib_parse.urlencode({
        subtitles_url = caption_url + '&' + params
        sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
        return {sub_lang: sub}
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose-mode URL regex; the last capturing group is the video ID
    # (retrieved via group(2) in _extract_id).
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    # Captures the original URL out of a redirecting (e.g. age-gate) URL.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13',
                          '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, but with free (non-MP4/FLV) formats ranked ahead of their
    # non-free counterparts at comparable quality.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13',
                                      '95', '94', '93', '92', '132', '151',
                                      '85', '102', '84', '101', '83', '100', '82',
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      '172', '141', '171', '140', '139',
    # Maps itag -> container/file extension (contents elided in this listing).
    _video_extensions = {
        # videos that use m3u8
    # Maps itag -> human-readable dimensions (contents elided in this listing).
    _video_dimensions = {
    # _TESTS entries; surrounding list/dict punctuation elided in this listing.
        u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file":  u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
        u"file":  u"1ltcDfZMA3U.flv",
        u"note": u"Test VEVO video (#897)",
        u"upload_date": u"20070518",
        u"title": u"Maps - It Will Find You",
        u"description": u"Music video by Maps performing It Will Find You.",
        u"uploader": u"MuteUSA",
        u"uploader_id": u"MuteUSA"
        u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file":  u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:b085c9804f5ab69f4adea963a2dceb3c",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file":  u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
        u'file': u'TGi3HqYrWHE.mp4',
        u'note': u'm3u8 video',
        u'title': u'Triathlon - Men - London 2012 Olympic Games',
        u'description': u'- Men -  TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
        u'uploader': u'olympic',
        u'upload_date': u'20120807',
        u'uploader_id': u'olympic',
        u'skip_download': True,
433 def suitable(cls, url):
434 """Receives a URL and returns True if suitable for this IE."""
435 if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
436 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
438 def report_video_webpage_download(self, video_id):
439 """Report attempt to download video webpage."""
440 self.to_screen(u'%s: Downloading video webpage' % video_id)
442 def report_video_info_webpage_download(self, video_id):
443 """Report attempt to download video info webpage."""
444 self.to_screen(u'%s: Downloading video info webpage' % video_id)
446 def report_information_extraction(self, video_id):
447 """Report attempt to extract video information."""
448 self.to_screen(u'%s: Extracting video information' % video_id)
450 def report_unavailable_format(self, video_id, format):
451 """Report extracted video URL."""
452 self.to_screen(u'%s: Format %s not available' % (video_id, format))
454 def report_rtmp_download(self):
455 """Indicate the download will use the RTMP protocol."""
456 self.to_screen(u'RTMP download detected')
    def _decrypt_signature(self, s):
        """Turn the encrypted s field into a working signature"""
        # Each return below undoes the scrambling for one specific length of
        # the encrypted signature string; the branch is selected on len(s).
        # NOTE(review): the `if len(s) == N:` headers are missing from this
        # listing; only the permutation expressions are visible.
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[5:20] + s[2] + s[21:]
            return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27]
            return s[83:27:-1] + s[0] + s[26:5:-1] + s[2:0:-1] + s[27]
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        # Unknown signature length: ask the user to retry (the site may have
        # changed players).
        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _decrypt_signature_age_gate(self, s):
        # The videos with age protection use another player, so the algorithms
        # can differ from the ones handled by _decrypt_signature.
            return s[2:63] + s[82] + s[64:82] + s[63]
            # Fallback to the other algorithms
            return self._decrypt_signature(s)
    def _print_formats(self, formats):
        """Pretty-print each itag with its extension, dimensions and notes."""
        print('Available formats:')
        # NOTE(review): the loop header over `formats` (binding x) is missing
        # from this listing.
            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                self._video_dimensions.get(x, '???'),
                ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
    def _extract_id(self, url):
        """Extract the video ID from *url* using _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 of _VALID_URL captures the video ID.
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Format preference order depends on the prefer_free_formats option.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Honour the quality ceiling: keep the limit and everything after
            # it in the (quality-ordered) list.
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimited sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Build an {itag: url} map from an HLS (m3u8) formats manifest."""
        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            # Keep only the variant URLs, skipping blank and '#' tag lines.
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # Each variant URL carries its itag as an 'itag/<n>/' path part.
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
    def _real_extract(self, url):
        # Warn if the URL looks like a watch URL whose v= parameter was eaten
        # by an unquoted shell '&'.
        if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
            self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like  youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply  youtube-dl BaW_jenozKc  ).')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page (forced to English/US for stable scraping).
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JS backslash-escaped URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Download the video info page.
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            # Non-age-gated case: try several 'el' variants until one of the
            # responses contains a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (scraped from the watch page, best-effort)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date (scraped from the watch page)
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize separators/whitespace before parsing the date string.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_subtitles(video_id)
        elif self._downloader.params.get('writeautomaticsub', False):
            video_subtitles = self._request_automatic_caption(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
                if 'url_encoded_fmt_stream_map' in video_info:
                    # Merge the adaptive formats into the main stream map.
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
            elif 'adaptive_fmts' in video_info:
                if 'url_encoded_fmt_stream_map' in video_info:
                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            # Build the itag -> direct URL map, decrypting signatures on the
            # way when necessary.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (unencrypted) signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        if self._downloader.params.get('verbose'):
                            # Report which player produced the encrypted sig,
                            # to help debugging new scrambling schemes.
                            player_version = self._search_regex(r'ad3-(.+?)\.swf',
                                video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND',
                                'flash player', fatal=False)
                            player = 'flash player %s' % player_version
                            player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                           (len(s), parts_sizes, url_data['itag'][0], player))
                        encrypted_sig = url_data['s'][0]
                            signature = self._decrypt_signature_age_gate(encrypted_sig)
                            signature = self._decrypt_signature(encrypted_sig)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
        elif video_info.get('hlsvp'):
            # HLS delivery: formats come from the m3u8 manifest instead.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            video_url_list = self._get_video_url_list(url_map)
            if not video_url_list:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Produce one result entry per selected (itag, url) pair.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'),
                                              ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    # Verbose-mode URL regex; the capturing groups hold the playlist ID
    # (see mobj.group(1) or mobj.group(2) in _real_extract).
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # GData API endpoint used to page through the playlist entries.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            # Stop paging at the API's result-index ceiling.
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'media$player' in entry['media$group']:
                    videos.append((index, entry['media$group']['media$player']['url']))

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker whose presence indicates more pages can be loaded.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # AJAX endpoint returning JSON with further pages of channel videos.
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect the video IDs linked from *page*, de-duplicated, in order."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the widget no longer offers a "load more" control.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum number of uploads returned per GData request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    # GData search endpoint; results are requested 50 at a time.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never keep requesting beyond what the API says exists.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one YoutubePlaylist result per season of the show at *url*.

        Raises ExtractorError (instead of crashing with AttributeError on a
        failed match) when *url* does not match _VALID_URL, consistent with
        the other extractors in this module.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        # Delegate each season playlist to YoutubePlaylistIE.
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are account-specific, so credentials are mandatory.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Builds the feed_ajax URL, leaving one %s slot for the paging offset.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    # NOTE(review): the IE_NAME property header is missing from this listing;
    # the next line is its body.
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # NOTE(review): method body elided in this listing.

    def _real_extract(self, url):
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # Preserve first-occurrence order while dropping duplicate IDs.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the last page of the feed.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is tied to the account, so use the personal-feed action
    # in _FEED_TEMPLATE.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page is backed by an ordinary playlist: scrape its
        # id out of the page and delegate extraction to YoutubePlaylistIE.
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')