X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=08f63be96ca3a84bf18100df637127bfae042b68;hb=23ad44b57bb62a76414daf630d85c7544e0b2728;hp=0257ee2f9eaefa69fefd4c2c4f87d452d3a80d68;hpb=69ea8ca42cd4fc62fdd4e7f18defb3b23da618d2;p=youtube-dl.git diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0257ee2f9..08f63be96 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,7 +26,7 @@ from ..utils import ( get_element_by_attribute, ExtractorError, int_or_none, - PagedList, + OnDemandPagedList, unescapeHTML, unified_strdate, orderedSet, @@ -185,14 +185,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._download_webpage( req, None, - note='Confirming age', errnote='Unable to confirm age') - return True + note='Confirming age', errnote='Unable to confirm age', + fatal=False) def _real_initialize(self): if self._downloader is None: return - if not self._set_language(): - return + if self._get_login_info()[0] is not None: + if not self._set_language(): + return if not self._login(): return self._confirm_age() @@ -211,7 +212,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/) # v/ or embed/ or e/ + (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! @@ -273,6 +274,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'}, # Dash mp4 audio '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, @@ -286,6 +290,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'}, '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, @@ -295,6 +300,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, @@ -503,7 +510,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'signature=([$a-zA-Z]+)', jscode, + r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode, 'Initial JS player signature function name') jsi = JSInterpreter(jscode) @@ -655,6 +662,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Get video webpage url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id + pref_cookies = [ + c for c in self._downloader.cookiejar + if c.domain == '.youtube.com' and c.name == 'PREF'] + for pc in pref_cookies: + if 'hl=' in pc.value: + pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value) + else: + if pc.value: + pc.value += '&' + pc.value += 'hl=en' video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL @@ -667,7 +684,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Get video info self.report_video_info_webpage_download(video_id) if re.search(r'player-age-gate-content">', video_webpage) is not None: - self.report_age_confirmation() age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube @@ -675,12 +691,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'), + r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage(video_info_url, video_id, - note=False, - errnote='unable to download video info webpage') + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) else: age_gate = False @@ -928,7 +945,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest - if (self._downloader.params.get('youtube_include_dash_manifest', False)): + if self._downloader.params.get('youtube_include_dash_manifest', True): try: # The DASH manifest used needs to be the one from the original video_webpage. # The one found in get_video_info seems to be using different signatures. @@ -974,7 +991,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): existing_format.update(f) except (ExtractorError, KeyError) as e: - self.report_warning('Skipping DASH manifest: %s' % e, video_id) + self.report_warning('Skipping DASH manifest: %r' % e, video_id) self._sort_formats(formats) @@ -1005,7 +1022,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): (?:\w+\.)? youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch) + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) \? (?:.*?&)*? (?:p|a|list)= | p/ ) @@ -1026,6 +1043,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', 'info_dict': { 'title': 'ytdl test PL', + 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', }, 'playlist_count': 3, }, { @@ -1045,7 +1063,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'note': 'issue #673', 'url': 'PLBB231211A4F62143', 'info_dict': { - 'title': 'Team Fortress 2 (Class-based LP)', + 'title': '[OLD]Team Fortress 2 (Class-based LP)', }, 'playlist_mincount': 26, }, { @@ -1061,6 +1079,20 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'title': 'YDL_safe_search', }, 'playlist_count': 2, + }, { + 'note': 'embedded', + 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA15', + } + }, { + 'note': 'Embedded SWF player', + 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA7', + } }] def _real_initialize(self): @@ -1160,16 +1192,25 @@ class YoutubeTopListIE(YoutubePlaylistIE): IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"' ' (Example: "yttoplist:music:Top Tracks")') _VALID_URL = r'yttoplist:(?P.*?):(?P.*?)$' - _TESTS = [] + _TESTS = [{ + 'url': 'yttoplist:music:Trending', + 'playlist_mincount': 5, + 'skip': 'Only works for logged-in users', + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) channel = mobj.group('chann') title = mobj.group('title') query = compat_urllib_parse.urlencode({'title': title}) - playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query) - channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title) - link = self._html_search_regex(playlist_re, channel_page, 'list') + channel_page = self._download_webpage( + 'https://www.youtube.com/%s' % channel, title) + link = self._html_search_regex( + r'''(?x) + <a\s+href="([^"]+)".*?>\s* + <span\s+class="branded-page-module-title-text">\s* + <span[^>]*>.*?%s.*?</span>''' % re.escape(query), + channel_page, 'list') url = compat_urlparse.urljoin('https://www.youtube.com/', link) video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' @@ -1195,6 +1236,11 @@ class YoutubeChannelIE(InfoExtractor): _MORE_PAGES_INDICATOR = 'yt-uix-load-more' _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = 'youtube:channel' + _TESTS = [{ + 'note': 'paginated channel', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'playlist_mincount': 91, + }] def extract_videos_from_page(self, page): ids_in_page = [] @@ -1253,6 +1299,17 @@ class YoutubeUserIE(InfoExtractor): _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' IE_NAME = 'youtube:user' + _TESTS = [{ + 'url': 'https://www.youtube.com/user/TheLinuxFoundation', + 'playlist_mincount': 320, + 'info_dict': { + 'title': 'TheLinuxFoundation', + } + }, { + 'url': 'ytuser:phihag', + 'only_matching': True, + }] + @classmethod def suitable(cls, url): # Don't return True if the url can be extracted with other youtube @@ -1302,7 +1359,7 @@ class YoutubeUserIE(InfoExtractor): 'id': video_id, 'title': title, } - url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) + url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE) return self.playlist_result(url_results, playlist_title=username) @@ -1361,6 +1418,13 @@ class YoutubeSearchURLIE(InfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)' + _TESTS = [{ + 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', + 'playlist_mincount': 5, + 'info_dict': { + 'title': 'youtube-dl test video', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -1395,17 +1459,38 @@ class YoutubeSearchURLIE(InfoExtractor): class YoutubeShowIE(InfoExtractor): IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://www\.youtube\.com/show/(.*)' + _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' + _TESTS = [{ + 'url': 'http://www.youtube.com/show/airdisasters', + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'airdisasters', + 'title': 'Air Disasters', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - show_name = mobj.group(1) - webpage = self._download_webpage(url, show_name, 'Downloading show webpage') + playlist_id = mobj.group('id') + webpage = self._download_webpage( + url, playlist_id, 'Downloading show webpage') # There's one playlist for each season of the show m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) - self.to_screen('%s: Found %s seasons' % (show_name, len(m_seasons))) - return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons] + self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons))) + entries = [ + self.url_result( + 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist') + for season in m_seasons + ] + title = self._og_search_title(webpage, fatal=False) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'entries': entries, + } class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):