X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=c27f2cd9523ebe7a7bb7bf2f4a38bd22b779051f;hb=a115e07594ccb7749ca108c889978510c7df126e;hp=1ab429162e4c23dc72ea200e660805db55ab6bc9;hpb=b477fc13144be90ba7bbec3386f2e48fa3e7d604;p=youtube-dl.git diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1ab429162..c27f2cd95 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -77,7 +77,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _set_language(self): self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&hl=en', + '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en', # YouTube sets the expire time to about two months expire_time=time.time() + 2 * 30 * 24 * 3600) @@ -303,7 +303,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, @@ -1898,6 +1898,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} + microformat = try_get( + player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} + video_title = video_info.get('title', [None])[0] or video_details.get('title') if not video_title: self._downloader.report_warning('Unable to extract video title') @@ -1927,7 +1930,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ''', replace_url, video_description) video_description = clean_html(video_description) else: - video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription') + video_description = video_details.get('shortDescription') or self._html_search_meta('description', video_webpage) if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): @@ -1975,6 +1978,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): view_count = extract_view_count(video_info) if view_count is None and video_details: view_count = int_or_none(video_details.get('viewCount')) + if view_count is None and microformat: + view_count = int_or_none(microformat.get('viewCount')) if is_live is None: is_live = bool_or_none(video_details.get('isLive')) @@ -2225,6 +2230,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if mobj is not None: video_uploader_id = mobj.group('uploader_id') video_uploader_url = mobj.group('uploader_url') + else: + owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) + if owner_profile_url: + video_uploader_id = self._search_regex( + r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', + default=None) + video_uploader_url = owner_profile_url channel_id = ( str_or_none(video_details.get('channelId')) @@ -2271,6 +2283,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): [r'(?s)id="eow-date.*?>(.*?)', r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], video_webpage, 'upload date', default=None) + if not upload_date: + upload_date = microformat.get('publishDate') or microformat.get('uploadDate') upload_date = unified_strdate(upload_date) video_license = self._html_search_regex( @@ -2342,17 +2356,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): m_cat_container = self._search_regex( r'(?s)]*>\s*Category\s*\s*]*>(.*?)', video_webpage, 'categories', default=None) + category = None if m_cat_container: category = self._html_search_regex( r'(?s)(.*?)', m_cat_container, 'category', default=None) - video_categories = None if category is None else [category] - else: - video_categories = None + if not category: + category = try_get( + microformat, lambda x: x['category'], compat_str) + video_categories = None if category is None else [category] video_tags = [ unescapeHTML(m.group('content')) for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + if not video_tags: + video_tags = try_get(video_details, lambda x: x['keywords'], list) def _extract_count(count_name): return str_to_int(self._search_regex( @@ -2758,7 +2776,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): ids = [] last_id = playlist_id[-11:] for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) webpage = self._download_webpage( url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) new_ids = orderedSet(re.findall( @@ -3098,7 +3116,7 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P[^/]+)/playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P[^/]+)/playlists' IE_NAME = 'youtube:playlists' _TESTS = [{ @@ -3124,6 +3142,9 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'title': 'Chem Player', }, 'skip': 'Blocked', + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, }] @@ -3268,7 +3289,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape, headers=self._YOUTUBE_CLIENT_HEADERS)