X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=c8d54f22a80ac447179305f50a85b42c483b946e;hb=4cb13d0d6af20906f973f0b281f9efba4bc65d41;hp=83b5840f76c7c9aa76760457e9b87ce86e83e680;hpb=15707c7e024f1f29e7abd8ddaa362196ef2d4af6;p=youtube-dl.git diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 83b5840f7..c8d54f22a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -44,6 +44,7 @@ from ..utils import ( unified_strdate, unsmuggle_url, uppercase_escape, + urlencode_postdata, ISO3166Utils, ) @@ -115,7 +116,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'hl': 'en_US', } - login_data = compat_urllib_parse_urlencode(login_form_strs).encode('ascii') + login_data = urlencode_postdata(login_form_strs) req = sanitized_Request(self._LOGIN_URL, login_data) login_results = self._download_webpage( @@ -124,6 +125,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if login_results is False: return False + error_msg = self._html_search_regex( + r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<', + login_results, 'error message', default=None) + if error_msg: + raise ExtractorError('Unable to login: %s' % error_msg, expected=True) + if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) @@ -148,7 +155,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'TrustDevice': 'on', }) - tfa_data = compat_urllib_parse_urlencode(tfa_form_strs).encode('ascii') + tfa_data = urlencode_postdata(tfa_form_strs) tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) tfa_results = self._download_webpage( @@ -233,7 +240,9 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): + for playlist_id in orderedSet(re.findall( + r']+class="[^"]*yt-lockup-title[^"]*"[^>]*>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', + content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -267,7 +276,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): )) |(?: youtu\.be| # just youtu.be/xxxx - vid\.plus # or vid.plus/xxxx + vid\.plus| # or vid.plus/xxxx + zwearz\.com/watch| # or zwearz.com/watch/xxxx )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) @@ -334,6 +344,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, @@ -489,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'youtube_include_dash_manifest': True, 'format': '141', }, + 'skip': 'format 141 not served anymore', }, # DASH manifest with encrypted signature { @@ -505,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'params': { 'youtube_include_dash_manifest': True, - 'format': '141', + 'format': '141/bestaudio[ext=m4a]', }, }, # JS player signature function name containing $ @@ -525,7 +538,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'params': { 'youtube_include_dash_manifest': True, - 'format': '141', + 'format': '141/bestaudio[ext=m4a]', }, }, # Controversy video @@ -606,7 +619,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic', 'license': 'Standard YouTube License', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympics', + 'uploader': 'Olympic', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', }, 'params': { @@ -659,7 +672,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'license': 'Standard YouTube License', - 'formats': 'mincount:33', + 'formats': 'mincount:32', }, }, # DASH manifest with segment_list @@ -679,7 +692,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'youtube_include_dash_manifest': True, 'format': '135', # bestvideo - } + }, + 'skip': 'This live event has ended.', }, { # Multifeed videos (multiple cameras), URL is for Main Camera @@ -750,11 +764,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', }, 'playlist_count': 2, + 'skip': 'Not multifeed anymore', }, { 'url': 'http://vid.plus/FlRa-iH7PGw', 'only_matching': True, }, + { + 'url': 'http://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', + 'only_matching': True, + }, { # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468) # Also tests cut-off URL expansion in video description (see @@ -798,6 +817,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video does not exist.', }, { # Video licensed under Creative Commons @@ -1312,10 +1332,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if video_description: video_description = re.sub(r'''(?x) ]*> + (?:[a-zA-Z-]+="[^"]*"\s+)*? + class="[^"]*"[^>]*> [^<]+\.{3}\s* ''', r'\1', video_description) @@ -1810,20 +1830,32 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id - url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading Youtube mix') + ids = [] + last_id = playlist_id[-11:] + for n in itertools.count(1): + url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + webpage = self._download_webpage( + url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) + new_ids = orderedSet(re.findall( + r'''(?xs)data-video-username=".*?".*? + href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), + webpage)) + # Fetch new pages until all the videos are repeated, it seems that + # there are always 51 unique videos. + new_ids = [_id for _id in new_ids if _id not in ids] + if not new_ids: + break + ids.extend(new_ids) + last_id = ids[-1] + + url_results = self._ids_to_results(ids) + search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) title_span = ( search_title('playlist-title') or search_title('title long-title') or search_title('title')) title = clean_html(title_span) - ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) @@ -1876,7 +1908,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): if video: return video - if playlist_id.startswith('RD') or playlist_id.startswith('UL'): + if playlist_id.startswith(('RD', 'UL', 'PU')): # Mixes require a custom extraction process return self._extract_mix(playlist_id) @@ -1960,7 +1992,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P[A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/|c/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P[A-Za-z0-9_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos' IE_NAME = 'youtube:user' @@ -1973,14 +2005,17 @@ class YoutubeUserIE(YoutubeChannelIE): }, { 'url': 'ytuser:phihag', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/gametrailers', + 'only_matching': True, }] @classmethod def suitable(cls, url): # Don't return True if the url can be extracted with other youtube # extractor, the regex would is too permissive and it would match. - other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_ies): + other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) + if any(ie.suitable(url) for ie in other_yt_ies): return False else: return super(YoutubeUserIE, cls).suitable(url) @@ -2113,10 +2148,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(InfoExtractor): +class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P[^&]+)(?:[&]|$)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P[^"]+))?' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -2131,32 +2167,8 @@ class YoutubeSearchURLIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - result_code = self._search_regex( - r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') - - part_codes = re.findall( - r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code) - entries = [] - for part_code in part_codes: - part_title = self._html_search_regex( - [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) - part_url_snippet = self._html_search_regex( - r'(?s)href="([^"]+)"', part_code, 'item URL') - part_url = compat_urlparse.urljoin( - 'https://www.youtube.com/', part_url_snippet) - entries.append({ - '_type': 'url', - 'url': part_url, - 'title': part_title, - }) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': query, - } + return self.playlist_result(self._process_page(webpage), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):