X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=ab675415441f3d915dc95dfa55b093e776960081;hb=65488b820c6792f6c9f5e729c417985d501c8eb1;hp=8a5ef2e7028a58f6bc33d2f100197442ba34aca1;hpb=5e1eddb939a4fbc2a5ef10111c8141c842cf01d6;p=youtube-dl.git diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8a5ef2e70..ab6754154 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -33,6 +33,7 @@ from ..utils import ( int_or_none, orderedSet, parse_duration, + remove_start, smuggle_url, str_to_int, unescapeHTML, @@ -46,7 +47,7 @@ from ..utils import ( class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor' + _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -128,40 +129,24 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # Two-Factor # TODO add SMS and phone call support - these require making a request and then prompting the user - if re.search(r'(?i)]* id="gaia_secondfactorform"', login_results) is not None: - tfa_code = self._get_tfa_info() + if re.search(r'(?i)]* id="challenge"', login_results) is not None: + tfa_code = self._get_tfa_info('2-step verification code') - if tfa_code is None: - self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor ') - self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)') + if not tfa_code: + self._downloader.report_warning( + 'Two-factor authentication required. Provide it either interactively or with --twofactor ' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') return False - # Unlike the first login form, secTok and timeStmp are both required for the TFA form - - match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) - if match is None: - self._downloader.report_warning('Failed to get secTok - did the page structure change?') - secTok = match.group(1) - match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) - if match is None: - self._downloader.report_warning('Failed to get timeStmp - did the page structure change?') - timeStmp = match.group(1) - - tfa_form_strs = { - 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', - 'smsToken': '', - 'smsUserPin': tfa_code, - 'smsVerifyPin': 'Verify', - - 'PersistentCookie': 'yes', - 'checkConnection': '', - 'checkedDomains': 'youtube', - 'pstMsg': '1', - 'secTok': secTok, - 'timeStmp': timeStmp, - 'service': 'youtube', - 'hl': 'en_US', - } + tfa_code = remove_start(tfa_code, 'G-') + + tfa_form_strs = self._form_hidden_inputs('challenge', login_results) + + tfa_form_strs.update({ + 'Pin': tfa_code, + 'TrustDevice': 'on', + }) + tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items()) tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii') @@ -173,8 +158,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if tfa_results is False: return False - if re.search(r'(?i)]* id="gaia_secondfactorform"', tfa_results) is not None: - self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.') + if re.search(r'(?i)]* id="challenge"', tfa_results) is not None: + self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') return False if re.search(r'(?i)]* id="gaia_loginform"', tfa_results) is not None: self._downloader.report_warning('unable to log in - did the page structure change?') @@ -213,11 +198,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) + (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx) v= ) )) - |youtu\.be/ # just youtu.be/xxxx + |(?: + youtu\.be| # just youtu.be/xxxx + vid\.plus # or vid.plus/xxxx + )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID @@ -283,13 +271,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'}, '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, @@ -299,11 +287,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, @@ -331,6 +319,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -345,7 +334,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', + 'description': 'md5:782e8651347686cba06e58f71ab51773', + 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', + 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', + 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', } @@ -361,6 +353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:64249768eec3bc4276236606ea996373', 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', + 'age_limit': 18, } }, { @@ -376,6 +369,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'setindia' } }, + { + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY', + 'note': 'Use the first video ID in the URL', + 'info_dict': { + 'id': 'BaW_jenozKc', + 'ext': 'mp4', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'phihag', + 'upload_date': '20121002', + 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I', 'note': '256k DASH audio (format 141) via DASH manifest', @@ -417,7 +430,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:2acfda1b285bdd478ccec22f9918199d', + 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', @@ -451,6 +464,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', 'upload_date': '20140605', + 'age_limit': 18, }, }, # Age-gate video with encrypted signature @@ -464,6 +478,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'upload_date': '20110629', + 'age_limit': 18, }, }, # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) @@ -488,7 +503,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'lqQg6PlCWgI', 'ext': 'mp4', - 'upload_date': '20120731', + 'upload_date': '20120724', 'uploader_id': 'olympic', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympics', @@ -517,7 +532,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'qEJwOuvDf7I', 'info_dict': { 'id': 'qEJwOuvDf7I', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', 'description': '', 'upload_date': '20150404', @@ -612,6 +627,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + }, + { + 'url': 'http://vid.plus/FlRa-iH7PGw', + 'only_matching': True, } ] @@ -641,7 +660,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1171,6 +1190,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + def _extract_count(count_name): return str_to_int(self._search_regex( r'-%s-button[^>]+>]+class="yt-uix-button-content"[^>]*>([\d,]+)' @@ -1266,7 +1289,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - r'html5player-([^/]+?)(?:/html5player)?\.js', + r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -1339,6 +1362,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, + 'tags': video_tags, 'subtitles': video_subtitles, 'automatic_captions': automatic_captions, 'duration': video_duration, @@ -1745,7 +1769,7 @@ class YoutubeSearchURLIE(InfoExtractor): r'(?s)]+class="item-section"(.*?)', webpage, 'result HTML') part_codes = re.findall( - r'(?s)

(.*?)

', result_code) + r'(?s)]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)', result_code) entries = [] for part_code in part_codes: part_title = self._html_search_regex(