X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=6782bbff6addf710ab17bec5baf73a2e11c50201;hb=d828f3a5500b29f30c702e6aa34add6e29370b2e;hp=d09128555f87f962339be4eb13f51d95c76e615f;hpb=c5e8d7af0ed867d70502491e3a80ee09b78ed2ce;p=youtube-dl.git diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d09128555..6782bbff6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,12 +1,11 @@ # coding: utf-8 -from __future__ import absolute_import import json import netrc import re import socket -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( compat_http_client, compat_parse_qs, @@ -35,7 +34,7 @@ class YoutubeIE(InfoExtractor): (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms - (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:watch|movie(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) v= @@ -59,7 +58,7 @@ class YoutubeIE(InfoExtractor): '18': 'mp4', '22': 'mp4', '37': 'mp4', - '38': 'video', # You actually don't know if this will be MOV, AVI or whatever + '38': 'mp4', '43': 'webm', '44': 'webm', '45': 'webm', @@ -82,6 +81,44 @@ class YoutubeIE(InfoExtractor): '46': '1080x1920', } IE_NAME = u'youtube' + _TESTS = [ + { + u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc", + u"file": u"BaW_jenozKc.mp4", + u"info_dict": { + u"title": u"youtube-dl test video \"'/\\ä↭𝕐", + u"uploader": u"Philipp Hagemeister", + u"uploader_id": u"phihag", + u"upload_date": u"20121002", + u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." 
+ } + }, + { + u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U", + u"file": u"1ltcDfZMA3U.flv", + u"note": u"Test VEVO video (#897)", + u"info_dict": { + u"upload_date": u"20070518", + u"title": u"Maps - It Will Find You", + u"description": u"Music video by Maps performing It Will Find You.", + u"uploader": u"MuteUSA", + u"uploader_id": u"MuteUSA" + } + }, + { + u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY", + u"file": u"UxxajLWwzqY.mp4", + u"note": u"Test generic use_cipher_signature video (#897)", + u"info_dict": { + u"upload_date": u"20120506", + u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", + u"description": u"md5:b085c9804f5ab69f4adea963a2dceb3c", + u"uploader": u"IconaPop", + u"uploader_id": u"IconaPop" + } + } + ] + @classmethod def suitable(cls, url): @@ -130,16 +167,26 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - @staticmethod - def _decrypt_signature(s): - """Decrypt the key the two subkeys must have a length of 43""" - (a,b) = s.split('.') - if len(a) != 43 or len(b) != 43: - raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid') - b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40] - a = a[-40:] - s_dec = '.'.join((a,b))[::-1] - return s_dec + def _decrypt_signature(self, s): + """Turn the encrypted s field into a working signature""" + + if len(s) == 88: + return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] + elif len(s) == 87: + return s[62] + s[82:62:-1] + s[83] + s[61:52:-1] + s[0] + s[51:2:-1] + elif len(s) == 86: + return s[2:63] + s[82] + s[64:82] + s[63] + elif len(s) == 85: + return s[76] + s[82:76:-1] + s[83] + s[75:60:-1] + s[0] + s[59:50:-1] + s[1] + s[49:2:-1] + elif len(s) == 84: + return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26] + elif len(s) == 83: + return s[52] + s[81:55:-1] + s[2] 
+ s[54:52:-1] + s[82] + s[51:36:-1] + s[55] + s[35:2:-1] + s[36] + elif len(s) == 82: + return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34] + + else: + raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) def _get_available_subtitles(self, video_id): self.report_video_subtitles_download(video_id) @@ -343,7 +390,7 @@ class YoutubeIE(InfoExtractor): request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) try: self.report_age_confirmation() - age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') + compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) @@ -355,6 +402,9 @@ class YoutubeIE(InfoExtractor): return video_id def _real_extract(self, url): + if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url): + self._downloader.report_warning(u'Did you forget to quote the URL? 
Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).') + # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: @@ -454,14 +504,13 @@ class YoutubeIE(InfoExtractor): if video_subtitles: (sub_error, sub_lang, sub) = video_subtitles[0] if sub_error: - # We try with the automatic captions - video_subtitles = self._request_automatic_caption(video_id, video_webpage) - (sub_error_auto, sub_lang, sub) = video_subtitles[0] - if sub is not None: - pass - else: - # We report the original error - self._downloader.report_warning(sub_error) + self._downloader.report_warning(sub_error) + + if self._downloader.params.get('writeautomaticsub', False): + video_subtitles = self._request_automatic_caption(video_id, video_webpage) + (sub_error, sub_lang, sub) = video_subtitles[0] + if sub_error: + self._downloader.report_warning(sub_error) if self._downloader.params.get('allsubtitles', False): video_subtitles = self._extract_all_subtitles(video_id) @@ -471,7 +520,7 @@ class YoutubeIE(InfoExtractor): self._downloader.report_warning(sub_error) if self._downloader.params.get('listsubtitles', False): - sub_lang_list = self._list_available_subtitles(video_id) + self._list_available_subtitles(video_id) return if 'length_seconds' not in video_info: @@ -480,19 +529,20 @@ class YoutubeIE(InfoExtractor): else: video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) - # token - video_token = compat_urllib_parse.unquote_plus(video_info['token'][0]) - # Decide which formats to download req_format = self._downloader.params.get('format', None) try: mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) + if not mobj: + raise ValueError('Could not find vevo ID') info = json.loads(mobj.group(1)) args = info['args'] - if 
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # The query string below hard-codes ``max-results=50``; _PAGE_SIZE must
    # stay in sync with it because the paging arithmetic relies on it.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _PAGE_SIZE = 50
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Return a playlist result with at most ``n`` videos for ``query``.

        Result pages of ``_PAGE_SIZE`` entries are requested from
        ``_API_URL`` until either ``n`` entries have been asked for or the
        ``totalItems`` count reported by the API has been covered.

        Raises ExtractorError if a page cannot be downloaded or if a
        response carries no ``items`` entry.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while self._PAGE_SIZE * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            # The first page is requested with start-index 1, the second
            # with _PAGE_SIZE + 1, and so on.
            start_index = self._PAGE_SIZE * pagenum + 1
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), start_index)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never page past what the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested count; slicing is a
        # no-op when fewer than n ids were collected.
        videos = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids[:n]
        ]
        return self.playlist_result(videos, query)