X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2Fextractor%2Fyoutube.py;h=54592d174b1b27cbeb55bae783b9549e0ab37dd3;hb=8e93b9b9aae0e31f053eaa8c5cbfebfbab2dfbab;hp=9424d5e2669a72e791a0ba0a0120de0bfec27fc8;hpb=d2fee313ec71ad8a4d8b57ec9f433210ee01056b;p=youtube-dl.git diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9424d5e26..54592d174 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,9 @@ from ..utils import ( get_element_by_id, get_element_by_attribute, ExtractorError, + int_or_none, + PagedList, + RegexNotFoundError, unescapeHTML, unified_strdate, orderedSet, @@ -37,7 +40,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -108,7 +111,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'next_url': '/', 'action_confirm': 'Confirm', } - req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) + req = compat_urllib_request.Request(self._AGE_URL, + compat_urllib_parse.urlencode(age_form).encode('ascii')) self._download_webpage( req, None, @@ -131,6 +135,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): ( (?:https?://|//)? # http(s):// or protocol-independent URL (optional) (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| + (?:www\.)?deturl\.com/www\.youtube\.com/| + (?:www\.)?pwnyoutube\.com| tube\.majestyc\.net/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls @@ -202,6 +208,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40}, '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40}, '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, @@ -213,6 +225,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50}, '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, } IE_NAME = u'youtube' @@ -264,6 +279,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader_id": u"setindia" } }, + { + u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I", + u"file": u"a9LDPn-MO4I.m4a", + u"note": u"256k DASH audio (format 141) via DASH manifest", + u"info_dict": { + u"upload_date": "20121002", + u"uploader_id": "8KVIDEO", + u"description": "No description available.", + u"uploader": "8KVIDEO", + u"title": "UHDTV TEST 8K VIDEO.mp4" + }, + u"params": { + u"youtube_include_dash_manifest": True, + u"format": "141", + }, + }, ] @@ -984,7 +1015,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _get_available_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( - 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, + 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) @@ -998,9 +1029,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'lang': lang, 'v': video_id, 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), - 'name': l[0].encode('utf-8'), + 'name': unescapeHTML(l[0]).encode('utf-8'), }) - url = u'http://www.youtube.com/api/timedtext?' + params + url = u'https://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url if not sub_lang_list: self._downloader.report_warning(u'video doesn\'t have subtitles') @@ -1061,18 +1092,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_id = mobj.group(2) return video_id - def _get_video_url_list(self, url_map): - """ - Transform a dictionary in the format {itag:url} to a list of (itag, url) - with the requested formats. - """ - existing_formats = [x for x in self._formats if x in url_map] - if len(existing_formats) == 0: - raise ExtractorError(u'no known formats available for video') - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - video_url_list.reverse() # order worst to best - return video_url_list - def _extract_from_m3u8(self, manifest_url, video_id): url_map = {} def _get_urls(_manifest): @@ -1246,7 +1265,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_annotations = self._extract_annotations(video_id) # Decide which formats to download - try: mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) if not mobj: @@ -1271,9 +1289,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): except ValueError: pass + def _map_to_format_list(urlmap): + formats = [] + for itag, video_real_url in urlmap.items(): + dct = { + 'format_id': itag, + 'url': video_real_url, + 'player_url': player_url, + } + if itag in self._formats: + dct.update(self._formats[itag]) + formats.append(dct) + return formats + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() - video_url_list = [(None, video_info['conn'][0])] + formats = [{ + 'format_id': '_rtmp', + 'protocol': 'rtmp', + 'url': video_info['conn'][0], + 'player_url': player_url, + }] elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0] if 'rtmpe%3Dyes' in encoded_url_map: @@ -1318,23 +1354,50 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if 'ratebypass' not in url: url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url - video_url_list = self._get_video_url_list(url_map) + formats = _map_to_format_list(url_map) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) - video_url_list = self._get_video_url_list(url_map) + formats = _map_to_format_list(url_map) else: raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') - formats = [] - for itag, video_real_url in video_url_list: - dct = { - 'format_id': itag, - 'url': video_real_url, - 'player_url': player_url, - } - dct.update(self._formats[itag]) - formats.append(dct) + # Look for the DASH manifest + dash_manifest_url_lst = video_info.get('dashmpd') + if (dash_manifest_url_lst and dash_manifest_url_lst[0] and + self._downloader.params.get('youtube_include_dash_manifest', False)): + try: + dash_doc = self._download_xml( + dash_manifest_url_lst[0], video_id, + note=u'Downloading DASH manifest', + errnote=u'Could not download DASH manifest') + for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): + url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') + if url_el is None: + continue + format_id = r.attrib['id'] + video_url = url_el.text + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) + f = { + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(r.attrib.get('width')), + 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), + 'asr': int_or_none(r.attrib.get('audioSamplingRate')), + 'filesize': filesize, + } + try: + existing_format = next( + fo for fo in formats + if fo['format_id'] == format_id) + except StopIteration: + f.update(self._formats.get(format_id, {})) + formats.append(f) + else: + existing_format.update(f) + + except (ExtractorError, KeyError) as e: + self.report_warning(u'Skipping DASH manifest: %s' % e, video_id) self._sort_formats(formats) @@ -1443,7 +1506,14 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): if re.search(self._MORE_PAGES_INDICATOR, page) is None: break - playlist_title = self._og_search_title(page) + try: + playlist_title = self._og_search_title(page) + except RegexNotFoundError: + self.report_warning( + u'Playlist page is missing OpenGraph title, falling back ...', + playlist_id) + playlist_title = self._html_search_regex( + r'