X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2Fextractor%2Fcommon.py;h=65520744799013fbaa756d171cb10bff8a3e693e;hb=06e4874c997fb523bd9d8e675d1ec0e69ab501ed;hp=52523d7b249210eed88d334c2fefe28cabe78706;hpb=36e6f62cd0883f0f486d1666d010e5d9e6d515bd;p=youtube-dl.git diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 52523d7b2..655207447 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -18,8 +18,6 @@ from ..compat import ( compat_http_client, compat_urllib_error, compat_urllib_parse, - compat_urllib_parse_urlparse, - compat_urllib_request, compat_urlparse, compat_str, compat_etree_fromstring, @@ -31,17 +29,20 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + error_to_compat_str, ExtractorError, fix_xml_ampersands, float_or_none, int_or_none, RegexNotFoundError, sanitize_filename, + sanitized_Request, unescapeHTML, unified_strdate, url_basename, xpath_text, xpath_with_ns, + determine_protocol, ) @@ -167,7 +168,7 @@ class InfoExtractor(object): "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions - duration: Length of the video in seconds, as an integer. + duration: Length of the video in seconds, as an integer or float. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video @@ -310,11 +311,11 @@ class InfoExtractor(object): @classmethod def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" - return cls.__name__[:-2] + return compat_str(cls.__name__[:-2]) @property def IE_NAME(self): - return type(self).__name__[:-2] + return compat_str(type(self).__name__[:-2]) def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ @@ -332,7 +333,8 @@ class InfoExtractor(object): return False if errnote is None: errnote = 'Unable to download webpage' - errmsg = '%s: %s' % (errnote, compat_str(err)) + + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: @@ -622,7 +624,7 @@ class InfoExtractor(object): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) return (username, password) @@ -776,14 +778,12 @@ class InfoExtractor(object): preference = f.get('preference') if preference is None: - proto = f.get('protocol') - if proto is None: - proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme - - preference = 0 if proto in ['http', 'https'] else -0.1 + preference = 0 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported preference -= 0.5 + proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 + if f.get('vcodec') == 'none': # audio only if self._downloader.params.get('prefer_free_formats'): ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] @@ -814,6 +814,7 @@ class InfoExtractor(object): f.get('vbr') if f.get('vbr') is not None else -1, f.get('height') if f.get('height') is not None else -1, f.get('width') if f.get('width') is not None else -1, + proto_preference, ext_preference, f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, @@ -883,7 +884,7 @@ class InfoExtractor(object): fatal=fatal) if manifest is False: - return manifest + return [] formats = [] manifest_version = '1.0' @@ -891,6 +892,11 @@ class InfoExtractor(object): if not media_nodes: manifest_version = '2.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + base_url = xpath_text( + manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': media_url = media_el.attrib.get('href') or media_el.attrib.get('url') @@ -898,16 +904,14 @@ class InfoExtractor(object): continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) + else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': - f4m_formats = self._extract_f4m_formats( - manifest_url, video_id, preference, f4m_id, fatal=fatal) - if f4m_formats: - formats.extend(f4m_formats) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, preference, f4m_id, fatal=fatal)) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -943,13 +947,14 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc, urlh = self._download_webpage_handle( + res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) - if m3u8_doc is False: - return m3u8_doc + if res is False: + return [] + m3u8_doc, urlh = res m3u8_url = urlh.geturl() last_info = None last_media = None @@ -1140,10 +1145,8 @@ class InfoExtractor(object): src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + formats.extend(self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)) continue if src_ext == 'f4m': @@ -1155,9 +1158,7 @@ class InfoExtractor(object): } f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse.urlencode(f4m_params) - f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) - if f4m_formats: - formats.extend(f4m_formats) + formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) continue if src_url.startswith('http') and self._is_valid_url(src, video_id): @@ -1279,7 +1280,7 @@ class InfoExtractor(object): def _get_cookies(self, url): """ Return a compat_cookies.SimpleCookie with the cookies for the url """ - req = compat_urllib_request.Request(url) + req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) return compat_cookies.SimpleCookie(req.get_header('Cookie'))