X-Git-Url: https://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=52f2055b56913c7ff3139a0cdaf301c09d023ee6;hb=9211e3319e6006373d8b5055f7a3d0bbd734c57b;hp=ceba4ca1c415cec0381d9d152dc72a7d6bf255fe;hpb=7b67b60773b70ac74edd3993eeea6fe9b790c664;p=youtube-dl.git diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ceba4ca1c..52f2055b5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -27,6 +27,7 @@ from ..compat import ( compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ) from ..downloader.f4m import remove_encrypted_media from ..utils import ( @@ -646,15 +647,29 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query) if xml_string is False: return xml_string + return self._parse_xml( + xml_string, video_id, transform_source=transform_source, + fatal=fatal) + + def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): if transform_source: xml_string = transform_source(xml_string) - return compat_etree_fromstring(xml_string.encode('utf-8')) + try: + return compat_etree_fromstring(xml_string.encode('utf-8')) + except compat_xml_parse_error as ve: + errmsg = '%s: Failed to parse XML ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', @@ -1386,7 +1401,7 @@ class InfoExtractor(object): media_url = media.get('URI') if media_url: format_id = [] - for v in (group_id, name): + for v in (m3u8_id, group_id, name): if v: format_id.append(v) f = { @@ -1905,7 +1920,7 @@ class InfoExtractor(object): # can't be used at the same time if '%(Number' in media_template and 's' not in representation_ms_info: segment_duration = None - if 'total_number' not in representation_ms_info and 'segment_duration': + if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) representation_ms_info['fragments'] = [{ @@ -2169,6 +2184,12 @@ class InfoExtractor(object): f = parse_content_type(source_attributes.get('type')) is_plain_url, formats = _media_formats(src, media_type, f) if is_plain_url: + # res attribute is not standard but seen several times + # in the wild + f.update({ + 'height': int_or_none(source_attributes.get('res')), + 'format_id': source_attributes.get('label'), + }) f.update(formats[0]) media_info['formats'].append(f) else: @@ -2301,7 +2322,6 @@ class InfoExtractor(object): formats = self._parse_jwplayer_formats( video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) - self._sort_formats(formats) subtitles = {} tracks = video_data.get('tracks') @@ -2318,16 +2338,25 @@ class InfoExtractor(object): 'url': self._proto_relative_url(track_url) }) - entries.append({ + entry = { 'id': this_video_id, - 'title': video_data['title'] if require_title else video_data.get('title'), + 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, - 'formats': formats, - }) + } + # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 + if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): + entry.update({ + '_type': 'url_transparent', + 'url': formats[0]['url'], + }) + else: + self._sort_formats(formats) + entry['formats'] = formats + entries.append(entry) if len(entries) == 1: return entries[0] else: @@ -2428,10 +2457,12 @@ class InfoExtractor(object): self._downloader.report_warning(msg) return res - def _set_cookie(self, domain, name, value, expire_time=None): + def _set_cookie(self, domain, name, value, expire_time=None, port=None, + path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, None, None, domain, None, - None, '/', True, False, expire_time, '', None, None, None) + 0, name, value, port, port is not None, domain, True, + domain.startswith('.'), path, True, secure, expire_time, + discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url):