X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=f507400cc34facb450015bd7254d4f98b85487a5;hb=391256dc0ee5e6ac8d3022455de207b8b02a5b10;hp=78f053f1829406e721bded4680c22d372085d709;hpb=eca1f0d115e6a2712ff0d5f6b25e3ded5e52db71;p=youtube-dl.git diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78f053f18..f507400cc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -52,6 +52,7 @@ from ..utils import ( GeoUtils, int_or_none, js_to_json, + JSON_LD_RE, mimetype2ext, orderedSet, parse_codecs, @@ -68,6 +69,7 @@ from ..utils import ( update_url_query, urljoin, url_basename, + url_or_none, xpath_element, xpath_text, xpath_with_ns, @@ -210,6 +212,11 @@ class InfoExtractor(object): If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. + channel: Full name of the channel the video is uploaded on. + Note that channel fields may or may not repeat uploader + fields. This depends on a particular extractor. + channel_id: Id of the channel. + channel_url: Full URL to a channel webpage. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -599,6 +606,11 @@ class InfoExtractor(object): except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if isinstance(err, compat_urllib_error.HTTPError): if self.__can_accept_status_code(err, expected_status): + # Retain reference to error to prevent file object from + # being closed before it can be read. Works around the + # effects of + # introduced in Python 3.4.1. + err.fp._error = err return err.fp if errnote is False: @@ -1149,8 +1161,7 @@ class InfoExtractor(object): def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( - r'(?s)]+type=(["\'])application/ld\+json\1[^>]*>(?P.+?)', - html, 'JSON-LD', group='json_ld', **kwargs) + JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) if not json_ld: return default if default is not NO_DEFAULT else {} @@ -1208,10 +1219,10 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' info.update({ - 'url': e.get('contentUrl'), + 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), + 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), 'filesize': float_or_none(e.get('contentSize')), @@ -1239,6 +1250,13 @@ class InfoExtractor(object): part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Movie': + info.update({ + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('dateCreated')), + }) elif item_type in ('Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), @@ -1701,9 +1719,9 @@ class InfoExtractor(object): # However, this is not always respected, for example, [2] # contains EXT-X-STREAM-INF tag which references AUDIO # rendition group but does not have CODECS and despite - # referencing audio group an audio group, it represents - # a complete (with audio and video) format. So, for such cases - # we will ignore references to rendition groups and treat them + # referencing an audio group it represents a complete + # (with audio and video) format. So, for such cases we will + # ignore references to rendition groups and treat them # as complete formats. if audio_group_id and codecs and f.get('vcodec') != 'none': audio_group = groups.get(audio_group_id) @@ -1859,9 +1877,7 @@ class InfoExtractor(object): 'height': height, }) formats.extend(m3u8_formats) - continue - - if src_ext == 'f4m': + elif src_ext == 'f4m': f4m_url = src_url if not f4m_params: f4m_params = { @@ -1871,9 +1887,13 @@ class InfoExtractor(object): f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) - continue - - if src_url.startswith('http') and self._is_valid_url(src, video_id): + elif src_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src_url, video_id, mpd_id='dash', fatal=False)) + elif re.search(r'\.ism/[Mm]anifest', src_url): + formats.extend(self._extract_ism_formats( + src_url, video_id, ism_id='mss', fatal=False)) + elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1884,7 +1904,6 @@ class InfoExtractor(object): 'width': width, 'height': height, }) - continue return formats