X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=0cbb97aae9727a0ca0cf2e149d54b5bf66371769;hb=edd6074ceac8b879382f9000cf8883b5e357776e;hp=9627816b41b76a5531382a89efa0bbc034c4a802;hpb=2133565cec3646680600d314b93e535f6fa52339;p=youtube-dl.git diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9627816b4..0cbb97aae 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,6 +21,7 @@ from ..compat import ( compat_os_name, compat_str, compat_urllib_error, + compat_urllib_parse_unquote, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, @@ -87,6 +88,9 @@ class InfoExtractor(object): Potential fields: * url Mandatory. The URL of the video file + * manifest_url + The URL of the manifest file in case of + fragmented media (DASH, hls, hds) * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -115,6 +119,11 @@ class InfoExtractor(object): download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", "m3u8", "m3u8_native" or "http_dash_segments". + * fragments A list of fragments of the fragmented media, + with the following entries: + * "url" (mandatory) - fragment's URL + * "duration" (optional, int or float) + * "filesize" (optional, int) * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. @@ -226,7 +235,7 @@ class InfoExtractor(object): chapter_id: Id of the chapter the video belongs to, as a unicode string. The following fields should only be used when the video is an episode of some - series or programme: + series, programme or podcast: series: Title of the series or programme the video episode belongs to. season: Title of the season the video episode belongs to. @@ -674,16 +683,21 @@ class InfoExtractor(object): username = info[0] password = info[2] else: - raise netrc.NetrcParseError('No authenticators for %s' % netrc_machine) + raise netrc.NetrcParseError( + 'No authenticators for %s' % netrc_machine) except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + self._downloader.report_warning( + 'parsing .netrc: %s' % error_to_compat_str(err)) - return (username, password) + return username, password def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): """ Get the login info as (username, password) - It will look in the netrc file using the _NETRC_MACHINE value + First look for the manually specified credentials using username_option + and password_option as keys in params dictionary. If no such credentials + available look in the netrc file using the netrc_machine or _NETRC_MACHINE + value. If there's no info available, return (None, None) """ if self._downloader is None: @@ -1086,6 +1100,13 @@ class InfoExtractor(object): manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], 'bootstrap info', default=None) + vcodec = None + mime_type = xpath_text( + manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'], + 'base URL', default=None) + if mime_type and mime_type.startswith('audio/'): + vcodec = 'none' + for i, media_el in enumerate(media_nodes): tbr = int_or_none(media_el.attrib.get('bitrate')) width = int_or_none(media_el.attrib.get('width')) @@ -1126,6 +1147,7 @@ class InfoExtractor(object): 'width': f.get('width') or width, 'height': f.get('height') or height, 'format_id': f.get('format_id') if not tbr else format_id, + 'vcodec': vcodec, }) formats.extend(f4m_formats) continue @@ -1137,10 +1159,12 @@ class InfoExtractor(object): formats.append({ 'format_id': format_id, 'url': manifest_url, + 'manifest_url': manifest_url, 'ext': 'flv' if bootstrap_info is not None else None, 'tbr': tbr, 'width': width, 'height': height, + 'vcodec': vcodec, 'preference': preference, }) return formats @@ -1242,9 +1266,11 @@ class InfoExtractor(object): # format_id intact. if not live: format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) + manifest_url = format_url(line.strip()) f = { 'format_id': '-'.join(format_id), - 'url': format_url(line.strip()), + 'url': manifest_url, + 'manifest_url': manifest_url, 'tbr': tbr, 'ext': ext, 'fps': float_or_none(last_info.get('FRAME-RATE')), @@ -1516,9 +1542,10 @@ class InfoExtractor(object): mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() return self._parse_mpd_formats( - compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) + compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, + formats_dict=formats_dict, mpd_url=mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): """ Parse formats from MPD manifest. References: @@ -1539,42 +1566,52 @@ class InfoExtractor(object): def extract_multisegment_info(element, ms_parent_info): ms_info = ms_parent_info.copy() + + # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some + # common attributes and elements. We will only extract relevant + # for us. + def extract_common(source): + segment_timeline = source.find(_add_ns('SegmentTimeline')) + if segment_timeline is not None: + s_e = segment_timeline.findall(_add_ns('S')) + if s_e: + ms_info['total_number'] = 0 + ms_info['s'] = [] + for s in s_e: + r = int(s.get('r', 0)) + ms_info['total_number'] += 1 + r + ms_info['s'].append({ + 't': int(s.get('t', 0)), + # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) + 'd': int(s.attrib['d']), + 'r': r, + }) + start_number = source.get('startNumber') + if start_number: + ms_info['start_number'] = int(start_number) + timescale = source.get('timescale') + if timescale: + ms_info['timescale'] = int(timescale) + segment_duration = source.get('duration') + if segment_duration: + ms_info['segment_duration'] = int(segment_duration) + + def extract_Initialization(source): + initialization = source.find(_add_ns('Initialization')) + if initialization is not None: + ms_info['initialization_url'] = initialization.attrib['sourceURL'] + segment_list = element.find(_add_ns('SegmentList')) if segment_list is not None: + extract_common(segment_list) + extract_Initialization(segment_list) segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) if segment_urls_e: ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] - initialization = segment_list.find(_add_ns('Initialization')) - if initialization is not None: - ms_info['initialization_url'] = initialization.attrib['sourceURL'] else: segment_template = element.find(_add_ns('SegmentTemplate')) if segment_template is not None: - start_number = segment_template.get('startNumber') - if start_number: - ms_info['start_number'] = int(start_number) - segment_timeline = segment_template.find(_add_ns('SegmentTimeline')) - if segment_timeline is not None: - s_e = segment_timeline.findall(_add_ns('S')) - if s_e: - ms_info['total_number'] = 0 - ms_info['s'] = [] - for s in s_e: - r = int(s.get('r', 0)) - ms_info['total_number'] += 1 + r - ms_info['s'].append({ - 't': int(s.get('t', 0)), - # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) - 'd': int(s.attrib['d']), - 'r': r, - }) - else: - timescale = segment_template.get('timescale') - if timescale: - ms_info['timescale'] = int(timescale) - segment_duration = segment_template.get('duration') - if segment_duration: - ms_info['segment_duration'] = int(segment_duration) + extract_common(segment_template) media_template = segment_template.get('media') if media_template: ms_info['media_template'] = media_template @@ -1582,11 +1619,14 @@ class InfoExtractor(object): if initialization: ms_info['initialization_url'] = initialization else: - initialization = segment_template.find(_add_ns('Initialization')) - if initialization is not None: - ms_info['initialization_url'] = initialization.attrib['sourceURL'] + extract_Initialization(segment_template) return ms_info + def combine_url(base_url, target_url): + if re.match(r'^https?://', target_url): + return target_url + return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) + mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats = [] for period in mpd_doc.findall(_add_ns('Period')): @@ -1629,6 +1669,7 @@ class InfoExtractor(object): f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, + 'manifest_url': mpd_url, 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), @@ -1643,9 +1684,7 @@ class InfoExtractor(object): } representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: - if 'total_number' not in representation_ms_info and 'segment_duration': - segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) + media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) @@ -1654,46 +1693,79 @@ class InfoExtractor(object): # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # can't be used at the same time - if '%(Number' in media_template: - representation_ms_info['segment_urls'] = [ - media_template % { + if '%(Number' in media_template and 's' not in representation_ms_info: + segment_duration = None + if 'total_number' not in representation_ms_info and 'segment_duration': + segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) + representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) + representation_ms_info['fragments'] = [{ + 'url': media_template % { 'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth'), - } - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] + }, + 'duration': segment_duration, + } for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] else: - representation_ms_info['segment_urls'] = [] + # $Number*$ or $Time$ in media template with S list available + # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg + # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 + representation_ms_info['fragments'] = [] segment_time = 0 + segment_d = None + segment_number = representation_ms_info['start_number'] def add_segment_url(): - representation_ms_info['segment_urls'].append( - media_template % { - 'Time': segment_time, - 'Bandwidth': representation_attrib.get('bandwidth'), - } - ) + segment_url = media_template % { + 'Time': segment_time, + 'Bandwidth': representation_attrib.get('bandwidth'), + 'Number': segment_number, + } + representation_ms_info['fragments'].append({ + 'url': segment_url, + 'duration': float_or_none(segment_d, representation_ms_info['timescale']), + }) for num, s in enumerate(representation_ms_info['s']): segment_time = s.get('t') or segment_time + segment_d = s['d'] add_segment_url() + segment_number += 1 for r in range(s.get('r', 0)): - segment_time += s['d'] + segment_time += segment_d add_segment_url() - segment_time += s['d'] - if 'segment_urls' in representation_ms_info: + segment_number += 1 + segment_time += segment_d + elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: + # No media template + # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI + # or any YouTube dashsegments video + fragments = [] + s_num = 0 + for segment_url in representation_ms_info['segment_urls']: + s = representation_ms_info['s'][s_num] + for r in range(s.get('r', 0) + 1): + fragments.append({ + 'url': segment_url, + 'duration': float_or_none(s['d'], representation_ms_info['timescale']), + }) + representation_ms_info['fragments'] = fragments + # NB: MPD manifest may contain direct URLs to unfragmented media. + # No fragments key is present in this case. + if 'fragments' in representation_ms_info: f.update({ - 'segment_urls': representation_ms_info['segment_urls'], + 'fragments': [], 'protocol': 'http_dash_segments', }) if 'initialization_url' in representation_ms_info: initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) - f.update({ - 'initialization_url': initialization_url, - }) if not f.get('url'): f['url'] = initialization_url + f['fragments'].append({'url': initialization_url}) + f['fragments'].extend(representation_ms_info['fragments']) + for fragment in f['fragments']: + fragment['url'] = combine_url(base_url, fragment['url']) try: existing_format = next( fo for fo in formats @@ -1739,7 +1811,11 @@ class InfoExtractor(object): return is_plain_url, formats entries = [] - for media_tag, media_type, media_content in re.findall(r'(?s)(<(?Pvideo|audio)[^>]*>)(.*?)', webpage): + media_tags = [(media_tag, media_type, '') + for media_tag, media_type + in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] + media_tags.extend(re.findall(r'(?s)(<(?Pvideo|audio)[^>]*>)(.*?)', webpage)) + for media_tag, media_type, media_content in media_tags: media_info = { 'formats': [], 'subtitles': {}, @@ -1766,7 +1842,7 @@ class InfoExtractor(object): for track_tag in re.findall(r']+>', media_content): track_attributes = extract_attributes(track_tag) kind = track_attributes.get('kind') - if not kind or kind == 'subtitles': + if not kind or kind in ('subtitles', 'captions'): src = track_attributes.get('src') if not src: continue @@ -1774,22 +1850,70 @@ class InfoExtractor(object): media_info['subtitles'].setdefault(lang, []).append({ 'url': absolute_url(src), }) - if media_info['formats']: + if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries def _extract_akamai_formats(self, manifest_url, video_id): formats = [] + hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') - formats.extend(self._extract_f4m_formats( - update_url_query(f4m_url, {'hdcore': '3.7.0'}), - video_id, f4m_id='hds', fatal=False)) + if 'hdcore=' not in f4m_url: + f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign + f4m_formats = self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False) + for entry in f4m_formats: + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.extend(f4m_formats) m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) return formats + def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): + url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) + url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url') + http_base_url = 'http' + url_base + formats = [] + if 'm3u8' not in skip_protocols: + formats.extend(self._extract_m3u8_formats( + http_base_url + '/playlist.m3u8', video_id, 'mp4', + m3u8_entry_protocol, m3u8_id='hls', fatal=False)) + if 'f4m' not in skip_protocols: + formats.extend(self._extract_f4m_formats( + http_base_url + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) + if re.search(r'(?:/smil:|\.smil)', url_base): + if 'dash' not in skip_protocols: + formats.extend(self._extract_mpd_formats( + http_base_url + '/manifest.mpd', + video_id, mpd_id='dash', fatal=False)) + if 'smil' not in skip_protocols: + rtmp_formats = self._extract_smil_formats( + http_base_url + '/jwplayer.smil', + video_id, fatal=False) + for rtmp_format in rtmp_formats: + rtsp_format = rtmp_format.copy() + rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([rtmp_format, rtsp_format]) + else: + for protocol in ('rtmp', 'rtsp'): + if protocol not in skip_protocols: + formats.append({ + 'url': protocol + url_base, + 'format_id': protocol, + 'protocol': protocol, + }) + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -1910,6 +2034,12 @@ class InfoExtractor(object): headers['Ytdl-request-proxy'] = geo_verification_proxy return headers + def _generic_id(self, url): + return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + + def _generic_title(self, url): + return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + class SearchInfoExtractor(InfoExtractor): """