From 86f4d14f817acaee1f1f544cd9b06d47bc2a5180 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Sep 2016 20:35:22 +0700 Subject: [PATCH] Refactor fragments interface and dash segments downloader - Eliminate segment_urls and initialization_url + Introduce manifest_url (a manifest may contain unfragmented data; in this case url will be used for the direct media URL and manifest_url for the manifest itself) * Rewrite dashsegments downloader to use fragments data * Improve generic mpd extraction --- youtube_dl/downloader/dash.py | 35 +++++++++++---------------------- youtube_dl/extractor/common.py | 31 +++++++++++------------------ youtube_dl/extractor/generic.py | 4 +++- 3 files changed, 26 insertions(+), 44 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 41fc9cfc2..8437dde30 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import os -import re from .fragment import FragmentFD from ..compat import compat_urllib_error @@ -19,34 +18,32 @@ class DashSegmentsFD(FragmentFD): FD_NAME = 'dashsegments' def real_download(self, filename, info_dict): - base_url = info_dict['url'] - segment_urls = [info_dict['segment_urls'][0]] if self.params.get('test', False) else info_dict['segment_urls'] - initialization_url = info_dict.get('initialization_url') + segments = info_dict['fragments'][:1] if self.params.get( + 'test', False) else info_dict['fragments'] ctx = { 'filename': filename, - 'total_frags': len(segment_urls) + (1 if initialization_url else 0), + 'total_frags': len(segments), } self._prepare_and_start_frag_download(ctx) - def combine_url(base_url, target_url): - if re.match(r'^https?://', target_url): - return target_url - return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) - segments_filenames = [] fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments =
self.params.get('skip_unavailable_fragments', True) - def process_segment(segment, tmp_filename, fatal): - target_url, segment_name = segment + def process_segment(segment, tmp_filename, num): + segment_url = segment['url'] + segment_name = 'Frag%d' % num target_filename = '%s-%s' % (tmp_filename, segment_name) + # In DASH, the first segment contains necessary headers to + # generate a valid MP4 file, so always abort for the first segment + fatal = num == 0 or not skip_unavailable_fragments count = 0 while count <= fragment_retries: try: - success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) + success = ctx['dl'].download(target_filename, {'url': segment_url}) if not success: return False down, target_sanitized = sanitize_open(target_filename, 'rb') @@ -72,16 +69,8 @@ class DashSegmentsFD(FragmentFD): return False return True - segments_to_download = [(initialization_url, 'Init')] if initialization_url else [] - segments_to_download.extend([ - (segment_url, 'Seg%d' % i) - for i, segment_url in enumerate(segment_urls)]) - - for i, segment in enumerate(segments_to_download): - # In DASH, the first segment contains necessary headers to - # generate a valid MP4 file, so always abort for the first segment - fatal = i == 0 or not skip_unavailable_fragments - if not process_segment(segment, ctx['tmpfilename'], fatal): + for i, segment in enumerate(segments): + if not process_segment(segment, ctx['tmpfilename'], i): return False self._finish_frag_download(ctx) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e637b33d5..f35311e7a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -86,9 +86,10 @@ class InfoExtractor(object): from worst to best quality. Potential fields: - * url Mandatory. The URL of the video file or URL of - the manifest file in case of fragmented media - (DASH, hls, hds). + * url Mandatory. 
The URL of the video file + * manifest_url + The URL of the manifest file in case of + fragmented media (DASH, hls, hds) * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -1528,9 +1529,10 @@ class InfoExtractor(object): mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() return self._parse_mpd_formats( - compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) + compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, + formats_dict=formats_dict, mpd_url=mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): """ Parse formats from MPD manifest. References: @@ -1654,6 +1656,7 @@ class InfoExtractor(object): f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, + 'manifest_url': mpd_url, 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), @@ -1682,14 +1685,6 @@ class InfoExtractor(object): if 'total_number' not in representation_ms_info and 'segment_duration': segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) - representation_ms_info['segment_urls'] = [ - media_template % { - 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth'), - } - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] representation_ms_info['fragments'] = [{ 'url': media_template % { 'Number': segment_number, @@ -1703,7 +1698,6 @@ class InfoExtractor(object): # $Number*$ or 
$Time$ in media template with S list available # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 - representation_ms_info['segment_urls'] = [] representation_ms_info['fragments'] = [] segment_time = 0 segment_d = None @@ -1715,7 +1709,6 @@ class InfoExtractor(object): 'Bandwidth': representation_attrib.get('bandwidth'), 'Number': segment_number, } - representation_ms_info['segment_urls'].append(segment_url) representation_ms_info['fragments'].append({ 'url': segment_url, 'duration': float_or_none(segment_d, representation_ms_info['timescale']), @@ -1745,17 +1738,15 @@ class InfoExtractor(object): 'duration': float_or_none(s['d'], representation_ms_info['timescale']), }) representation_ms_info['fragments'] = fragments - if 'segment_urls' in representation_ms_info: + # NB: MPD manifest may contain direct URLs to unfragmented media. + # No fragments key is present in this case. 
+ if 'fragments' in representation_ms_info: f.update({ - 'segment_urls': representation_ms_info['segment_urls'], 'fragments': [], 'protocol': 'http_dash_segments', }) if 'initialization_url' in representation_ms_info: initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) - f.update({ - 'initialization_url': initialization_url, - }) if not f.get('url'): f['url'] = initialization_url f['fragments'].append({'url': initialization_url}) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 92a6e5146..c1792c534 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1657,7 +1657,9 @@ class GenericIE(InfoExtractor): return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( - doc, video_id, mpd_base_url=url.rpartition('/')[0]) + doc, video_id, + mpd_base_url=full_response.geturl().rpartition('/')[0], + mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): -- 2.40.0