[options] Include custom conf in final argv (closes #11741)

[youtube-dl.git] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 6ae946569095a4e6e82a8ff5f04580aa3229ee3a..dce8c7d0d5ad389aa84bf84f25731a2e680e91e3 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -59,6 +59,7 @@ from ..utils import (
      parse_m3u8_attributes,
      extract_attributes,
      parse_codecs,
+    urljoin,
  )
  
  
@@ -188,9 +189,10 @@ class InfoExtractor(object):
      uploader_url:   Full URL to a personal webpage of the video uploader.
      location:       Physical location where the video was filmed.
      subtitles:      The available subtitles as a dictionary in the format
-                    {language: subformats}. "subformats" is a list sorted from
-                    lower to higher preference, each element is a dictionary
-                    with the "ext" entry and one of:
+                    {tag: subformats}. "tag" is usually a language code, and
+                    "subformats" is a list sorted from lower to higher
+                    preference, each element is a dictionary with the "ext"
+                    entry and one of:
                          * "data": The subtitles file contents
                          * "url": A URL pointing to the subtitles file
                      "ext" will be calculated from URL if missing
@@ -1224,7 +1226,7 @@ class InfoExtractor(object):
                  'protocol': entry_protocol,
                  'preference': preference,
              }]
-        audio_groups = set()
+        audio_in_video_stream = {}
          last_info = {}
          last_media = {}
          for line in m3u8_doc.splitlines():
@@ -1234,10 +1236,11 @@ class InfoExtractor(object):
                  media = parse_m3u8_attributes(line)
                  media_type = media.get('TYPE')
                  if media_type in ('VIDEO', 'AUDIO'):
+                    group_id = media.get('GROUP-ID')
                      media_url = media.get('URI')
                      if media_url:
                          format_id = []
-                        for v in (media.get('GROUP-ID'), media.get('NAME')):
+                        for v in (group_id, media.get('NAME')):
                              if v:
                                  format_id.append(v)
                          f = {
@@ -1250,12 +1253,15 @@ class InfoExtractor(object):
                          }
                          if media_type == 'AUDIO':
                              f['vcodec'] = 'none'
-                            audio_groups.add(media['GROUP-ID'])
+                            if group_id and not audio_in_video_stream.get(group_id):
+                                audio_in_video_stream[group_id] = False
                          formats.append(f)
                      else:
                          # When there is no URI in EXT-X-MEDIA let this tag's
                          # data be used by regular URI lines below
                          last_media = media
+                        if media_type == 'AUDIO' and group_id:
+                            audio_in_video_stream[group_id] = True
              elif line.startswith('#') or not line.strip():
                  continue
              else:
@@ -1299,7 +1305,7 @@ class InfoExtractor(object):
                          'abr': abr,
                      })
                  f.update(parse_codecs(last_info.get('CODECS')))
-                if last_info.get('AUDIO') in audio_groups:
+                if audio_in_video_stream.get(last_info.get('AUDIO')) is False:
                      # TODO: update acodec for audio only formats with the same GROUP-ID
                      f['acodec'] = 'none'
                  formats.append(f)
@@ -1631,11 +1637,6 @@ class InfoExtractor(object):
                          extract_Initialization(segment_template)
              return ms_info
  
-        def combine_url(base_url, target_url):
-            if re.match(r'^https?://', target_url):
-                return target_url
-            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
-
          mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
          formats = []
          for period in mpd_doc.findall(_add_ns('Period')):
@@ -1685,12 +1686,11 @@ class InfoExtractor(object):
                              'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                              'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                              'fps': int_or_none(representation_attrib.get('frameRate')),
-                            'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
-                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                              'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                              'format_note': 'DASH %s' % content_type,
                              'filesize': filesize,
                          }
+                        f.update(parse_codecs(representation_attrib.get('codecs')))
                          representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                          if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
  
@@ -1774,7 +1774,7 @@ class InfoExtractor(object):
                                  f['fragments'].append({'url': initialization_url})
                              f['fragments'].extend(representation_ms_info['fragments'])
                              for fragment in f['fragments']:
-                                fragment['url'] = combine_url(base_url, fragment['url'])
+                                fragment['url'] = urljoin(base_url, fragment['url'])
                          try:
                              existing_format = next(
                                  fo for fo in formats
@@ -1888,7 +1888,7 @@ class InfoExtractor(object):
                  })
          return formats
  
-    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
+    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
          def absolute_url(video_url):
              return compat_urlparse.urljoin(base_url, video_url)
  
@@ -1905,11 +1905,16 @@ class InfoExtractor(object):
  
          def _media_formats(src, cur_media_type):
              full_url = absolute_url(src)
-            if determine_ext(full_url) == 'm3u8':
+            ext = determine_ext(full_url)
+            if ext == 'm3u8':
                  is_plain_url = False
                  formats = self._extract_m3u8_formats(
                      full_url, video_id, ext='mp4',
                      entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
+            elif ext == 'mpd':
+                is_plain_url = False
+                formats = self._extract_mpd_formats(
+                    full_url, video_id, mpd_id=mpd_id)
              else:
                  is_plain_url = True
                  formats = [{
@@ -1962,10 +1967,13 @@ class InfoExtractor(object):
                  entries.append(media_info)
          return entries
  
-    def _extract_akamai_formats(self, manifest_url, video_id):
+    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
          formats = []
          hdcore_sign = 'hdcore=3.7.0'
-        f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+        hds_host = hosts.get('hds')
+        if hds_host:
+            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
          if 'hdcore=' not in f4m_url:
              f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
          f4m_formats = self._extract_f4m_formats(
@@ -1973,7 +1981,10 @@ class InfoExtractor(object):
          for entry in f4m_formats:
              entry.update({'extra_param_to_segment_url': hdcore_sign})
          formats.extend(f4m_formats)
-        m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+        hls_host = hosts.get('hls')
+        if hls_host:
+            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
          formats.extend(self._extract_m3u8_formats(
              m3u8_url, video_id, 'mp4', 'm3u8_native',
              m3u8_id='hls', fatal=False))