[YoutubeDL] Always set the '_filename' field in the info_dict (reported in #4053)

[youtube-dl.git] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index cd155a0901b6a50189d064da26b74951a41b1e18..4782326826670ba7d7f24abcf46970cf2f821a98 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -14,6 +14,7 @@ import xml.etree.ElementTree
  
  from ..compat import (
      compat_cookiejar,
+    compat_HTTPError,
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse_urlparse,
@@ -26,6 +27,7 @@ from ..utils import (
      compiled_regex_type,
      ExtractorError,
      float_or_none,
+    HEADRequest,
      int_or_none,
      RegexNotFoundError,
      sanitize_filename,
@@ -108,12 +110,17 @@ class InfoExtractor(object):
                                    (quality takes higher priority)
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
-                    * http_referer  HTTP Referer header value to set.
                      * http_method  HTTP method to use for the download.
                      * http_headers  A dictionary of additional HTTP headers
                                   to add to the request.
                      * http_post_data  Additional data to send with a POST
                                   request.
+                    * stretched_ratio  If given and not 1, indicates that the
+                                 video's pixels are not square.
+                                 width : height ratio as float.
+                    * no_resume  The server does not support resuming the
+                                 (HTTP or RTMP) download. Boolean.
+
      url:            Final video URL.
      ext:            Video filename extension.
      format:         The video format, defaults to ext (used for --get-format)
@@ -127,7 +134,9 @@ class InfoExtractor(object):
                      something like "4234987", title "Dancing naked mole rats",
                      and display_id "dancing-naked-mole-rats"
      thumbnails:     A list of dictionaries, with the following entries:
+                        * "id" (optional, string) - Thumbnail format ID
                          * "url"
+                        * "preference" (optional, int) - quality of the image
                          * "width" (optional, int)
                          * "height" (optional, int)
                          * "resolution" (optional, string "{width}x{height}",
@@ -709,6 +718,27 @@ class InfoExtractor(object):
              )
          formats.sort(key=_formats_key)
  
+    def _check_formats(self, formats, video_id):
+        """
+        Filter formats in place, keeping only the entries for which
+        _is_valid_url() returns a true value for the entry's 'url'.
+
+        A falsy formats (e.g. None or an empty list) is left untouched.
+        Nothing is returned; the list object passed in is updated through
+        slice assignment, so the caller sees the filtered result.
+        """
+        if formats:
+            formats[:] = filter(
+                lambda f: self._is_valid_url(
+                    f['url'], video_id,
+                    # Label passed on for the status/warning messages:
+                    # '<format_id> video format' when the entry has a
+                    # format_id, plain 'video' otherwise.
+                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
+                formats)
+
+    def _is_valid_url(self, url, video_id, item='video'):
+        """
+        Return True if a HEADRequest for url goes through
+        _request_webpage() without error, False if it fails with an
+        HTTP error.
+
+        item is a human-readable label interpolated into the
+        'Checking %s URL' note and into the warning emitted on failure.
+
+        An ExtractorError whose cause is not a compat_HTTPError is
+        re-raised unchanged.
+        """
+        try:
+            self._request_webpage(
+                HEADRequest(url), video_id,
+                'Checking %s URL' % item)
+            return True
+        except ExtractorError as e:
+            # Only an HTTP-level failure marks the URL as invalid; any other
+            # cause is propagated to the caller.
+            if isinstance(e.cause, compat_HTTPError):
+                self.report_warning(
+                    '%s URL is invalid, skipping' % item, video_id)
+                return False
+            raise
+
      def http_scheme(self):
          """ Either "http:" or "https:", depending on the user's preferences """
          return (
@@ -739,8 +769,14 @@ class InfoExtractor(object):
              'Unable to download f4m manifest')
  
          formats = []
+        manifest_version = '1.0'
          media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
+        if not media_nodes:
+            manifest_version = '2.0'
+            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
          for i, media_el in enumerate(media_nodes):
+            if manifest_version == '2.0':
+                manifest_url = '/'.join(manifest_url.split('/')[:-1]) + '/' + media_el.attrib.get('href')
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              format_id = 'f4m-%d' % (i if tbr is None else tbr)
              formats.append({