[mtv] Prepare for #980

[youtube-dl.git] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index a2986cebe5db15d54fbbdf90e87669982ce08aec..69cdcdc1b5b75d1cac5733b34565f087c9dcddec 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -35,6 +35,8 @@ class InfoExtractor(object):
      title:          Video title, unescaped.
      ext:            Video filename extension.
  
+    Instead of url and ext, formats can also be specified.
+
      The following fields are optional:
  
      format:         The video format, defaults to ext (used for --get-format)
@@ -52,8 +54,19 @@ class InfoExtractor(object):
      view_count:     How many users have watched the video on the platform.
      urlhandle:      [internal] The urlHandle to be used to download the file,
                      like returned by urllib.request.urlopen
-
-    The fields should all be Unicode strings.
+    formats:        A list of dictionaries, one per available format; it must
+                    be ordered from worst to best quality. Potential fields:
+                    * url       Mandatory. The URL of the video file
+                    * ext       Will be calculated from url if missing
+                    * format    A human-readable description of the format
+                                ("mp4 container with h264/opus").
+                                Calculated from width and height if missing.
+                    * format_id A short description of the format
+                                ("mp4_h264_opus" or "19")
+                    * width     Width of the video, if known
+                    * height    Height of the video, if known
+
+    Unless mentioned otherwise, the fields should be Unicode strings.
  
      Subclasses of this one should re-define the _real_initialize() and
      _real_extract() methods and define a _VALID_URL regexp.
@@ -150,7 +163,7 @@ class InfoExtractor(object):
          if m:
              encoding = m.group(1)
          else:
-            m = re.search(br'<meta[^>]+charset="?([^"]+)[ /">]',
+            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                            webpage_bytes[:1024])
              if m:
                  encoding = m.group(1).decode('ascii')