[extractor/common] Add support for video of WebPage context in _json_ld (closes ...

[youtube-dl.git] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index cdfa7000b729d50a2fb7b9de6e1bc4942dc6f057..61d97ab72e93b39569d6e6febe9789cf001e5c85 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,3 +1,4 @@
+# coding: utf-8
  from __future__ import unicode_literals
  
  import base64
@@ -975,6 +976,22 @@ class InfoExtractor(object):
              return info
          if isinstance(json_ld, dict):
              json_ld = [json_ld]
+
+        def extract_video_object(e):
+            assert e['@type'] == 'VideoObject'
+            info.update({
+                'url': e.get('contentUrl'),
+                'title': unescapeHTML(e.get('name')),
+                'description': unescapeHTML(e.get('description')),
+                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
+                'duration': parse_duration(e.get('duration')),
+                'timestamp': unified_timestamp(e.get('uploadDate')),
+                'filesize': float_or_none(e.get('contentSize')),
+                'tbr': int_or_none(e.get('bitrate')),
+                'width': int_or_none(e.get('width')),
+                'height': int_or_none(e.get('height')),
+            })
+
          for e in json_ld:
              if e.get('@context') == 'http://schema.org':
                  item_type = e.get('@type')
@@ -999,18 +1016,11 @@ class InfoExtractor(object):
                          'description': unescapeHTML(e.get('articleBody')),
                      })
                  elif item_type == 'VideoObject':
-                    info.update({
-                        'url': e.get('contentUrl'),
-                        'title': unescapeHTML(e.get('name')),
-                        'description': unescapeHTML(e.get('description')),
-                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
-                        'duration': parse_duration(e.get('duration')),
-                        'timestamp': unified_timestamp(e.get('uploadDate')),
-                        'filesize': float_or_none(e.get('contentSize')),
-                        'tbr': int_or_none(e.get('bitrate')),
-                        'width': int_or_none(e.get('width')),
-                        'height': int_or_none(e.get('height')),
-                    })
+                    extract_video_object(e)
+                elif item_type == 'WebPage':
+                    video = e.get('video')
+                    if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+                        extract_video_object(video)
                  break
          return dict((k, v) for k, v in info.items() if v is not None)
  
@@ -1778,7 +1788,7 @@ class InfoExtractor(object):
                      if content_type == 'text':
                          # TODO implement WebVTT downloading
                          pass
-                    elif content_type == 'video' or content_type == 'audio':
+                    elif content_type in ('video', 'audio'):
                          base_url = ''
                          for element in (representation, adaptation_set, period, mpd_doc):
                              base_url_e = element.find(_add_ns('BaseURL'))
@@ -2181,7 +2191,7 @@ class InfoExtractor(object):
  
      def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
          mobj = re.search(
-            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\).*?\.setup\s*\((?P<options>[^)]+)\)',
              webpage)
          if mobj:
              try:
@@ -2257,11 +2267,17 @@ class InfoExtractor(object):
  
      def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                  m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        urls = []
          formats = []
          for source in jwplayer_sources_data:
-            source_url = self._proto_relative_url(source['file'])
+            source_url = self._proto_relative_url(source.get('file'))
+            if not source_url:
+                continue
              if base_url:
                  source_url = compat_urlparse.urljoin(base_url, source_url)
+            if source_url in urls:
+                continue
+            urls.append(source_url)
              source_type = source.get('type') or ''
              ext = mimetype2ext(source_type) or determine_ext(source_url)
              if source_type == 'hls' or ext == 'm3u8':