Merge pull request #7326 from remitamine/clipfish

[youtube-dl.git] / youtube_dl / extractor / theplatform.py
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py

index adaec337579e0bdca194b0b6cf44cefd918edd31..1555aa77cac30c18de3f0c2db9e13ea00cc569f6 100644 (file)
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -2,7 +2,6 @@
  from __future__ import unicode_literals
  
  import re
-import json
  import time
  import hmac
  import binascii
@@ -29,7 +28,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})
  
  
  class ThePlatformBaseIE(InfoExtractor):
-    def _extract_theplatform_smil_formats(self, smil_url, video_id, note='Downloading SMIL data'):
+    def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
          meta = self._download_xml(smil_url, video_id, note=note)
          try:
              error_msg = next(
@@ -55,12 +54,13 @@ class ThePlatformBaseIE(InfoExtractor):
  
          self._sort_formats(formats)
  
-        return formats
+        subtitles = self._parse_smil_subtitles(meta, default_ns)
+
+        return formats, subtitles
  
      def get_metadata(self, path, video_id):
          info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
-        info_json = self._download_webpage(info_url, video_id)
-        info = json.loads(info_json)
+        info = self._download_json(info_url, video_id)
  
          subtitles = {}
          captions = info.get('captions')
@@ -139,6 +139,11 @@ class ThePlatformIE(ThePlatformBaseIE):
              'upload_date': '20150701',
              'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"],
          },
+    }, {
+        # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
+        # geo-restricted (US), HLS encrypted with AES-128
+        'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781',
+        'only_matching': True,
      }]
  
      @staticmethod
@@ -182,8 +187,12 @@ class ThePlatformIE(ThePlatformBaseIE):
              # Seems there's no pattern for the interested script filename, so
              # I try one by one
              for script in reversed(scripts):
-                feed_script = self._download_webpage(script, video_id, 'Downloading feed script')
-                feed_id = self._search_regex(r'defaultFeedId\s*:\s*"([^"]+)"', feed_script, 'default feed id', default=None)
+                feed_script = self._download_webpage(
+                    self._proto_relative_url(script, 'http:'),
+                    video_id, 'Downloading feed script')
+                feed_id = self._search_regex(
+                    r'defaultFeedId\s*:\s*"([^"]+)"', feed_script,
+                    'default feed id', default=None)
                  if feed_id is not None:
                      break
              if feed_id is None:
@@ -193,6 +202,15 @@ class ThePlatformIE(ThePlatformBaseIE):
  
          if smuggled_data.get('force_smil_url', False):
              smil_url = url
+        # Explicitly specified SMIL (see https://github.com/rg3/youtube-dl/issues/7385)
+        elif '/guid/' in url:
+            webpage = self._download_webpage(url, video_id)
+            smil_url = self._search_regex(
+                r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
+                webpage, 'smil url', group='url')
+            path = self._search_regex(
+                r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path')
+            smil_url += ('?' if '?' not in smil_url else '&') + 'formats=m3u,mpeg4&format=SMIL'  # parenthesised: a bare conditional would swallow the query string
          elif mobj.group('config'):
              config_url = url + '&form=json'
              config_url = config_url.replace('swf/', 'config/')
@@ -210,12 +228,14 @@ class ThePlatformIE(ThePlatformBaseIE):
          if sig:
              smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
  
-        formats = self._extract_theplatform_smil_formats(smil_url, video_id)
+        formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
  
          ret = self.get_metadata(path, video_id)
+        combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
          ret.update({
              'id': video_id,
              'formats': formats,
+            'subtitles': combined_subtitles,
          })
  
          return ret
@@ -253,6 +273,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
          entry = feed['entries'][0]
  
          formats = []
+        subtitles = {}
          first_video_id = None
          duration = None
          for item in entry['media$content']:
@@ -261,7 +282,9 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
              if first_video_id is None:
                  first_video_id = cur_video_id
                  duration = float_or_none(item.get('plfile$duration'))
-            formats.extend(self._extract_theplatform_smil_formats(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id))
+            cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)
+            formats.extend(cur_formats)
+            subtitles = self._merge_subtitles(subtitles, cur_subtitles)
  
          self._sort_formats(formats)
  
@@ -275,9 +298,11 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
          categories = [item['media$name'] for item in entry.get('media$categories', [])]
  
          ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+        subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
          ret.update({
              'id': video_id,
              'formats': formats,
+            'subtitles': subtitles,
              'thumbnails': thumbnails,
              'duration': duration,
              'timestamp': timestamp,