[youtube] Convert to new subtitles system

author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Mon, 16 Feb 2015 20:44:17 +0000 (21:44 +0100)

committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Mon, 16 Feb 2015 21:47:39 +0000 (22:47 +0100)
author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Mon, 16 Feb 2015 20:44:17 +0000 (21:44 +0100)
committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Mon, 16 Feb 2015 21:47:39 +0000 (22:47 +0100)
diff --git a/test/test_subtitles.py b/test/test_subtitles.py

index 84ae0e7142886efacdb1c0f6a455bbc3498655de..91cebce28be9fb9c8cb388ec78c24f42f81a3164 100644 (file)
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@ -50,11 +50,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
      url = 'QRS8MkLhQmM'
      IE = YoutubeIE
  
      url = 'QRS8MkLhQmM'
      IE = YoutubeIE
  
-    def test_youtube_no_writesubtitles(self):
-        self.DL.params['writesubtitles'] = False
-        subtitles = self.getSubtitles()
-        self.assertEqual(subtitles, None)
-
      def test_youtube_subtitles(self):
          self.DL.params['writesubtitles'] = True
          subtitles = self.getSubtitles()
      def test_youtube_subtitles(self):
          self.DL.params['writesubtitles'] = True
          subtitles = self.getSubtitles()
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 8545dc9e924aaadc2ba862b96b268b6cf6511ea9..a47f8f5de953f61d068fbe2e50fb9ebf656f84b4 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1020,9 +1020,13 @@ class YoutubeDL(object):
              info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
  
          if self.params.get('listsubtitles', False):
              info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
  
          if self.params.get('listsubtitles', False):
-            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'))
+            if 'automatic_captions' in info_dict:
+                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
+            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
              return
              return
-        info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles'))
+        info_dict['requested_subtitles'] = self.process_subtitles(
+            info_dict['id'], info_dict.get('subtitles'),
+            info_dict.get('automatic_captions'))
  
          # This extractors handle format selection themselves
          if info_dict['extractor'] in ['Youku']:
  
          # This extractors handle format selection themselves
          if info_dict['extractor'] in ['Youku']:
@@ -1152,8 +1156,14 @@ class YoutubeDL(object):
          info_dict.update(formats_to_download[-1])
          return info_dict
  
          info_dict.update(formats_to_download[-1])
          return info_dict
  
-    def process_subtitles(self, video_id, available_subs):
+    def process_subtitles(self, video_id, available_subs, available_autocaps):
          """Select the requested subtitles and their format"""
          """Select the requested subtitles and their format"""
+        if available_autocaps and self.params.get('writeautomaticsub'):
+            available_subs = available_subs.copy()
+            for lang, cap_info in available_autocaps.items():
+                if lang not in available_subs:
+                    available_subs[lang] = cap_info
+
          if not available_subs:
              return available_subs
  
          if not available_subs:
              return available_subs
  
@@ -1645,17 +1655,17 @@ class YoutubeDL(object):
              ['ID', 'width', 'height', 'URL'],
              [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
  
              ['ID', 'width', 'height', 'URL'],
              [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
  
-    def list_subtitles(self, video_id, subtitles):
+    def list_subtitles(self, video_id, subtitles, name='subtitles'):
          if not subtitles:
          if not subtitles:
-            self.to_screen('%s has no subtitles' % video_id)
+            self.to_screen('%s has no %s' % (video_id, name))
              return
          header_line = 'Language    formats'
          sub_lines = [
              '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats)))
              for lang, formats in subtitles.items()]
          self.to_screen(
              return
          header_line = 'Language    formats'
          sub_lines = [
              '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats)))
              for lang, formats in subtitles.items()]
          self.to_screen(
-            'Available subtitles for %s:\n%s\n%s' %
-            (video_id, header_line, '\n'.join(sub_lines)))
+            'Available %s for %s:\n%s\n%s' %
+            (name, video_id, header_line, '\n'.join(sub_lines)))
  
      def urlopen(self, req):
          """ Start an HTTP download """
  
      def urlopen(self, req):
          """ Start an HTTP download """
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index d149e0f92d0bfca461cffa97b79e9293b7f0d86e..fe7d8dbc9ea9331230ab0ae2fe28cc0069eab440 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -157,6 +157,8 @@ class InfoExtractor(object):
                      with the "ext" entry and one of:
                          * "data": The subtitles file contents
                          * "url": A url pointing to the subtitles file
                      with the "ext" entry and one of:
                          * "data": The subtitles file contents
                          * "url": A url pointing to the subtitles file
+    automatic_captions: Like 'subtitles', used by the YoutubeIE for
+                    automatically generated captions
      duration:       Length of the video in seconds, as an integer.
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      duration:       Length of the video in seconds, as an integer.
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
@@ -1007,6 +1009,16 @@ class InfoExtractor(object):
      def _get_subtitles(self, *args, **kwargs):
          raise NotImplementedError("This method must be implemented by subclasses")
  
      def _get_subtitles(self, *args, **kwargs):
          raise NotImplementedError("This method must be implemented by subclasses")
  
+    def extract_automatic_captions(self, *args, **kwargs):
+        automatic_captions = {}
+        list_subtitles = self._downloader.params.get('listsubtitles')
+        if self._downloader.params.get('writeautomaticsub', False) or list_subtitles:
+            automatic_captions.update(self._get_automatic_captions(*args, **kwargs))
+        return automatic_captions
+
+    def _get_automatic_captions(self, *args, **kwargs):
+        raise NotImplementedError("This method must be implemented by subclasses")
+
  
  class SearchInfoExtractor(InfoExtractor):
      """
  
  class SearchInfoExtractor(InfoExtractor):
      """
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 35ef4c30359cb70fa58b69eff16d75406190c5ed..1b2dbf2765b64ddd7d4f1cfab2698a4b03f6571f 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -11,7 +11,6 @@ import time
  import traceback
  
  from .common import InfoExtractor, SearchInfoExtractor
  import traceback
  
  from .common import InfoExtractor, SearchInfoExtractor
-from .subtitles import SubtitlesInfoExtractor
  from ..jsinterp import JSInterpreter
  from ..swfinterp import SWFInterpreter
  from ..compat import (
  from ..jsinterp import JSInterpreter
  from ..swfinterp import SWFInterpreter
  from ..compat import (
@@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
              return
  
  
              return
  
  
-class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
+class YoutubeIE(YoutubeBaseInfoExtractor):
      IE_DESC = 'YouTube.com'
      _VALID_URL = r"""(?x)^
                       (
      IE_DESC = 'YouTube.com'
      _VALID_URL = r"""(?x)^
                       (
@@ -644,7 +643,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              raise ExtractorError(
                  'Signature extraction failed: ' + tb, cause=e)
  
              raise ExtractorError(
                  'Signature extraction failed: ' + tb, cause=e)
  
-    def _get_available_subtitles(self, video_id, webpage):
+    def _get_subtitles(self, video_id, webpage):
          try:
              subs_doc = self._download_xml(
                  'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
          try:
              subs_doc = self._download_xml(
                  'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@@ -658,23 +657,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              lang = track.attrib['lang_code']
              if lang in sub_lang_list:
                  continue
              lang = track.attrib['lang_code']
              if lang in sub_lang_list:
                  continue
-            params = compat_urllib_parse.urlencode({
-                'lang': lang,
-                'v': video_id,
-                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
-                'name': track.attrib['name'].encode('utf-8'),
-            })
-            url = 'https://www.youtube.com/api/timedtext?' + params
-            sub_lang_list[lang] = url
+            sub_formats = []
+            for ext in ['sbv', 'vtt', 'srt']:
+                params = compat_urllib_parse.urlencode({
+                    'lang': lang,
+                    'v': video_id,
+                    'fmt': ext,
+                    'name': track.attrib['name'].encode('utf-8'),
+                })
+                sub_formats.append({
+                    'url': 'https://www.youtube.com/api/timedtext?' + params,
+                    'ext': ext,
+                })
+            sub_lang_list[lang] = sub_formats
          if not sub_lang_list:
              self._downloader.report_warning('video doesn\'t have subtitles')
              return {}
          return sub_lang_list
  
          if not sub_lang_list:
              self._downloader.report_warning('video doesn\'t have subtitles')
              return {}
          return sub_lang_list
  
-    def _get_available_automatic_caption(self, video_id, webpage):
+    def _get_automatic_captions(self, video_id, webpage):
          """We need the webpage for getting the captions url, pass it as an
             argument to speed up the process."""
          """We need the webpage for getting the captions url, pass it as an
             argument to speed up the process."""
-        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
          self.to_screen('%s: Looking for automatic captions' % video_id)
          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
          err_msg = 'Couldn\'t find automatic captions for %s' % video_id
          self.to_screen('%s: Looking for automatic captions' % video_id)
          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
          err_msg = 'Couldn\'t find automatic captions for %s' % video_id
@@ -704,14 +707,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              sub_lang_list = {}
              for lang_node in caption_list.findall('target'):
                  sub_lang = lang_node.attrib['lang_code']
              sub_lang_list = {}
              for lang_node in caption_list.findall('target'):
                  sub_lang = lang_node.attrib['lang_code']
-                params = compat_urllib_parse.urlencode({
-                    'lang': original_lang,
-                    'tlang': sub_lang,
-                    'fmt': sub_format,
-                    'ts': timestamp,
-                    'kind': caption_kind,
-                })
-                sub_lang_list[sub_lang] = caption_url + '&' + params
+                sub_formats = []
+                for ext in ['sbv', 'vtt', 'srt']:
+                    params = compat_urllib_parse.urlencode({
+                        'lang': original_lang,
+                        'tlang': sub_lang,
+                        'fmt': ext,
+                        'ts': timestamp,
+                        'kind': caption_kind,
+                    })
+                    sub_formats.append({
+                        'url': caption_url + '&' + params,
+                        'ext': ext,
+                    })
+                sub_lang_list[sub_lang] = sub_formats
              return sub_lang_list
          # An extractor error can be raise by the download process if there are
          # no automatic captions but there are subtitles
              return sub_lang_list
          # An extractor error can be raise by the download process if there are
          # no automatic captions but there are subtitles
@@ -966,10 +975,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
  
          # subtitles
          video_subtitles = self.extract_subtitles(video_id, video_webpage)
  
          # subtitles
          video_subtitles = self.extract_subtitles(video_id, video_webpage)
-
-        if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(video_id, video_webpage)
-            return
+        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
  
          if 'length_seconds' not in video_info:
              self._downloader.report_warning('unable to extract video duration')
  
          if 'length_seconds' not in video_info:
              self._downloader.report_warning('unable to extract video duration')
@@ -1118,6 +1124,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              'description': video_description,
              'categories': video_categories,
              'subtitles': video_subtitles,
              'description': video_description,
              'categories': video_categories,
              'subtitles': video_subtitles,
+            'automatic_captions': automatic_captions,
              'duration': video_duration,
              'age_limit': 18 if age_gate else 0,
              'annotations': video_annotations,
              'duration': video_duration,
              'age_limit': 18 if age_gate else 0,
              'annotations': video_annotations,
author	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Mon, 16 Feb 2015 20:44:17 +0000 (21:44 +0100)
committer	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Mon, 16 Feb 2015 21:47:39 +0000 (22:47 +0100)
test/test_subtitles.py		patch | blob | history
youtube_dl/YoutubeDL.py		patch | blob | history
youtube_dl/extractor/common.py		patch | blob | history
youtube_dl/extractor/youtube.py		patch | blob | history