]> gitweb @ CieloNegro.org - youtube-dl.git/commitdiff
[youtube] Convert to new subtitles system
author: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Mon, 16 Feb 2015 20:44:17 +0000 (21:44 +0100)
committer: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Mon, 16 Feb 2015 21:47:39 +0000 (22:47 +0100)
The automatic captions are stored in the 'automatic_captions' field, which is used if no normal subtitles are found for a specific language.

test/test_subtitles.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/common.py
youtube_dl/extractor/youtube.py

index 84ae0e7142886efacdb1c0f6a455bbc3498655de..91cebce28be9fb9c8cb388ec78c24f42f81a3164 100644 (file)
@@ -50,11 +50,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
     url = 'QRS8MkLhQmM'
     IE = YoutubeIE
 
     url = 'QRS8MkLhQmM'
     IE = YoutubeIE
 
-    def test_youtube_no_writesubtitles(self):
-        self.DL.params['writesubtitles'] = False
-        subtitles = self.getSubtitles()
-        self.assertEqual(subtitles, None)
-
     def test_youtube_subtitles(self):
         self.DL.params['writesubtitles'] = True
         subtitles = self.getSubtitles()
     def test_youtube_subtitles(self):
         self.DL.params['writesubtitles'] = True
         subtitles = self.getSubtitles()
index 8545dc9e924aaadc2ba862b96b268b6cf6511ea9..a47f8f5de953f61d068fbe2e50fb9ebf656f84b4 100755 (executable)
@@ -1020,9 +1020,13 @@ class YoutubeDL(object):
             info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
 
         if self.params.get('listsubtitles', False):
             info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
 
         if self.params.get('listsubtitles', False):
-            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'))
+            if 'automatic_captions' in info_dict:
+                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
+            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
             return
             return
-        info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles'))
+        info_dict['requested_subtitles'] = self.process_subtitles(
+            info_dict['id'], info_dict.get('subtitles'),
+            info_dict.get('automatic_captions'))
 
         # This extractors handle format selection themselves
         if info_dict['extractor'] in ['Youku']:
 
         # This extractors handle format selection themselves
         if info_dict['extractor'] in ['Youku']:
@@ -1152,8 +1156,14 @@ class YoutubeDL(object):
         info_dict.update(formats_to_download[-1])
         return info_dict
 
         info_dict.update(formats_to_download[-1])
         return info_dict
 
-    def process_subtitles(self, video_id, available_subs):
+    def process_subtitles(self, video_id, available_subs, available_autocaps):
         """Select the requested subtitles and their format"""
         """Select the requested subtitles and their format"""
+        if available_autocaps and self.params.get('writeautomaticsub'):
+            available_subs = available_subs.copy()
+            for lang, cap_info in available_autocaps.items():
+                if lang not in available_subs:
+                    available_subs[lang] = cap_info
+
         if not available_subs:
             return available_subs
 
         if not available_subs:
             return available_subs
 
@@ -1645,17 +1655,17 @@ class YoutubeDL(object):
             ['ID', 'width', 'height', 'URL'],
             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
 
             ['ID', 'width', 'height', 'URL'],
             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
 
-    def list_subtitles(self, video_id, subtitles):
+    def list_subtitles(self, video_id, subtitles, name='subtitles'):
         if not subtitles:
         if not subtitles:
-            self.to_screen('%s has no subtitles' % video_id)
+            self.to_screen('%s has no %s' % (video_id, name))
             return
         header_line = 'Language    formats'
         sub_lines = [
             '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats)))
             for lang, formats in subtitles.items()]
         self.to_screen(
             return
         header_line = 'Language    formats'
         sub_lines = [
             '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats)))
             for lang, formats in subtitles.items()]
         self.to_screen(
-            'Available subtitles for %s:\n%s\n%s' %
-            (video_id, header_line, '\n'.join(sub_lines)))
+            'Available %s for %s:\n%s\n%s' %
+            (name, video_id, header_line, '\n'.join(sub_lines)))
 
     def urlopen(self, req):
         """ Start an HTTP download """
 
     def urlopen(self, req):
         """ Start an HTTP download """
index d149e0f92d0bfca461cffa97b79e9293b7f0d86e..fe7d8dbc9ea9331230ab0ae2fe28cc0069eab440 100644 (file)
@@ -157,6 +157,8 @@ class InfoExtractor(object):
                     with the "ext" entry and one of:
                         * "data": The subtitles file contents
                         * "url": A url pointing to the subtitles file
                     with the "ext" entry and one of:
                         * "data": The subtitles file contents
                         * "url": A url pointing to the subtitles file
+    automatic_captions: Like 'subtitles', used by the YoutubeIE for
+                    automatically generated captions
     duration:       Length of the video in seconds, as an integer.
     view_count:     How many users have watched the video on the platform.
     like_count:     Number of positive ratings of the video
     duration:       Length of the video in seconds, as an integer.
     view_count:     How many users have watched the video on the platform.
     like_count:     Number of positive ratings of the video
@@ -1007,6 +1009,16 @@ class InfoExtractor(object):
     def _get_subtitles(self, *args, **kwargs):
         raise NotImplementedError("This method must be implemented by subclasses")
 
     def _get_subtitles(self, *args, **kwargs):
         raise NotImplementedError("This method must be implemented by subclasses")
 
+    def extract_automatic_captions(self, *args, **kwargs):
+        automatic_captions = {}
+        list_subtitles = self._downloader.params.get('listsubtitles')
+        if self._downloader.params.get('writeautomaticsub', False) or list_subtitles:
+            automatic_captions.update(self._get_automatic_captions(*args, **kwargs))
+        return automatic_captions
+
+    def _get_automatic_captions(self, *args, **kwargs):
+        raise NotImplementedError("This method must be implemented by subclasses")
+
 
 class SearchInfoExtractor(InfoExtractor):
     """
 
 class SearchInfoExtractor(InfoExtractor):
     """
index 35ef4c30359cb70fa58b69eff16d75406190c5ed..1b2dbf2765b64ddd7d4f1cfab2698a4b03f6571f 100644 (file)
@@ -11,7 +11,6 @@ import time
 import traceback
 
 from .common import InfoExtractor, SearchInfoExtractor
 import traceback
 
 from .common import InfoExtractor, SearchInfoExtractor
-from .subtitles import SubtitlesInfoExtractor
 from ..jsinterp import JSInterpreter
 from ..swfinterp import SWFInterpreter
 from ..compat import (
 from ..jsinterp import JSInterpreter
 from ..swfinterp import SWFInterpreter
 from ..compat import (
@@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             return
 
 
             return
 
 
-class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
+class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com'
     _VALID_URL = r"""(?x)^
                      (
     IE_DESC = 'YouTube.com'
     _VALID_URL = r"""(?x)^
                      (
@@ -644,7 +643,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             raise ExtractorError(
                 'Signature extraction failed: ' + tb, cause=e)
 
             raise ExtractorError(
                 'Signature extraction failed: ' + tb, cause=e)
 
-    def _get_available_subtitles(self, video_id, webpage):
+    def _get_subtitles(self, video_id, webpage):
         try:
             subs_doc = self._download_xml(
                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
         try:
             subs_doc = self._download_xml(
                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@@ -658,23 +657,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             lang = track.attrib['lang_code']
             if lang in sub_lang_list:
                 continue
             lang = track.attrib['lang_code']
             if lang in sub_lang_list:
                 continue
-            params = compat_urllib_parse.urlencode({
-                'lang': lang,
-                'v': video_id,
-                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
-                'name': track.attrib['name'].encode('utf-8'),
-            })
-            url = 'https://www.youtube.com/api/timedtext?' + params
-            sub_lang_list[lang] = url
+            sub_formats = []
+            for ext in ['sbv', 'vtt', 'srt']:
+                params = compat_urllib_parse.urlencode({
+                    'lang': lang,
+                    'v': video_id,
+                    'fmt': ext,
+                    'name': track.attrib['name'].encode('utf-8'),
+                })
+                sub_formats.append({
+                    'url': 'https://www.youtube.com/api/timedtext?' + params,
+                    'ext': ext,
+                })
+            sub_lang_list[lang] = sub_formats
         if not sub_lang_list:
             self._downloader.report_warning('video doesn\'t have subtitles')
             return {}
         return sub_lang_list
 
         if not sub_lang_list:
             self._downloader.report_warning('video doesn\'t have subtitles')
             return {}
         return sub_lang_list
 
-    def _get_available_automatic_caption(self, video_id, webpage):
+    def _get_automatic_captions(self, video_id, webpage):
         """We need the webpage for getting the captions url, pass it as an
            argument to speed up the process."""
         """We need the webpage for getting the captions url, pass it as an
            argument to speed up the process."""
-        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
         self.to_screen('%s: Looking for automatic captions' % video_id)
         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
         self.to_screen('%s: Looking for automatic captions' % video_id)
         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
@@ -704,14 +707,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             sub_lang_list = {}
             for lang_node in caption_list.findall('target'):
                 sub_lang = lang_node.attrib['lang_code']
             sub_lang_list = {}
             for lang_node in caption_list.findall('target'):
                 sub_lang = lang_node.attrib['lang_code']
-                params = compat_urllib_parse.urlencode({
-                    'lang': original_lang,
-                    'tlang': sub_lang,
-                    'fmt': sub_format,
-                    'ts': timestamp,
-                    'kind': caption_kind,
-                })
-                sub_lang_list[sub_lang] = caption_url + '&' + params
+                sub_formats = []
+                for ext in ['sbv', 'vtt', 'srt']:
+                    params = compat_urllib_parse.urlencode({
+                        'lang': original_lang,
+                        'tlang': sub_lang,
+                        'fmt': ext,
+                        'ts': timestamp,
+                        'kind': caption_kind,
+                    })
+                    sub_formats.append({
+                        'url': caption_url + '&' + params,
+                        'ext': ext,
+                    })
+                sub_lang_list[sub_lang] = sub_formats
             return sub_lang_list
         # An extractor error can be raise by the download process if there are
         # no automatic captions but there are subtitles
             return sub_lang_list
         # An extractor error can be raise by the download process if there are
         # no automatic captions but there are subtitles
@@ -966,10 +975,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
         # subtitles
         video_subtitles = self.extract_subtitles(video_id, video_webpage)
 
         # subtitles
         video_subtitles = self.extract_subtitles(video_id, video_webpage)
-
-        if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(video_id, video_webpage)
-            return
+        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
 
         if 'length_seconds' not in video_info:
             self._downloader.report_warning('unable to extract video duration')
 
         if 'length_seconds' not in video_info:
             self._downloader.report_warning('unable to extract video duration')
@@ -1118,6 +1124,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             'description': video_description,
             'categories': video_categories,
             'subtitles': video_subtitles,
             'description': video_description,
             'categories': video_categories,
             'subtitles': video_subtitles,
+            'automatic_captions': automatic_captions,
             'duration': video_duration,
             'age_limit': 18 if age_gate else 0,
             'annotations': video_annotations,
             'duration': video_duration,
             'age_limit': 18 if age_gate else 0,
             'annotations': video_annotations,