Added new option '--all-srt' to download all the subtitles of a video.

[youtube-dl.git] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index 23bd21af51a3d64ccf987c5ba5f8fb89d8d0902a..a220de80a4b62610efc483f50e45ffed5eb47279 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -216,6 +216,10 @@ class YoutubeIE(InfoExtractor):
          """Report attempt to download video info webpage."""
          self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
  
+    def report_video_subtitles_request(self, video_id, lang):
+        """Report attempt to download the video subtitles for one language."""
+        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for lang: %s' % (video_id,lang))
+
      def report_information_extraction(self, video_id):
          """Report attempt to extract video information."""
          self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
@@ -228,25 +232,7 @@ class YoutubeIE(InfoExtractor):
          """Indicate the download will use the RTMP protocol."""
          self._downloader.to_screen(u'[youtube] RTMP download detected')
  
-    def _closed_captions_xml_to_srt(self, xml_string):
-        srt = ''
-        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
-        # TODO parse xml instead of regex
-        for n, (start, dur_tag, dur, caption) in enumerate(texts):
-            if not dur: dur = '4'
-            start = float(start)
-            end = start + float(dur)
-            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
-            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
-            caption = unescapeHTML(caption)
-            caption = unescapeHTML(caption) # double cycle, intentional
-            srt += str(n+1) + '\n'
-            srt += start + ' --> ' + end + '\n'
-            srt += caption + '\n\n'
-        return srt
-
-    def _extract_subtitles(self, video_id):
-        self.report_video_subtitles_download(video_id)
+    def _get_available_subtitles(self, video_id):
          request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
          try:
              srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
@@ -256,27 +242,49 @@ class YoutubeIE(InfoExtractor):
          srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
          if not srt_lang_list:
              return (u'WARNING: video has no closed captions', None)
-        if self._downloader.params.get('subtitleslang', False):
-            srt_lang = self._downloader.params.get('subtitleslang')
-        elif 'en' in srt_lang_list:
-            srt_lang = 'en'
-        else:
-            srt_lang = list(srt_lang_list.keys())[0]
-        if not srt_lang in srt_lang_list:
-            return (u'WARNING: no closed captions found in the specified language', None)
+        return srt_lang_list
+
+    def _request_subtitle(self, str_lang, str_name, video_id, format='srt'):
+        """Download one subtitle track; return (error, lang, data) with error None on success."""
+        self.report_video_subtitles_request(video_id, str_lang)
          params = compat_urllib_parse.urlencode({
-            'lang': srt_lang,
-            'name': srt_lang_list[srt_lang].encode('utf-8'),
+            'lang': str_lang, 'name': str_name,
              'v': video_id,
+            'fmt': format,
          })
          url = 'http://www.youtube.com/api/timedtext?' + params
          try:
-            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
+            srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
-        if not srt_xml:
-            return (u'WARNING: Did not fetch video subtitles', None)
-        return (None, self._closed_captions_xml_to_srt(srt_xml))
+            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), str_lang, None)
+        if not srt:
+            return (u'WARNING: Did not fetch video subtitles', str_lang, None)
+        return (None, str_lang, srt)
+
+    def _extract_subtitle(self, video_id):
+        """Fetch a single subtitle track; return a one-item list of (error, lang, data) tuples."""
+        self.report_video_subtitles_download(video_id)
+        srt_lang_list = self._get_available_subtitles(video_id)
+        if not isinstance(srt_lang_list, dict):  # listing failed: got a (warning, None) pair, not the lang map
+            return [(srt_lang_list[0], None, None)]
+        if self._downloader.params.get('subtitleslang', False):
+            srt_lang = self._downloader.params.get('subtitleslang')
+        elif 'en' in srt_lang_list:
+            srt_lang = 'en'
+        else:
+            srt_lang = list(srt_lang_list.keys())[0]
+        if srt_lang not in srt_lang_list:
+            return [(u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, srt_lang, None)]
+        return [self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id)]
+
+    def _extract_all_subtitles(self, video_id):
+        """Download every available subtitle track; return a list of (error, lang, data) tuples."""
+        self.report_video_subtitles_download(video_id)
+        srt_lang_list = self._get_available_subtitles(video_id)
+        if not isinstance(srt_lang_list, dict):  # listing failed: got a (warning, None) pair, not the lang map
+            return [(srt_lang_list[0], None, None)]
+        return [self._request_subtitle(srt_lang, srt_name.encode('utf-8'), video_id)
+                for srt_lang, srt_name in srt_lang_list.items()]
  
      def _print_formats(self, formats):
          print('Available formats:')
@@ -499,10 +507,20 @@ class YoutubeIE(InfoExtractor):
  
          # closed captions
          video_subtitles = None
+
          if self._downloader.params.get('writesubtitles', False):
-            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
-            if srt_error:
-                self._downloader.trouble(srt_error)
+            video_subtitles = self._extract_subtitle(video_id)
+            if video_subtitles:
+                (srt_error, srt_lang, srt) = video_subtitles[0]
+                if srt_error:
+                    self._downloader.trouble(srt_error)
+
+        if self._downloader.params.get('allsubtitles', False):
+            video_subtitles = self._extract_all_subtitles(video_id)
+            for video_subtitle in video_subtitles:
+                (srt_error, srt_lang, srt) = video_subtitle
+                if srt_error:
+                    self._downloader.trouble(srt_error)
  
          if 'length_seconds' not in video_info:
              self._downloader.trouble(u'WARNING: unable to extract video duration')
@@ -1366,6 +1384,9 @@ class GenericIE(InfoExtractor):
          if mobj is None:
              # Broaden the search a little bit
              mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+        if mobj is None:
+            # Broaden the search a little bit: JWPlayer JS loader
+            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
          if mobj is None:
              self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
              return