Added new option '--all-srt' to download all the subtitles of a video.

[youtube-dl.git] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index 53fab690a41c55730b6339fd213b72345e3cd692..a220de80a4b62610efc483f50e45ffed5eb47279 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -216,6 +216,10 @@ class YoutubeIE(InfoExtractor):
          """Report attempt to download video info webpage."""
          self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
  
+    def report_video_subtitles_request(self, video_id, lang):
+        """Report attempt to download video subtitles for a given language."""
+        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for lang: %s' % (video_id,lang))
+
      def report_information_extraction(self, video_id):
          """Report attempt to extract video information."""
          self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
@@ -228,25 +232,7 @@ class YoutubeIE(InfoExtractor):
          """Indicate the download will use the RTMP protocol."""
          self._downloader.to_screen(u'[youtube] RTMP download detected')
  
-    def _closed_captions_xml_to_srt(self, xml_string):
-        srt = ''
-        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
-        # TODO parse xml instead of regex
-        for n, (start, dur_tag, dur, caption) in enumerate(texts):
-            if not dur: dur = '4'
-            start = float(start)
-            end = start + float(dur)
-            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
-            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
-            caption = unescapeHTML(caption)
-            caption = unescapeHTML(caption) # double cycle, intentional
-            srt += str(n+1) + '\n'
-            srt += start + ' --> ' + end + '\n'
-            srt += caption + '\n\n'
-        return srt
-
-    def _extract_subtitles(self, video_id):
-        self.report_video_subtitles_download(video_id)
+    def _get_available_subtitles(self, video_id):
          request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
          try:
              srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
@@ -256,27 +242,49 @@ class YoutubeIE(InfoExtractor):
          srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
          if not srt_lang_list:
              return (u'WARNING: video has no closed captions', None)
-        if self._downloader.params.get('subtitleslang', False):
-            srt_lang = self._downloader.params.get('subtitleslang')
-        elif 'en' in srt_lang_list:
-            srt_lang = 'en'
-        else:
-            srt_lang = list(srt_lang_list.keys())[0]
-        if not srt_lang in srt_lang_list:
-            return (u'WARNING: no closed captions found in the specified language', None)
+        return srt_lang_list
+
+    def _request_subtitle(self, str_lang, str_name, video_id, format = 'srt'):
+        self.report_video_subtitles_request(video_id, str_lang)
          params = compat_urllib_parse.urlencode({
-            'lang': srt_lang,
-            'name': srt_lang_list[srt_lang].encode('utf-8'),
+            'lang': str_lang,
+            'name': str_name,
              'v': video_id,
+            'fmt': format,
          })
          url = 'http://www.youtube.com/api/timedtext?' + params
          try:
-            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
+            srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
-        if not srt_xml:
+        if not srt:
              return (u'WARNING: Did not fetch video subtitles', None)
-        return (None, self._closed_captions_xml_to_srt(srt_xml))
+        return (None, str_lang, srt)
+
+    def _extract_subtitle(self, video_id):
+        self.report_video_subtitles_download(video_id)
+        srt_lang_list = self._get_available_subtitles(video_id)
+
+        if self._downloader.params.get('subtitleslang', False):
+            srt_lang = self._downloader.params.get('subtitleslang')
+        elif 'en' in srt_lang_list:
+            srt_lang = 'en'
+        else:
+            srt_lang = list(srt_lang_list.keys())[0]
+        if not srt_lang in srt_lang_list:
+            return (u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, None)
+
+        sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id)
+        return [sub]
+
+    def _extract_all_subtitles(self, video_id):
+        self.report_video_subtitles_download(video_id)
+        srt_lang_list = self._get_available_subtitles(video_id)
+        subs = []
+        for srt_lang in srt_lang_list:
+            sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id)
+            subs.append(sub)
+        return subs
  
      def _print_formats(self, formats):
          print('Available formats:')
@@ -499,10 +507,20 @@ class YoutubeIE(InfoExtractor):
  
          # closed captions
          video_subtitles = None
+
          if self._downloader.params.get('writesubtitles', False):
-            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
-            if srt_error:
-                self._downloader.trouble(srt_error)
+            video_subtitles = self._extract_subtitle(video_id)
+            if video_subtitles:
+                (srt_error, srt_lang, srt) = video_subtitles[0]
+                if srt_error:
+                    self._downloader.trouble(srt_error)
+
+        if self._downloader.params.get('allsubtitles', False):
+            video_subtitles = self._extract_all_subtitles(video_id)
+            for video_subtitle in video_subtitles:
+                (srt_error, srt_lang, srt) = video_subtitle
+                if srt_error:
+                    self._downloader.trouble(srt_error)
  
          if 'length_seconds' not in video_info:
              self._downloader.trouble(u'WARNING: unable to extract video duration')
@@ -1330,7 +1348,7 @@ class GenericIE(InfoExtractor):
          opener = compat_urllib_request.OpenerDirector()
          for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                          HTTPMethodFallback, HEADRedirectHandler,
-                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
+                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
              opener.add_handler(handler())
  
          response = opener.open(HeadRequest(url))
@@ -1366,6 +1384,9 @@ class GenericIE(InfoExtractor):
          if mobj is None:
              # Broaden the search a little bit
              mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+        if mobj is None:
+            # Broaden the search a little bit: JWPlayer JS loader
+            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
          if mobj is None:
              self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
              return
@@ -2098,6 +2119,10 @@ class FacebookIE(InfoExtractor):
          params_raw = compat_urllib_parse.unquote(data['params'])
          params = json.loads(params_raw)
          video_url = params['hd_src']
+        if not video_url:
+            video_url = params['sd_src']
+        if not video_url:
+            raise ExtractorError(u'Cannot find video URL')
          video_duration = int(params['video_duration'])
  
          m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
@@ -2233,7 +2258,7 @@ class MyVideoIE(InfoExtractor):
          webpage = self._download_webpage(webpage_url, video_id)
  
          self.report_extraction(video_id)
-        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
+        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                   webpage)
          if mobj is None:
              self._downloader.trouble(u'ERROR: unable to extract media URL')
@@ -3725,13 +3750,13 @@ class YouPornIE(InfoExtractor):
          webpage = self._download_webpage(req, video_id)
  
          # Get the video title
-        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
+        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
          if result is None:
-            raise ExtractorError(u'ERROR: unable to extract video title')
+            raise ExtractorError(u'Unable to extract video title')
          video_title = result.group('title').strip()
  
          # Get the video date
-        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
+        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
          if result is None:
              self._downloader.to_stderr(u'WARNING: unable to extract video date')
              upload_date = None
@@ -3739,9 +3764,9 @@ class YouPornIE(InfoExtractor):
              upload_date = result.group('date').strip()
  
          # Get the video uploader
-        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
+        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
          if result is None:
-            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
+            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
              video_uploader = None
          else:
              video_uploader = result.group('uploader').strip()