[common] Relax JWPlayer regex and remove duplicate URLs (#12768)

[youtube-dl.git] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 6d4789d9636cc6874e2979dcfcbe56839323ad7c..12e010a0dcda30e6ccc42b2b61202e7e355451d8 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,3 +1,4 @@
+# coding: utf-8
  from __future__ import unicode_literals
  
  import base64
  from __future__ import unicode_literals
  
  import base64
@@ -36,34 +37,35 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      determine_ext,
      clean_html,
      compiled_regex_type,
      determine_ext,
+    determine_protocol,
      error_to_compat_str,
      ExtractorError,
      error_to_compat_str,
      ExtractorError,
+    extract_attributes,
      fix_xml_ampersands,
      float_or_none,
      GeoRestrictedError,
      GeoUtils,
      int_or_none,
      js_to_json,
      fix_xml_ampersands,
      float_or_none,
      GeoRestrictedError,
      GeoUtils,
      int_or_none,
      js_to_json,
+    mimetype2ext,
+    orderedSet,
+    parse_codecs,
+    parse_duration,
      parse_iso8601,
      parse_iso8601,
+    parse_m3u8_attributes,
      RegexNotFoundError,
      RegexNotFoundError,
-    sanitize_filename,
      sanitized_Request,
      sanitized_Request,
+    sanitize_filename,
      unescapeHTML,
      unified_strdate,
      unified_timestamp,
      unescapeHTML,
      unified_strdate,
      unified_timestamp,
+    update_Request,
+    update_url_query,
+    urljoin,
      url_basename,
      xpath_element,
      xpath_text,
      xpath_with_ns,
      url_basename,
      xpath_element,
      xpath_text,
      xpath_with_ns,
-    determine_protocol,
-    parse_duration,
-    mimetype2ext,
-    update_Request,
-    update_url_query,
-    parse_m3u8_attributes,
-    extract_attributes,
-    parse_codecs,
-    urljoin,
  )
  
  
  )
  
  
@@ -393,7 +395,6 @@ class InfoExtractor(object):
          is selected and a random IP belonging to this country is generated. This
          IP will be passed as X-Forwarded-For HTTP header in all subsequent
          HTTP requests.
          is selected and a random IP belonging to this country is generated. This
          IP will be passed as X-Forwarded-For HTTP header in all subsequent
          HTTP requests.
-        Method does nothing if no countries are specified.
  
          This method will be used for initial geo bypass mechanism initialization
          during the instance initialization with _GEO_COUNTRIES.
  
          This method will be used for initial geo bypass mechanism initialization
          during the instance initialization with _GEO_COUNTRIES.
@@ -402,8 +403,6 @@ class InfoExtractor(object):
          information is not available beforehand (e.g. obtained during
          extraction) or due to some another reason.
          """
          information is not available beforehand (e.g. obtained during
          extraction) or due to some another reason.
          """
-        if not countries:
-            return
          if not self._x_forwarded_for_ip:
              country_code = self._downloader.params.get('geo_bypass_country', None)
              # If there is no explicit country for geo bypass specified and
          if not self._x_forwarded_for_ip:
              country_code = self._downloader.params.get('geo_bypass_country', None)
              # If there is no explicit country for geo bypass specified and
@@ -418,7 +417,8 @@ class InfoExtractor(object):
                  self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                  if self._downloader.params.get('verbose', False):
                      self._downloader.to_stdout(
                  self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                  if self._downloader.params.get('verbose', False):
                      self._downloader.to_stdout(
-                        '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
+                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
+                        % (self._x_forwarded_for_ip, country_code.upper()))
  
      def extract(self, url):
          """Extracts URL information and returns it in list of dicts."""
  
      def extract(self, url):
          """Extracts URL information and returns it in list of dicts."""
@@ -447,10 +447,12 @@ class InfoExtractor(object):
                  self._downloader.params.get('geo_bypass', True) and
                  not self._x_forwarded_for_ip and
                  countries):
                  self._downloader.params.get('geo_bypass', True) and
                  not self._x_forwarded_for_ip and
                  countries):
-            self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries))
+            country_code = random.choice(countries)
+            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
              if self._x_forwarded_for_ip:
                  self.report_warning(
              if self._x_forwarded_for_ip:
                  self.report_warning(
-                    'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
+                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
+                    % (self._x_forwarded_for_ip, country_code.upper()))
                  return True
          return False
  
                  return True
          return False
  
@@ -546,6 +548,34 @@ class InfoExtractor(object):
  
          return encoding
  
  
          return encoding
  
+    def __check_blocked(self, content):
+        """Raise ExtractorError if content matches a known block page.
+
+        content is the page text to inspect. Three block-page signatures are
+        checked in turn: a Websense filter page, an Indian censorship notice
+        and a TTK / blocklist.rkn.gov.ru notice. On the first match an
+        ExtractorError with expected=True is raised; when nothing matches
+        the method falls through and returns None.
+        """
+        # Some markers are only looked for in the first 512 characters.
+        first_block = content[:512]
+        # Websense: the title may appear anywhere in the page, but the
+        # 'Websense' marker has to be within the first 512 characters.
+        if ('<title>Access to this site is blocked</title>' in content and
+                'Websense' in first_block):
+            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
+            blocked_iframe = self._html_search_regex(
+                r'<iframe src="([^"]+)"', content,
+                'Websense information URL', default=None)
+            # Append the iframe URL to the message when one was found.
+            if blocked_iframe:
+                msg += ' Visit %s for more details' % blocked_iframe
+            raise ExtractorError(msg, expected=True)
+        # Indian censorship notice: the title has to be within the first
+        # 512 characters.
+        if '<title>The URL you requested has been blocked</title>' in first_block:
+            msg = (
+                'Access to this webpage has been blocked by Indian censorship. '
+                'Use a VPN or proxy server (with --proxy) to route around it.')
+            block_msg = self._html_search_regex(
+                r'</h1><p>(.*?)</p>',
+                content, 'block message', default=None)
+            if block_msg:
+                # Newlines are replaced so the quoted message stays on one line.
+                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+            raise ExtractorError(msg, expected=True)
+        # TTK notice: both markers are searched for in the whole page.
+        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
+                'blocklist.rkn.gov.ru' in content):
+            raise ExtractorError(
+                'Access to this webpage has been blocked by decision of the Russian government. '
+                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
+                expected=True)
+
      def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
          content_type = urlh.headers.get('Content-Type', '')
          webpage_bytes = urlh.read()
      def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
          content_type = urlh.headers.get('Content-Type', '')
          webpage_bytes = urlh.read()
@@ -587,25 +617,7 @@ class InfoExtractor(object):
          except LookupError:
              content = webpage_bytes.decode('utf-8', 'replace')
  
          except LookupError:
              content = webpage_bytes.decode('utf-8', 'replace')
  
-        if ('<title>Access to this site is blocked</title>' in content and
-                'Websense' in content[:512]):
-            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
-            blocked_iframe = self._html_search_regex(
-                r'<iframe src="([^"]+)"', content,
-                'Websense information URL', default=None)
-            if blocked_iframe:
-                msg += ' Visit %s for more details' % blocked_iframe
-            raise ExtractorError(msg, expected=True)
-        if '<title>The URL you requested has been blocked</title>' in content[:512]:
-            msg = (
-                'Access to this webpage has been blocked by Indian censorship. '
-                'Use a VPN or proxy server (with --proxy) to route around it.')
-            block_msg = self._html_search_regex(
-                r'</h1><p>(.*?)</p>',
-                content, 'block message', default=None)
-            if block_msg:
-                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
-            raise ExtractorError(msg, expected=True)
+        self.__check_blocked(content)
  
          return content
  
  
          return content
  
@@ -714,6 +726,13 @@ class InfoExtractor(object):
              video_info['title'] = video_title
          return video_info
  
              video_info['title'] = video_title
          return video_info
  
+    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
+        """Build a playlist result from an iterable of matches.
+
+        Each match m is turned into a URL (getter(m) when getter is given,
+        otherwise m itself), passed through self._proto_relative_url and
+        wrapped with self.url_result(..., ie). The entries are run through
+        orderedSet and then handed to self.playlist_result with video_id as
+        playlist_id and video_title as playlist_title.
+        """
+        # NOTE(review): orderedSet presumably drops duplicate entries while
+        # keeping first-seen order -- confirm against ..utils.
+        urlrs = orderedSet(
+            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
+            for m in matches)
+        return self.playlist_result(
+            urlrs, playlist_id=video_id, playlist_title=video_title)
+
      @staticmethod
      def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
          """Returns a playlist"""
      @staticmethod
      def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
          """Returns a playlist"""
@@ -1760,7 +1779,7 @@ class InfoExtractor(object):
                      if content_type == 'text':
                          # TODO implement WebVTT downloading
                          pass
                      if content_type == 'text':
                          # TODO implement WebVTT downloading
                          pass
-                    elif content_type == 'video' or content_type == 'audio':
+                    elif content_type in ('video', 'audio'):
                          base_url = ''
                          for element in (representation, adaptation_set, period, mpd_doc):
                              base_url_e = element.find(_add_ns('BaseURL'))
                          base_url = ''
                          for element in (representation, adaptation_set, period, mpd_doc):
                              base_url_e = element.find(_add_ns('BaseURL'))
@@ -2010,7 +2029,7 @@ class InfoExtractor(object):
                  })
          return formats
  
                  })
          return formats
  
-    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
+    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
          def absolute_url(video_url):
              return compat_urlparse.urljoin(base_url, video_url)
  
          def absolute_url(video_url):
              return compat_urlparse.urljoin(base_url, video_url)
  
@@ -2032,7 +2051,8 @@ class InfoExtractor(object):
                  is_plain_url = False
                  formats = self._extract_m3u8_formats(
                      full_url, video_id, ext='mp4',
                  is_plain_url = False
                  formats = self._extract_m3u8_formats(
                      full_url, video_id, ext='mp4',
-                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
+                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
+                    preference=preference)
              elif ext == 'mpd':
                  is_plain_url = False
                  formats = self._extract_mpd_formats(
              elif ext == 'mpd':
                  is_plain_url = False
                  formats = self._extract_mpd_formats(
@@ -2160,18 +2180,24 @@ class InfoExtractor(object):
                      })
          return formats
  
                      })
          return formats
  
-    @staticmethod
-    def _find_jwplayer_data(webpage):
+    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
          mobj = re.search(
          mobj = re.search(
-            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\).*?\.setup\s*\((?P<options>[^)]+)\)',
              webpage)
          if mobj:
              webpage)
          if mobj:
-            return mobj.group('options')
+            try:
+                jwplayer_data = self._parse_json(mobj.group('options'),
+                                                 video_id=video_id,
+                                                 transform_source=transform_source)
+            except ExtractorError:
+                pass
+            else:
+                if isinstance(jwplayer_data, dict):
+                    return jwplayer_data
  
      def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
  
      def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
-        jwplayer_data = self._parse_json(
-            self._find_jwplayer_data(webpage), video_id,
-            transform_source=js_to_json)
+        jwplayer_data = self._find_jwplayer_data(
+            webpage, video_id, transform_source=js_to_json)
          return self._parse_jwplayer_data(
              jwplayer_data, video_id, *args, **kwargs)
  
          return self._parse_jwplayer_data(
              jwplayer_data, video_id, *args, **kwargs)
  
@@ -2197,56 +2223,9 @@ class InfoExtractor(object):
  
              this_video_id = video_id or video_data['mediaid']
  
  
              this_video_id = video_id or video_data['mediaid']
  
-            formats = []
-            for source in video_data['sources']:
-                source_url = self._proto_relative_url(source['file'])
-                if base_url:
-                    source_url = compat_urlparse.urljoin(base_url, source_url)
-                source_type = source.get('type') or ''
-                ext = mimetype2ext(source_type) or determine_ext(source_url)
-                if source_type == 'hls' or ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
-                elif ext == 'mpd':
-                    formats.extend(self._extract_mpd_formats(
-                        source_url, this_video_id, mpd_id=mpd_id, fatal=False))
-                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
-                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
-                    formats.append({
-                        'url': source_url,
-                        'vcodec': 'none',
-                        'ext': ext,
-                    })
-                else:
-                    height = int_or_none(source.get('height'))
-                    if height is None:
-                        # Often no height is provided but there is a label in
-                        # format like 1080p.
-                        height = int_or_none(self._search_regex(
-                            r'^(\d{3,})[pP]$', source.get('label') or '',
-                            'height', default=None))
-                    a_format = {
-                        'url': source_url,
-                        'width': int_or_none(source.get('width')),
-                        'height': height,
-                        'ext': ext,
-                    }
-                    if source_url.startswith('rtmp'):
-                        a_format['ext'] = 'flv'
-
-                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
-                        # of jwplayer.flash.swf
-                        rtmp_url_parts = re.split(
-                            r'((?:mp4|mp3|flv):)', source_url, 1)
-                        if len(rtmp_url_parts) == 3:
-                            rtmp_url, prefix, play_path = rtmp_url_parts
-                            a_format.update({
-                                'url': rtmp_url,
-                                'play_path': prefix + play_path,
-                            })
-                        if rtmp_params:
-                            a_format.update(rtmp_params)
-                    formats.append(a_format)
+            formats = self._parse_jwplayer_formats(
+                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
+                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
              self._sort_formats(formats)
  
              subtitles = {}
              self._sort_formats(formats)
  
              subtitles = {}
@@ -2277,6 +2256,71 @@ class InfoExtractor(object):
          else:
              return self.playlist_result(entries)
  
          else:
              return self.playlist_result(entries)
  
+    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
+                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        """Convert JWPlayer source entries into a list of format dicts.
+
+        jwplayer_sources_data is iterated; each item is read with .get()
+        ('file', 'type', 'height', 'width', 'label', 'bitrate'). Sources
+        without a 'file' value and sources whose resolved URL was already
+        seen are skipped.
+
+        video_id is forwarded to the manifest extractors, m3u8_id and mpd_id
+        to the HLS and DASH extractors respectively. When base_url is given,
+        source URLs are joined against it. rtmp_params, when given, is merged
+        into every RTMP format.
+
+        Returns the accumulated list of formats.
+        """
+        # URLs already processed, used to skip duplicate sources.
+        urls = []
+        formats = []
+        for source in jwplayer_sources_data:
+            source_url = self._proto_relative_url(source.get('file'))
+            if not source_url:
+                continue
+            if base_url:
+                source_url = compat_urlparse.urljoin(base_url, source_url)
+            # Duplicates are detected on the URL after the base_url join.
+            if source_url in urls:
+                continue
+            urls.append(source_url)
+            source_type = source.get('type') or ''
+            # The declared type wins; the URL is only consulted as fallback.
+            ext = mimetype2ext(source_type) or determine_ext(source_url)
+            # Manifest sources are expanded by the dedicated extractors,
+            # all called with fatal=False.
+            if source_type == 'hls' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=m3u8_id, fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    source_url, video_id, mpd_id=mpd_id, fatal=False))
+            elif ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    source_url, video_id, fatal=False))
+            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
+            elif source_type.startswith('audio') or ext in (
+                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
+                # Audio-only source: flagged with vcodec 'none'.
+                formats.append({
+                    'url': source_url,
+                    'vcodec': 'none',
+                    'ext': ext,
+                })
+            else:
+                height = int_or_none(source.get('height'))
+                if height is None:
+                    # Often no height is provided but there is a label in
+                    # format like "1080p", "720p SD", or 1080.
+                    # NOTE(review): compat_str() is presumably there because
+                    # the label may be a non-string value -- confirm.
+                    height = int_or_none(self._search_regex(
+                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
+                        'height', default=None))
+                a_format = {
+                    'url': source_url,
+                    'width': int_or_none(source.get('width')),
+                    'height': height,
+                    'tbr': int_or_none(source.get('bitrate')),
+                    'ext': ext,
+                }
+                if source_url.startswith('rtmp'):
+                    a_format['ext'] = 'flv'
+                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+                    # of jwplayer.flash.swf
+                    # Split at the first 'mp4:' / 'mp3:' / 'flv:' marker; the
+                    # part before it becomes the URL, marker plus remainder
+                    # becomes the play path.
+                    rtmp_url_parts = re.split(
+                        r'((?:mp4|mp3|flv):)', source_url, 1)
+                    if len(rtmp_url_parts) == 3:
+                        rtmp_url, prefix, play_path = rtmp_url_parts
+                        a_format.update({
+                            'url': rtmp_url,
+                            'play_path': prefix + play_path,
+                        })
+                    # Applied last, so rtmp_params can override keys set above.
+                    if rtmp_params:
+                        a_format.update(rtmp_params)
+                formats.append(a_format)
+        return formats
+
      def _live_title(self, name):
          """ Generate the title for a live video """
          now = datetime.datetime.now()
      def _live_title(self, name):
          """ Generate the title for a live video """
          now = datetime.datetime.now()