]> gitweb @ CieloNegro.org - youtube-dl.git/blobdiff - youtube_dl/extractor/common.py
[common] Relax JWPlayer regex and remove duplicate urls(#12768)
[youtube-dl.git] / youtube_dl / extractor / common.py
index 6d4789d9636cc6874e2979dcfcbe56839323ad7c..12e010a0dcda30e6ccc42b2b61202e7e355451d8 100644 (file)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import base64
 from __future__ import unicode_literals
 
 import base64
@@ -36,34 +37,35 @@ from ..utils import (
     clean_html,
     compiled_regex_type,
     determine_ext,
     clean_html,
     compiled_regex_type,
     determine_ext,
+    determine_protocol,
     error_to_compat_str,
     ExtractorError,
     error_to_compat_str,
     ExtractorError,
+    extract_attributes,
     fix_xml_ampersands,
     float_or_none,
     GeoRestrictedError,
     GeoUtils,
     int_or_none,
     js_to_json,
     fix_xml_ampersands,
     float_or_none,
     GeoRestrictedError,
     GeoUtils,
     int_or_none,
     js_to_json,
+    mimetype2ext,
+    orderedSet,
+    parse_codecs,
+    parse_duration,
     parse_iso8601,
     parse_iso8601,
+    parse_m3u8_attributes,
     RegexNotFoundError,
     RegexNotFoundError,
-    sanitize_filename,
     sanitized_Request,
     sanitized_Request,
+    sanitize_filename,
     unescapeHTML,
     unified_strdate,
     unified_timestamp,
     unescapeHTML,
     unified_strdate,
     unified_timestamp,
+    update_Request,
+    update_url_query,
+    urljoin,
     url_basename,
     xpath_element,
     xpath_text,
     xpath_with_ns,
     url_basename,
     xpath_element,
     xpath_text,
     xpath_with_ns,
-    determine_protocol,
-    parse_duration,
-    mimetype2ext,
-    update_Request,
-    update_url_query,
-    parse_m3u8_attributes,
-    extract_attributes,
-    parse_codecs,
-    urljoin,
 )
 
 
 )
 
 
@@ -393,7 +395,6 @@ class InfoExtractor(object):
         is selected and a random IP belonging to this country is generated. This
         IP will be passed as X-Forwarded-For HTTP header in all subsequent
         HTTP requests.
         is selected and a random IP belonging to this country is generated. This
         IP will be passed as X-Forwarded-For HTTP header in all subsequent
         HTTP requests.
-        Method does nothing if no countries are specified.
 
         This method will be used for initial geo bypass mechanism initialization
         during the instance initialization with _GEO_COUNTRIES.
 
         This method will be used for initial geo bypass mechanism initialization
         during the instance initialization with _GEO_COUNTRIES.
@@ -402,8 +403,6 @@ class InfoExtractor(object):
         information is not available beforehand (e.g. obtained during
         extraction) or due to some another reason.
         """
         information is not available beforehand (e.g. obtained during
         extraction) or due to some another reason.
         """
-        if not countries:
-            return
         if not self._x_forwarded_for_ip:
             country_code = self._downloader.params.get('geo_bypass_country', None)
             # If there is no explicit country for geo bypass specified and
         if not self._x_forwarded_for_ip:
             country_code = self._downloader.params.get('geo_bypass_country', None)
             # If there is no explicit country for geo bypass specified and
@@ -418,7 +417,8 @@ class InfoExtractor(object):
                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                 if self._downloader.params.get('verbose', False):
                     self._downloader.to_stdout(
                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                 if self._downloader.params.get('verbose', False):
                     self._downloader.to_stdout(
-                        '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
+                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
+                        % (self._x_forwarded_for_ip, country_code.upper()))
 
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
 
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
@@ -447,10 +447,12 @@ class InfoExtractor(object):
                 self._downloader.params.get('geo_bypass', True) and
                 not self._x_forwarded_for_ip and
                 countries):
                 self._downloader.params.get('geo_bypass', True) and
                 not self._x_forwarded_for_ip and
                 countries):
-            self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries))
+            country_code = random.choice(countries)
+            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
             if self._x_forwarded_for_ip:
                 self.report_warning(
             if self._x_forwarded_for_ip:
                 self.report_warning(
-                    'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
+                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
+                    % (self._x_forwarded_for_ip, country_code.upper()))
                 return True
         return False
 
                 return True
         return False
 
@@ -546,6 +548,34 @@ class InfoExtractor(object):
 
         return encoding
 
 
         return encoding
 
+    def __check_blocked(self, content):
+        first_block = content[:512]
+        if ('<title>Access to this site is blocked</title>' in content and
+                'Websense' in first_block):
+            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
+            blocked_iframe = self._html_search_regex(
+                r'<iframe src="([^"]+)"', content,
+                'Websense information URL', default=None)
+            if blocked_iframe:
+                msg += ' Visit %s for more details' % blocked_iframe
+            raise ExtractorError(msg, expected=True)
+        if '<title>The URL you requested has been blocked</title>' in first_block:
+            msg = (
+                'Access to this webpage has been blocked by Indian censorship. '
+                'Use a VPN or proxy server (with --proxy) to route around it.')
+            block_msg = self._html_search_regex(
+                r'</h1><p>(.*?)</p>',
+                content, 'block message', default=None)
+            if block_msg:
+                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+            raise ExtractorError(msg, expected=True)
+        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
+                'blocklist.rkn.gov.ru' in content):
+            raise ExtractorError(
+                'Access to this webpage has been blocked by decision of the Russian government. '
+                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
+                expected=True)
+
     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
         content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
         content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
@@ -587,25 +617,7 @@ class InfoExtractor(object):
         except LookupError:
             content = webpage_bytes.decode('utf-8', 'replace')
 
         except LookupError:
             content = webpage_bytes.decode('utf-8', 'replace')
 
-        if ('<title>Access to this site is blocked</title>' in content and
-                'Websense' in content[:512]):
-            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
-            blocked_iframe = self._html_search_regex(
-                r'<iframe src="([^"]+)"', content,
-                'Websense information URL', default=None)
-            if blocked_iframe:
-                msg += ' Visit %s for more details' % blocked_iframe
-            raise ExtractorError(msg, expected=True)
-        if '<title>The URL you requested has been blocked</title>' in content[:512]:
-            msg = (
-                'Access to this webpage has been blocked by Indian censorship. '
-                'Use a VPN or proxy server (with --proxy) to route around it.')
-            block_msg = self._html_search_regex(
-                r'</h1><p>(.*?)</p>',
-                content, 'block message', default=None)
-            if block_msg:
-                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
-            raise ExtractorError(msg, expected=True)
+        self.__check_blocked(content)
 
         return content
 
 
         return content
 
@@ -714,6 +726,13 @@ class InfoExtractor(object):
             video_info['title'] = video_title
         return video_info
 
             video_info['title'] = video_title
         return video_info
 
+    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
+        urlrs = orderedSet(
+            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
+            for m in matches)
+        return self.playlist_result(
+            urlrs, playlist_id=video_id, playlist_title=video_title)
+
     @staticmethod
     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
         """Returns a playlist"""
     @staticmethod
     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
         """Returns a playlist"""
@@ -1760,7 +1779,7 @@ class InfoExtractor(object):
                     if content_type == 'text':
                         # TODO implement WebVTT downloading
                         pass
                     if content_type == 'text':
                         # TODO implement WebVTT downloading
                         pass
-                    elif content_type == 'video' or content_type == 'audio':
+                    elif content_type in ('video', 'audio'):
                         base_url = ''
                         for element in (representation, adaptation_set, period, mpd_doc):
                             base_url_e = element.find(_add_ns('BaseURL'))
                         base_url = ''
                         for element in (representation, adaptation_set, period, mpd_doc):
                             base_url_e = element.find(_add_ns('BaseURL'))
@@ -2010,7 +2029,7 @@ class InfoExtractor(object):
                 })
         return formats
 
                 })
         return formats
 
-    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
+    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
         def absolute_url(video_url):
             return compat_urlparse.urljoin(base_url, video_url)
 
         def absolute_url(video_url):
             return compat_urlparse.urljoin(base_url, video_url)
 
@@ -2032,7 +2051,8 @@ class InfoExtractor(object):
                 is_plain_url = False
                 formats = self._extract_m3u8_formats(
                     full_url, video_id, ext='mp4',
                 is_plain_url = False
                 formats = self._extract_m3u8_formats(
                     full_url, video_id, ext='mp4',
-                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
+                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
+                    preference=preference)
             elif ext == 'mpd':
                 is_plain_url = False
                 formats = self._extract_mpd_formats(
             elif ext == 'mpd':
                 is_plain_url = False
                 formats = self._extract_mpd_formats(
@@ -2160,18 +2180,24 @@ class InfoExtractor(object):
                     })
         return formats
 
                     })
         return formats
 
-    @staticmethod
-    def _find_jwplayer_data(webpage):
+    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
         mobj = re.search(
         mobj = re.search(
-            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\).*?\.setup\s*\((?P<options>[^)]+)\)',
             webpage)
         if mobj:
             webpage)
         if mobj:
-            return mobj.group('options')
+            try:
+                jwplayer_data = self._parse_json(mobj.group('options'),
+                                                 video_id=video_id,
+                                                 transform_source=transform_source)
+            except ExtractorError:
+                pass
+            else:
+                if isinstance(jwplayer_data, dict):
+                    return jwplayer_data
 
     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
 
     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
-        jwplayer_data = self._parse_json(
-            self._find_jwplayer_data(webpage), video_id,
-            transform_source=js_to_json)
+        jwplayer_data = self._find_jwplayer_data(
+            webpage, video_id, transform_source=js_to_json)
         return self._parse_jwplayer_data(
             jwplayer_data, video_id, *args, **kwargs)
 
         return self._parse_jwplayer_data(
             jwplayer_data, video_id, *args, **kwargs)
 
@@ -2197,56 +2223,9 @@ class InfoExtractor(object):
 
             this_video_id = video_id or video_data['mediaid']
 
 
             this_video_id = video_id or video_data['mediaid']
 
-            formats = []
-            for source in video_data['sources']:
-                source_url = self._proto_relative_url(source['file'])
-                if base_url:
-                    source_url = compat_urlparse.urljoin(base_url, source_url)
-                source_type = source.get('type') or ''
-                ext = mimetype2ext(source_type) or determine_ext(source_url)
-                if source_type == 'hls' or ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
-                elif ext == 'mpd':
-                    formats.extend(self._extract_mpd_formats(
-                        source_url, this_video_id, mpd_id=mpd_id, fatal=False))
-                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
-                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
-                    formats.append({
-                        'url': source_url,
-                        'vcodec': 'none',
-                        'ext': ext,
-                    })
-                else:
-                    height = int_or_none(source.get('height'))
-                    if height is None:
-                        # Often no height is provided but there is a label in
-                        # format like 1080p.
-                        height = int_or_none(self._search_regex(
-                            r'^(\d{3,})[pP]$', source.get('label') or '',
-                            'height', default=None))
-                    a_format = {
-                        'url': source_url,
-                        'width': int_or_none(source.get('width')),
-                        'height': height,
-                        'ext': ext,
-                    }
-                    if source_url.startswith('rtmp'):
-                        a_format['ext'] = 'flv'
-
-                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
-                        # of jwplayer.flash.swf
-                        rtmp_url_parts = re.split(
-                            r'((?:mp4|mp3|flv):)', source_url, 1)
-                        if len(rtmp_url_parts) == 3:
-                            rtmp_url, prefix, play_path = rtmp_url_parts
-                            a_format.update({
-                                'url': rtmp_url,
-                                'play_path': prefix + play_path,
-                            })
-                        if rtmp_params:
-                            a_format.update(rtmp_params)
-                    formats.append(a_format)
+            formats = self._parse_jwplayer_formats(
+                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
+                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
             self._sort_formats(formats)
 
             subtitles = {}
             self._sort_formats(formats)
 
             subtitles = {}
@@ -2277,6 +2256,71 @@ class InfoExtractor(object):
         else:
             return self.playlist_result(entries)
 
         else:
             return self.playlist_result(entries)
 
+    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
+                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        urls = []
+        formats = []
+        for source in jwplayer_sources_data:
+            source_url = self._proto_relative_url(source.get('file'))
+            if not source_url:
+                continue
+            if base_url:
+                source_url = compat_urlparse.urljoin(base_url, source_url)
+            if source_url in urls:
+                continue
+            urls.append(source_url)
+            source_type = source.get('type') or ''
+            ext = mimetype2ext(source_type) or determine_ext(source_url)
+            if source_type == 'hls' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=m3u8_id, fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    source_url, video_id, mpd_id=mpd_id, fatal=False))
+            elif ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    source_url, video_id, fatal=False))
+            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
+            elif source_type.startswith('audio') or ext in (
+                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
+                formats.append({
+                    'url': source_url,
+                    'vcodec': 'none',
+                    'ext': ext,
+                })
+            else:
+                height = int_or_none(source.get('height'))
+                if height is None:
+                    # Often no height is provided but there is a label in
+                    # format like "1080p", "720p SD", or 1080.
+                    height = int_or_none(self._search_regex(
+                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
+                        'height', default=None))
+                a_format = {
+                    'url': source_url,
+                    'width': int_or_none(source.get('width')),
+                    'height': height,
+                    'tbr': int_or_none(source.get('bitrate')),
+                    'ext': ext,
+                }
+                if source_url.startswith('rtmp'):
+                    a_format['ext'] = 'flv'
+                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+                    # of jwplayer.flash.swf
+                    rtmp_url_parts = re.split(
+                        r'((?:mp4|mp3|flv):)', source_url, 1)
+                    if len(rtmp_url_parts) == 3:
+                        rtmp_url, prefix, play_path = rtmp_url_parts
+                        a_format.update({
+                            'url': rtmp_url,
+                            'play_path': prefix + play_path,
+                        })
+                    if rtmp_params:
+                        a_format.update(rtmp_params)
+                formats.append(a_format)
+        return formats
+
     def _live_title(self, name):
         """ Generate the title for a live video """
         now = datetime.datetime.now()
     def _live_title(self, name):
         """ Generate the title for a live video """
         now = datetime.datetime.now()