[generic] Add tests for #13557

[youtube-dl.git] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index c108d4a8a4e9a31d6b931d39c8a6233e75be1b0d..95c38698d6404be54be3c5c378f8b2e4f350685f 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -10,6 +10,7 @@ from .common import InfoExtractor
  from .youtube import YoutubeIE
  from ..compat import (
      compat_etree_fromstring,
+    compat_str,
      compat_urllib_parse_unquote,
      compat_urlparse,
      compat_xml_parse_error,
@@ -1184,7 +1185,7 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': ['Kaltura'],
          },
-        # Eagle.Platform embed (generic URL)
+        # EaglePlatform embed (generic URL)
          {
              'url': 'http://lenta.ru/news/2015/03/06/navalny/',
              # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1198,8 +1199,26 @@ class GenericIE(InfoExtractor):
                  'view_count': int,
                  'age_limit': 0,
              },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # referrer protected EaglePlatform embed
+        {
+            'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
+            'info_dict': {
+                'id': '582306',
+                'ext': 'mp4',
+                'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 3382,
+                'view_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
          },
-        # ClipYou (Eagle.Platform) embed (custom URL)
+        # ClipYou (EaglePlatform) embed (custom URL)
          {
              'url': 'http://muz-tv.ru/play/7129/',
              # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1211,6 +1230,9 @@ class GenericIE(InfoExtractor):
                  'duration': 216,
                  'view_count': int,
              },
+            'params': {
+                'skip_download': True,
+            },
          },
          # Pladform embed
          {
@@ -1521,6 +1543,21 @@ class GenericIE(InfoExtractor):
                  'title': 'Facebook video #599637780109885',
              },
          },
+        # Facebook <iframe> embed, plugin video
+        {
+            'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/',
+            'info_dict': {
+                'id': '1754168231264132',
+                'ext': 'mp4',
+                'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...',
+                'uploader': 'Tariq Ramadan (official)',
+                'timestamp': 1496758379,
+                'upload_date': '20170606',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
          # Facebook API embed
          {
              'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
@@ -1907,14 +1944,14 @@ class GenericIE(InfoExtractor):
          content_type = head_response.headers.get('Content-Type', '').lower()
          m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
          if m:
-            format_id = m.group('format_id')
+            format_id = compat_str(m.group('format_id'))
              if format_id.endswith('mpegurl'):
                  formats = self._extract_m3u8_formats(url, video_id, 'mp4')
              elif format_id == 'f4m':
                  formats = self._extract_f4m_formats(url, video_id)
              else:
                  formats = [{
-                    'format_id': m.group('format_id'),
+                    'format_id': format_id,
                      'url': url,
                      'vcodec': 'none' if m.group('type') == 'audio' else None
                  }]
@@ -2032,6 +2069,13 @@ class GenericIE(InfoExtractor):
          video_description = self._og_search_description(webpage, default=None)
          video_thumbnail = self._og_search_thumbnail(webpage, default=None)
  
+        info_dict.update({
+            'title': video_title,
+            'description': video_description,
+            'thumbnail': video_thumbnail,
+            'age_limit': age_limit,
+        })
+
          # Look for Brightcove Legacy Studio embeds
          bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
          if bc_urls:
@@ -2221,9 +2265,9 @@ class GenericIE(InfoExtractor):
              return self.url_result(mobj.group('url'))
  
          # Look for embedded Facebook player
-        facebook_url = FacebookIE._extract_url(webpage)
-        if facebook_url is not None:
-            return self.url_result(facebook_url, 'Facebook')
+        facebook_urls = FacebookIE._extract_urls(webpage)
+        if facebook_urls:
+            return self.playlist_from_matches(facebook_urls, video_id, video_title)
  
          # Look for embedded VK player
          mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
@@ -2420,12 +2464,12 @@ class GenericIE(InfoExtractor):
          if kaltura_url:
              return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
  
-        # Look for Eagle.Platform embeds
+        # Look for EaglePlatform embeds
          eagleplatform_url = EaglePlatformIE._extract_url(webpage)
          if eagleplatform_url:
-            return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
+            return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key())
  
-        # Look for ClipYou (uses Eagle.Platform) embeds
+        # Look for ClipYou (uses EaglePlatform) embeds
          mobj = re.search(
              r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
          if mobj is not None:
@@ -2668,18 +2712,26 @@ class GenericIE(InfoExtractor):
              return self.playlist_from_matches(
                  mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
  
+        def merge_dicts(dict1, dict2):  # build a new dict; dict1's values take precedence over dict2's
+            merged = {}
+            for k, v in dict1.items():
+                if v is not None:  # None entries of dict1 are not copied
+                    merged[k] = v
+            for k, v in dict2.items():
+                if v is None:
+                    continue  # a None in dict2 never fills or overrides anything
+                if (k not in merged or  # key not provided (or only as None) by dict1
+                        (isinstance(v, compat_str) and v and
+                            isinstance(merged[k], compat_str) and
+                            not merged[k])):  # or dict1 had an empty string and dict2 has a non-empty one
+                    merged[k] = v
+            return merged  # neither input dict is modified
+
          # Looking for http://schema.org/VideoObject
          json_ld = self._search_json_ld(
              webpage, video_id, default={}, expected_type='VideoObject')
          if json_ld.get('url'):
-            info_dict.update({
-                'title': video_title or info_dict['title'],
-                'description': video_description,
-                'thumbnail': video_thumbnail,
-                'age_limit': age_limit
-            })
-            info_dict.update(json_ld)
-            return info_dict
+            return merge_dicts(json_ld, info_dict)
  
          # Look for HTML5 media
          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
@@ -2697,9 +2749,7 @@ class GenericIE(InfoExtractor):
          if jwplayer_data:
              info = self._parse_jwplayer_data(
                  jwplayer_data, video_id, require_title=False, base_url=url)
-            if not info.get('title'):
-                info['title'] = video_title
-            return info
+            return merge_dicts(info, info_dict)
  
          def check_video(vurl):
              if YoutubeIE.suitable(vurl):