[generic] Add a test case for brightcove embed

[youtube-dl.git] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index b18e734c4492832948e87f821a5377ec9d49026a..ea4009b415301502dd50393901075f0a7cc2b5c1 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -47,6 +47,7 @@ from .senateisvp import SenateISVPIE
  from .svt import SVTIE
  from .pornhub import PornHubIE
  from .xhamster import XHamsterEmbedIE
+from .tnaflix import TNAFlixNetworkEmbedIE
  from .vimeo import VimeoIE
  from .dailymotion import DailymotionCloudIE
  from .onionstudios import OnionStudiosIE
@@ -58,6 +59,7 @@ from .videomore import VideomoreIE
  from .googledrive import GoogleDriveIE
  from .jwplatform import JWPlatformIE
  from .digiteka import DigitekaIE
+from .instagram import InstagramIE
  
  
  class GenericIE(InfoExtractor):
@@ -224,6 +226,49 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
          },
+        # MPD from http://dash-mse-test.appspot.com/media.html
+        {
+            'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd',
+            'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53',
+            'info_dict': {
+                'id': 'car-20120827-manifest',
+                'ext': 'mp4',
+                'title': 'car-20120827-manifest',
+                'formats': 'mincount:9',
+            },
+            'params': {
+                'format': 'bestvideo',
+            },
+        },
+        # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8
+        {
+            'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8',
+            'info_dict': {
+                'id': 'content',
+                'ext': 'mp4',
+                'title': 'content',
+                'formats': 'mincount:8',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            }
+        },
+        # m3u8 served with Content-Type: text/plain
+        {
+            'url': 'http://www.nacentapps.com/m3u8/index.m3u8',
+            'info_dict': {
+                'id': 'index',
+                'ext': 'mp4',
+                'title': 'index',
+                'upload_date': '20140720',
+                'formats': 'mincount:11',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            }
+        },
          # google redirect
          {
              'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@@ -1079,7 +1124,23 @@ class GenericIE(InfoExtractor):
                  # m3u8 downloads
                  'skip_download': True,
              }
-        }
+        },
+        # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions'
+        # This video can't be played in browsers if Flash is disabled and the UA is set to iPhone, which is actually a false alarm
+        {
+            'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html',
+            'info_dict': {
+                'id': '4785848093001',
+                'ext': 'mp4',
+                'title': 'The Cardinal Pell Interview',
+                'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ',
+                'uploader': 'GlobeCast Australia - GlobeStream',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+        },
      ]
  
      def report_following_redirect(self, new_url):
@@ -1227,23 +1288,30 @@ class GenericIE(InfoExtractor):
              full_response = self._request_webpage(request, video_id)
              head_response = full_response
  
+        info_dict = {
+            'id': video_id,
+            'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+            'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
+        }
+
          # Check for direct link to a video
-        content_type = head_response.headers.get('Content-Type', '')
-        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+        content_type = head_response.headers.get('Content-Type', '').lower()
+        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
          if m:
-            upload_date = unified_strdate(
-                head_response.headers.get('Last-Modified'))
-            return {
-                'id': video_id,
-                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
-                'direct': True,
-                'formats': [{
+            format_id = m.group('format_id')
+            if format_id.endswith('mpegurl'):
+                formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+            elif format_id == 'f4m':
+                formats = self._extract_f4m_formats(url, video_id)
+            else:
+                formats = [{
                      'format_id': m.group('format_id'),
                      'url': url,
                      'vcodec': 'none' if m.group('type') == 'audio' else None
-                }],
-                'upload_date': upload_date,
-            }
+                }]
+                info_dict['direct'] = True
+            info_dict['formats'] = formats
+            return info_dict
  
          if not self._downloader.params.get('test', False) and not is_intentional:
              force = self._downloader.params.get('force_generic_extractor', False)
@@ -1263,28 +1331,30 @@ class GenericIE(InfoExtractor):
              request.add_header('Accept-Encoding', '*')
              full_response = self._request_webpage(request, video_id)
  
+        first_bytes = full_response.read(512)
+
+        # Is it an M3U playlist?
+        if first_bytes.startswith(b'#EXTM3U'):
+            info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4')
+            return info_dict
+
          # Maybe it's a direct link to a video?
          # Be careful not to download the whole thing!
-        first_bytes = full_response.read(512)
          if not is_html(first_bytes):
              self._downloader.report_warning(
                  'URL could be a direct video link, returning it as such.')
-            upload_date = unified_strdate(
-                head_response.headers.get('Last-Modified'))
-            return {
-                'id': video_id,
-                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+            info_dict.update({
                  'direct': True,
                  'url': url,
-                'upload_date': upload_date,
-            }
+            })
+            return info_dict
  
          webpage = self._webpage_read_content(
              full_response, url, video_id, prefix=first_bytes)
  
          self.report_extraction(video_id)
  
-        # Is it an RSS feed, a SMIL file or a XSPF playlist?
+        # Is it an RSS feed, a SMIL file, an XSPF playlist or an MPD manifest?
          try:
              doc = compat_etree_fromstring(webpage.encode('utf-8'))
              if doc.tag == 'rss':
@@ -1293,6 +1363,13 @@ class GenericIE(InfoExtractor):
                  return self._parse_smil(doc, url, video_id)
              elif doc.tag == '{http://xspf.org/ns/0/}playlist':
                  return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
+            elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
+                info_dict['formats'] = self._parse_mpd_formats(
+                    doc, video_id, mpd_base_url=url.rpartition('/')[0])
+                return info_dict
+            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
+                info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+                return info_dict
          except compat_xml_parse_error:
              pass
  
@@ -1402,7 +1479,7 @@ class GenericIE(InfoExtractor):
  
          # Look for embedded Dailymotion player
          matches = re.findall(
-            r'<(?:embed|iframe)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
          if matches:
              return _playlist_from_matches(
                  matches, lambda m: unescapeHTML(m[1]))
@@ -1547,6 +1624,11 @@ class GenericIE(InfoExtractor):
          if mobj is not None:
              return self.url_result(mobj.group('url'), 'VK')
  
+        # Look for embedded Odnoklassniki player
+        mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Odnoklassniki')
+
          # Look for embedded ivi player
          mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
          if mobj is not None:
@@ -1602,6 +1684,11 @@ class GenericIE(InfoExtractor):
          if xhamster_urls:
              return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
  
+        # Look for embedded TNAFlixNetwork player
+        tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
+        if tnaflix_urls:
+            return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
+
          # Look for embedded Tvigle player
          mobj = re.search(
              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -1839,6 +1926,19 @@ class GenericIE(InfoExtractor):
                  self._proto_relative_url(unescapeHTML(mobj.group(1))),
                  'AdobeTVVideo')
  
+        # Look for Vine embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine')
+
+        # Look for Instagram embeds
+        instagram_embed_url = InstagramIE._extract_embed_url(webpage)
+        if instagram_embed_url is not None:
+            return self.url_result(instagram_embed_url, InstagramIE.ie_key())
+
          def check_video(vurl):
              if YoutubeIE.suitable(vurl):
                  return True
@@ -1946,6 +2046,10 @@ class GenericIE(InfoExtractor):
                  return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
              elif ext == 'm3u8':
                  entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
+            elif ext == 'mpd':
+                entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
+            elif ext == 'f4m':
+                entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
              else:
                  entry_info_dict['url'] = video_url