Merge remote-tracking branch 'sehrgut/Grooveshark'

[youtube-dl.git] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index bcb0765940df39656be9f78c1ab144976adb5e5e..62b1da25ee111748fc1e99ada617163274bcee69 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -16,6 +16,7 @@ from ..utils import (
  
      ExtractorError,
      HEADRequest,
+    orderedSet,
      parse_xml,
      smuggle_url,
      unescapeHTML,
@@ -289,6 +290,22 @@ class GenericIE(InfoExtractor):
                  'description': 'Mario\'s life in the fast lane has never looked so good.',
              },
          },
+        # YouTube embed via <data-video-url="">
+        {
+            'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
+            'info_dict': {
+                'id': 'jpSGZsgga_I',
+                'ext': 'mp4',
+                'title': 'Asphalt 8: Airborne - Launch Trailer',
+                'uploader': 'Gameloft',
+                'uploader_id': 'gameloft',
+                'upload_date': '20130821',
+                'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a',
+            },
+            'params': {
+                'skip_download': True,
+            }
+        }
      ]
  
      def report_download_webpage(self, video_id):
@@ -479,6 +496,12 @@ class GenericIE(InfoExtractor):
          video_uploader = self._search_regex(
              r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
  
+        # Helper: map each regex match to a URL with getter(m), wrap it via self.url_result(url, ie), pass the results through orderedSet, and return one playlist keyed by the enclosing video_id/video_title
+        def _playlist_from_matches(matches, getter, ie=None):
+            urlrs = orderedSet(self.url_result(getter(m), ie) for m in matches)
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
+
          # Look for BrightCove:
          bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
          if bc_urls:
@@ -514,6 +537,7 @@ class GenericIE(InfoExtractor):
          matches = re.findall(r'''(?x)
              (?:
                  <iframe[^>]+?src=|
+                data-video-url=|
                  <embed[^>]+?src=|
                  embedSWF\(?:\s*
              )
@@ -522,19 +546,15 @@ class GenericIE(InfoExtractor):
                  (?:embed|v)/.+?)
              \1''', webpage)
          if matches:
-            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
-                     for tuppl in matches]
-            return self.playlist_result(
-                urlrs, playlist_id=video_id, playlist_title=video_title)
+            return _playlist_from_matches(
+                matches, lambda m: unescapeHTML(m[1]), ie='Youtube')
  
          # Look for embedded Dailymotion player
          matches = re.findall(
              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
          if matches:
-            urlrs = [self.url_result(unescapeHTML(tuppl[1]))
-                     for tuppl in matches]
-            return self.playlist_result(
-                urlrs, playlist_id=video_id, playlist_title=video_title)
+            return _playlist_from_matches(
+                matches, lambda m: unescapeHTML(m[1]))
  
          # Look for embedded Wistia player
          match = re.search(
@@ -648,10 +668,8 @@ class GenericIE(InfoExtractor):
          # Look for funnyordie embed
          matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
          if matches:
-            urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie')
-                     for eurl in matches]
-            return self.playlist_result(
-                urlrs, playlist_id=video_id, playlist_title=video_title)
+            return _playlist_from_matches(
+                matches, getter=unescapeHTML, ie='FunnyOrDie')
  
          # Look for embedded RUTV player
          rutv_url = RUTVIE._extract_url(webpage)
@@ -706,6 +724,20 @@ class GenericIE(InfoExtractor):
              url = unescapeHTML(mobj.group('url'))
              return self.url_result(url, ie='MTVServicesEmbedded')
  
+        # Look for embedded yahoo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Yahoo')
+
+        # Look for embedded sbs.com.au player (the "www." host prefix is optional)
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/single/.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'SBS')
+
          # Start with something easy: JW Player in SWFObject
          found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
          if not found: