]> gitweb @ CieloNegro.org - youtube-dl.git/blobdiff - youtube_dl/InfoExtractors.py
YoutubeSearchIE: the query is a str, in python 3 it fails if decode is called
[youtube-dl.git] / youtube_dl / InfoExtractors.py
index 105f90e2f1ee849a5e067b7af74908884f0d6af8..282334635079560723bf38562abae0032bf15b96 100755 (executable)
@@ -1424,7 +1424,6 @@ class YoutubeSearchIE(SearchInfoExtractor):
 
     def report_download_page(self, query, pagenum):
         """Report attempt to download search page with given number."""
-        query = query.decode(preferredencoding())
         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 
     def _get_n_results(self, query, n):
@@ -4091,6 +4090,64 @@ class ARDIE(InfoExtractor):
             info["url"] = stream["video_url"]
         return [info]
 
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek (www.zdf.de)."""

    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    # "play" links on the page; media_type distinguishes 'wstreaming'
    # (Windows Media / mms) from 'hstreaming' (Quicktime / rtsp).
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # A list comprehension is never None; an empty list means no media found.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now:
        # the second loop overrides the first, so 'veryhigh' wins over '300'.
        stream_ = None  # must be initialized, or the check below raises NameError
        for s in streams:        # find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:        # find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The href points at a metafile page that contains the real stream URL.
        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer the mms:// URL; fall back to rtsp:// if it is absent.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the container extension from the final stream URL.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extension')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
+
 class TumblrIE(InfoExtractor):
     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 
@@ -4487,54 +4544,87 @@ class HypemIE(InfoExtractor):
     """Information Extractor for hypem"""
     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 
-    def _real_extract(self,url):
+    def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
             raise ExtractorError(u'Invalid URL: %s' % url)
-        data = {'ax':1 ,
-                  'ts': time.time()
-              }
-        id = mobj.group(1)
+        track_id = mobj.group(1)
+
+        data = { 'ax': 1, 'ts': time.time() }
         data_encoded = compat_urllib_parse.urlencode(data)
-        complete_url = url + "?"+data_encoded
+        complete_url = url + "?" + data_encoded
         request = compat_urllib_request.Request(complete_url)
-        response,urlh = self._download_webpage_handle(request, id, u'Downloading webpage with the url')
+        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
         cookie = urlh.headers.get('Set-Cookie', '')
-        track_list = []
-        list_data = re.search(r'<script type="application/json" id="displayList-data">\n    (.*)    </script>',response)
-        html_tracks = list_data.group(1)
-        if html_tracks is None:
-            tracks = track_list
+
+        self.report_extraction(track_id)
+        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
+        if mobj is None:
+            raise ExtractorError(u'Unable to extrack tracks')
+        html_tracks = mobj.group(1).strip()
         try:
             track_list = json.loads(html_tracks)
-            tracks = track_list[u'tracks']
+            track = track_list[u'tracks'][0]
         except ValueError:
-            self.to_screen("Hypemachine contained invalid JSON.")
-            tracks =  track_list
-
-        for track in tracks:
-            key = track[u"key"]
-            id = track[u"id"]
-            artist = track[u"artist"]
-            title = track[u"song"]
-        serve_url = "http://hypem.com/serve/source/%s/%s"%(str(id), str(key))
-        self.report_extraction(id)
+            raise ExtractorError(u'Hypemachine contained invalid JSON.')
+
+        key = track[u"key"]
+        track_id = track[u"id"]
+        artist = track[u"artist"]
+        title = track[u"song"]
+
+        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
         request.add_header('cookie', cookie)
-        response = compat_urllib_request.urlopen(request)
-        song_data_json = response.read()
-        response.close()
-        (song_data_json, response) = self._download_webpage_handle(request, id, u'Downloading webpage with the url')
-        song_data = json.loads(song_data_json)
+        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
+        try:
+            song_data = json.loads(song_data_json)
+        except ValueError:
+            raise ExtractorError(u'Hypemachine contained invalid JSON.')
         final_url = song_data[u"url"]
+
         return [{
-            'id':       id,
+            'id':       track_id,
             'url':      final_url,
             'ext':      "mp3",
             'title':    title,
             'artist':   artist,
         }]
 
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play: page redirects via JavaScript; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        mobj = re.search(r'window\.location = \'(.*)\';', redirect_page)
        if mobj is None:
            # Guard the .group() call: a missing redirect would otherwise
            # surface as an AttributeError on None instead of a clean error.
            raise ExtractorError(u'Unable to extract redirect URL')
        redirect_url = urlh.geturl() + mobj.group(1)
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        title = mobj.group(1).split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like "url=<video>&thumb=<image>"; keep only the values.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
 
 def gen_extractors():
     """ Return a list of an instance of every supported extractor.
@@ -4589,6 +4679,7 @@ def gen_extractors():
         SpiegelIE(),
         LiveLeakIE(),
         ARDIE(),
+        ZDFIE(),
         TumblrIE(),
         BandcampIE(),
         RedTubeIE(),
@@ -4599,6 +4690,7 @@ def gen_extractors():
         TeamcocoIE(),
         XHamsterIE(),
         HypemIE(),
+        Vbox7IE(),
         GenericIE()
     ]