youtube_dl/extractor/jukebox.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     RegexNotFoundError,
   9     unescapeHTML,
  10 )
  11
  12
  13 class JukeboxIE(InfoExtractor):
  14     _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html'
  15     _TEST = {
  16         'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
  17         'info_dict': {
  18             'id': 'r303r',
  19             'ext': 'flv',
  20             'title': 'Kosheen-En Vivo Pride',
  21             'uploader': 'Kosheen',
  22         },
  23     }
  24
  25     def _real_extract(self, url):
  26         video_id = self._match_id(url)
  27
  28         html = self._download_webpage(url, video_id)
  29         iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url'))
  30
  31         iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe')
  32         if re.search(r'class="jkb_waiting"', iframe_html) is not None:
  33             raise ExtractorError('Video is not available(in your country?)!')
  34
  35         self.report_extraction(video_id)
  36
  37         try:
  38             video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
  39                                            iframe_html, 'video url')
  40             video_url = unescapeHTML(video_url).replace('\/', '/')
  41         except RegexNotFoundError:
  42             youtube_url = self._search_regex(
  43                 r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"',
  44                 iframe_html, 'youtube url')
  45             youtube_url = unescapeHTML(youtube_url).replace('\/', '/')
  46             self.to_screen('Youtube video detected')
  47             return self.url_result(youtube_url, ie='Youtube')
  48
  49         title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>',
  50                                         html, 'title')
  51         artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>',
  52                                          html, 'artist')
  53
  54         return {
  55             'id': video_id,
  56             'url': video_url,
  57             'title': artist + '-' + title,
  58             'uploader': artist,
  59         }