youtube_dl/extractor/ubu.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import int_or_none
   7
   8
   9 class UbuIE(InfoExtractor):
  10     _VALID_URL = r'http://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html'
  11     _TEST = {
  12         'url': 'http://ubu.com/film/her_noise.html',
  13         'md5': '8edd46ee8aa6b265fb5ed6cf05c36bc9',
  14         'info_dict': {
  15             'id': 'her_noise',
  16             'ext': 'mp4',
  17             'title': 'Her Noise - The Making Of (2007)',
  18             'duration': 3600,
  19         },
  20     }
  21
  22     def _real_extract(self, url):
  23         mobj = re.match(self._VALID_URL, url)
  24         video_id = mobj.group('id')
  25
  26         webpage = self._download_webpage(url, video_id)
  27
  28         title = self._html_search_regex(
  29             r'<title>.+?Film &amp; Video: ([^<]+)</title>', webpage, 'title')
  30
  31         duration = int_or_none(self._html_search_regex(
  32             r'Duration: (\d+) minutes', webpage, 'duration', fatal=False, default=None))
  33         if duration:
  34             duration *= 60
  35
  36         formats = []
  37
  38         FORMAT_REGEXES = [
  39             ['sq', r"'flashvars'\s*,\s*'file=([^']+)'"],
  40             ['hq', r'href="(http://ubumexico\.centro\.org\.mx/video/[^"]+)"']
  41         ]
  42
  43         for format_id, format_regex in FORMAT_REGEXES:
  44             m = re.search(format_regex, webpage)
  45             if m:
  46                 formats.append({
  47                     'url': m.group(1),
  48                     'format_id': format_id,
  49                 })
  50
  51         return {
  52             'id': video_id,
  53             'title': title,
  54             'duration': duration,
  55             'formats': formats,
  56         }