youtube_dl/extractor/mitele.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import (
   8     compat_urllib_parse_urlencode,
   9     compat_urlparse,
  10 )
  11 from ..utils import (
  12     get_element_by_attribute,
  13     int_or_none,
  14     remove_start,
  15 )
  16
  17
  18 class MiTeleIE(InfoExtractor):
  19     IE_DESC = 'mitele.es'
  20     _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
  21
  22     _TESTS = [{
  23         'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
  24         # MD5 is unstable
  25         'info_dict': {
  26             'id': '0NF1jJnxS1Wu3pHrmvFyw2',
  27             'display_id': 'programa-144',
  28             'ext': 'flv',
  29             'title': 'Tor, la web invisible',
  30             'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
  31             'series': 'Diario de',
  32             'season': 'La redacción',
  33             'episode': 'Programa 144',
  34             'thumbnail': 're:(?i)^https?://.*\.jpg$',
  35             'duration': 2913,
  36         },
  37     }, {
  38         # no explicit title
  39         'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/temporada-6/programa-226/',
  40         'info_dict': {
  41             'id': 'eLZSwoEd1S3pVyUm8lc6F',
  42             'display_id': 'programa-226',
  43             'ext': 'flv',
  44             'title': 'Cuarto Milenio - Temporada 6 - Programa 226',
  45             'description': 'md5:50daf9fadefa4e62d9fc866d0c015701',
  46             'series': 'Cuarto Milenio',
  47             'season': 'Temporada 6',
  48             'episode': 'Programa 226',
  49             'thumbnail': 're:(?i)^https?://.*\.jpg$',
  50             'duration': 7312,
  51         },
  52         'params': {
  53             'skip_download': True,
  54         },
  55     }]
  56
  57     def _real_extract(self, url):
  58         display_id = self._match_id(url)
  59
  60         webpage = self._download_webpage(url, display_id)
  61
  62         config_url = self._search_regex(
  63             r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url')
  64         config_url = compat_urlparse.urljoin(url, config_url)
  65
  66         config = self._download_json(
  67             config_url, display_id, 'Downloading config JSON')
  68
  69         mmc = self._download_json(
  70             config['services']['mmc'], display_id, 'Downloading mmc JSON')
  71
  72         formats = []
  73         for location in mmc['locations']:
  74             gat = self._proto_relative_url(location.get('gat'), 'http:')
  75             bas = location.get('bas')
  76             loc = location.get('loc')
  77             ogn = location.get('ogn')
  78             if None in (gat, bas, loc, ogn):
  79                 continue
  80             token_data = {
  81                 'bas': bas,
  82                 'icd': loc,
  83                 'ogn': ogn,
  84                 'sta': '0',
  85             }
  86             media = self._download_json(
  87                 '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
  88                 display_id, 'Downloading %s JSON' % location['loc'])
  89             file_ = media.get('file')
  90             if not file_:
  91                 continue
  92             formats.extend(self._extract_f4m_formats(
  93                 file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
  94                 display_id, f4m_id=loc))
  95         self._sort_formats(formats)
  96
  97         title = self._search_regex(
  98             r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>',
  99             webpage, 'title', default=None)
 100
 101         mobj = re.search(r'''(?sx)
 102                             class="Destacado-text"[^>]*>.*?<h1>\s*
 103                             <span>(?P<series>[^<]+)</span>\s*
 104                             <span>(?P<season>[^<]+)</span>\s*
 105                             <span>(?P<episode>[^<]+)</span>''', webpage)
 106         series, season, episode = mobj.groups() if mobj else [None] * 3
 107
 108         if not title:
 109             if mobj:
 110                 title = '%s - %s - %s' % (series, season, episode)
 111             else:
 112                 title = remove_start(self._search_regex(
 113                     r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ')
 114
 115         video_id = self._search_regex(
 116             r'data-media-id\s*=\s*"([^"]+)"', webpage,
 117             'data media id', default=None) or display_id
 118         thumbnail = config.get('poster', {}).get('imageUrl')
 119         duration = int_or_none(mmc.get('duration'))
 120
 121         return {
 122             'id': video_id,
 123             'display_id': display_id,
 124             'title': title,
 125             'description': get_element_by_attribute('class', 'text', webpage),
 126             'series': series,
 127             'season': season,
 128             'episode': episode,
 129             'thumbnail': thumbnail,
 130             'duration': duration,
 131             'formats': formats,
 132         }