youtube_dl/extractor/min20.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5
   6
   7 class Min20IE(InfoExtractor):
   8     _VALID_URL = r'http://www\.20min\.ch/.+?-(?P<id>[0-9]+)$'
   9     _TEST = {
  10         'url': 'http://www.20min.ch/schweiz/news/story/-Wir-muessen-mutig-nach-vorne-schauen--22050469',
  11         'md5': 'cd4cbb99b94130cff423e967cd275e5e',
  12         'info_dict': {
  13             'id': '22050469',
  14             'ext': 'flv',
  15             'title': '«Wir müssen mutig nach vorne schauen»',
  16             'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.',
  17             'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg'
  18         }
  19     }
  20
  21     # location of the flv videos, can't be extracted from the web page
  22     _BASE_URL = "http://flv-rr.20min-tv.ch/videos/"
  23
  24     def _real_extract(self, url):
  25         video_id = self._match_id(url)
  26         webpage = self._download_webpage(url, video_id)
  27         title = self._html_search_regex(r'<h1><span>(.+?)</span></h1>', webpage, 'title')
  28         flash_id = self._search_regex(r"so\.addVariable\(\"file1\",\"([0-9]+)\"\)", webpage, 'flash_id')
  29
  30         description = self._html_search_regex(r'<meta name="description" content="(.+?)" />', webpage, 'description')
  31         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)" />', webpage, 'thumbnail')
  32         url = self._BASE_URL + flash_id + "m.flv"
  33
  34         return {
  35             'id': video_id,
  36             'url': url,
  37             'title': title,
  38             'description': description,
  39             'thumbnail': thumbnail
  40         }