youtube_dl/extractor/ntvru.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     int_or_none,
   7     strip_or_none,
   8     unescapeHTML,
   9     xpath_text,
  10 )
  11
  12
  13 class NTVRuIE(InfoExtractor):
  14     IE_NAME = 'ntv.ru'
  15     _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  16
  17     _TESTS = [{
  18         'url': 'http://www.ntv.ru/novosti/863142/',
  19         'md5': 'ba7ea172a91cb83eb734cad18c10e723',
  20         'info_dict': {
  21             'id': '746000',
  22             'ext': 'mp4',
  23             'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
  24             'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
  25             'thumbnail': r're:^http://.*\.jpg',
  26             'duration': 136,
  27         },
  28     }, {
  29         'url': 'http://www.ntv.ru/video/novosti/750370/',
  30         'md5': 'adecff79691b4d71e25220a191477124',
  31         'info_dict': {
  32             'id': '750370',
  33             'ext': 'mp4',
  34             'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
  35             'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
  36             'thumbnail': r're:^http://.*\.jpg',
  37             'duration': 172,
  38         },
  39     }, {
  40         'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
  41         'md5': '82dbd49b38e3af1d00df16acbeab260c',
  42         'info_dict': {
  43             'id': '747480',
  44             'ext': 'mp4',
  45             'title': '«Сегодня». 21 марта 2014 года. 16:00',
  46             'description': '«Сегодня». 21 марта 2014 года. 16:00',
  47             'thumbnail': r're:^http://.*\.jpg',
  48             'duration': 1496,
  49         },
  50     }, {
  51         'url': 'https://www.ntv.ru/kino/Koma_film/m70281/o336036/video/',
  52         'md5': 'e9c7cde24d9d3eaed545911a04e6d4f4',
  53         'info_dict': {
  54             'id': '1126480',
  55             'ext': 'mp4',
  56             'title': 'Остросюжетный фильм «Кома»',
  57             'description': 'Остросюжетный фильм «Кома»',
  58             'thumbnail': r're:^http://.*\.jpg',
  59             'duration': 5592,
  60         },
  61     }, {
  62         'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
  63         'md5': '9320cd0e23f3ea59c330dc744e06ff3b',
  64         'info_dict': {
  65             'id': '751482',
  66             'ext': 'mp4',
  67             'title': '«Дело врачей»: «Деревце жизни»',
  68             'description': '«Дело врачей»: «Деревце жизни»',
  69             'thumbnail': r're:^http://.*\.jpg',
  70             'duration': 2590,
  71         },
  72     }, {
  73         # Schemeless file URL
  74         'url': 'https://www.ntv.ru/video/1797442',
  75         'only_matching': True,
  76     }]
  77
  78     _VIDEO_ID_REGEXES = [
  79         r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
  80         r'<video embed=[^>]+><id>(\d+)</id>',
  81         r'<video restriction[^>]+><key>(\d+)</key>',
  82     ]
  83
  84     def _real_extract(self, url):
  85         video_id = self._match_id(url)
  86
  87         webpage = self._download_webpage(url, video_id)
  88
  89         video_url = self._og_search_property(
  90             ('video', 'video:iframe'), webpage, default=None)
  91         if video_url:
  92             video_id = self._search_regex(
  93                 r'https?://(?:www\.)?ntv\.ru/video/(?:embed/)?(\d+)',
  94                 video_url, 'video id', default=None)
  95
  96         if not video_id:
  97             video_id = self._html_search_regex(
  98                 self._VIDEO_ID_REGEXES, webpage, 'video id')
  99
 100         player = self._download_xml(
 101             'http://www.ntv.ru/vi%s/' % video_id,
 102             video_id, 'Downloading video XML')
 103
 104         title = strip_or_none(unescapeHTML(xpath_text(player, './data/title', 'title', fatal=True)))
 105
 106         video = player.find('./data/video')
 107
 108         formats = []
 109         for format_id in ['', 'hi', 'webm']:
 110             file_ = xpath_text(video, './%sfile' % format_id)
 111             if not file_:
 112                 continue
 113             if file_.startswith('//'):
 114                 file_ = self._proto_relative_url(file_)
 115             elif not file_.startswith('http'):
 116                 file_ = 'http://media.ntv.ru/vod/' + file_
 117             formats.append({
 118                 'url': file_,
 119                 'filesize': int_or_none(xpath_text(video, './%ssize' % format_id)),
 120             })
 121         self._sort_formats(formats)
 122
 123         return {
 124             'id': xpath_text(video, './id'),
 125             'title': title,
 126             'description': strip_or_none(unescapeHTML(xpath_text(player, './data/description'))),
 127             'thumbnail': xpath_text(video, './splash'),
 128             'duration': int_or_none(xpath_text(video, './totaltime')),
 129             'view_count': int_or_none(xpath_text(video, './views')),
 130             'formats': formats,
 131         }