youtube_dl/extractor/tass.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import json
   5
   6 from .common import InfoExtractor
   7 from ..utils import (
   8     js_to_json,
   9     qualities,
  10 )
  11
  12
  13 class TassIE(InfoExtractor):
  14     _VALID_URL = r'https?://(?:tass\.ru|itar-tass\.com)/[^/]+/(?P<id>\d+)'
  15     _TESTS = [
  16         {
  17             'url': 'http://tass.ru/obschestvo/1586870',
  18             'md5': '3b4cdd011bc59174596b6145cda474a4',
  19             'info_dict': {
  20                 'id': '1586870',
  21                 'ext': 'mp4',
  22                 'title': 'Посетителям московского зоопарка показали красную панду',
  23                 'description': 'Приехавшую из Дублина Зейну можно увидеть в павильоне "Кошки тропиков"',
  24                 'thumbnail': r're:^https?://.*\.jpg$',
  25             },
  26         },
  27         {
  28             'url': 'http://itar-tass.com/obschestvo/1600009',
  29             'only_matching': True,
  30         },
  31     ]
  32
  33     def _real_extract(self, url):
  34         video_id = self._match_id(url)
  35
  36         webpage = self._download_webpage(url, video_id)
  37
  38         sources = json.loads(js_to_json(self._search_regex(
  39             r'(?s)sources\s*:\s*(\[.+?\])', webpage, 'sources')))
  40
  41         quality = qualities(['sd', 'hd'])
  42
  43         formats = []
  44         for source in sources:
  45             video_url = source.get('file')
  46             if not video_url or not video_url.startswith('http') or not video_url.endswith('.mp4'):
  47                 continue
  48             label = source.get('label')
  49             formats.append({
  50                 'url': video_url,
  51                 'format_id': label,
  52                 'quality': quality(label),
  53             })
  54         self._sort_formats(formats)
  55
  56         return {
  57             'id': video_id,
  58             'title': self._og_search_title(webpage),
  59             'description': self._og_search_description(webpage),
  60             'thumbnail': self._og_search_thumbnail(webpage),
  61             'formats': formats,
  62         }