youtube_dl/extractor/liveleak.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..utils import (
   8     ExtractorError,
   9 )
  10
  11
  12 class LiveLeakIE(InfoExtractor):
  13     _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
  14     _TESTS = [{
  15         'url': 'http://www.liveleak.com/view?i=757_1364311680',
  16         'file': '757_1364311680.mp4',
  17         'md5': '0813c2430bea7a46bf13acf3406992f4',
  18         'info_dict': {
  19             'description': 'extremely bad day for this guy..!',
  20             'uploader': 'ljfriel2',
  21             'title': 'Most unlucky car accident'
  22         }
  23     },
  24     {
  25         'url': 'http://www.liveleak.com/view?i=f93_1390833151',
  26         'file': 'f93_1390833151.mp4',
  27         'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
  28         'info_dict': {
  29             'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
  30             'uploader': 'ARD_Stinkt',
  31             'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
  32         }
  33     }]
  34
  35     def _real_extract(self, url):
  36         mobj = re.match(self._VALID_URL, url)
  37
  38         video_id = mobj.group('video_id')
  39         webpage = self._download_webpage(url, video_id)
  40         sources_raw = self._search_regex(
  41             r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
  42         if sources_raw is None:
  43             sources_raw = '[{ %s}]' % (
  44                 self._search_regex(r'(file: ".*?"),', webpage, 'video URL'))
  45
  46         sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
  47         sources = json.loads(sources_json)
  48
  49         formats = [{
  50             'format_note': s.get('label'),
  51             'url': s['file'],
  52         } for s in sources]
  53         self._sort_formats(formats)
  54
  55         video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
  56         video_description = self._og_search_description(webpage)
  57         video_uploader = self._html_search_regex(
  58             r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
  59
  60         return {
  61             'id': video_id,
  62             'title': video_title,
  63             'description': video_description,
  64             'uploader': video_uploader,
  65             'formats': formats,
  66         }