youtube_dl/extractor/closertotruth.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7
   8
   9 class CloserToTruthIE(InfoExtractor):
  10     _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  11     _TESTS = [{
  12         'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
  13         'info_dict': {
  14             'id': '0_zof1ktre',
  15             'display_id': 'solutions-the-mind-body-problem',
  16             'ext': 'mov',
  17             'title': 'Solutions to the Mind-Body Problem?',
  18             'upload_date': '20140221',
  19             'timestamp': 1392956007,
  20             'uploader_id': 'CTTXML'
  21         },
  22         'params': {
  23             'skip_download': True,
  24         },
  25     }, {
  26         'url': 'http://closertotruth.com/episodes/how-do-brains-work',
  27         'info_dict': {
  28             'id': '0_iuxai6g6',
  29             'display_id': 'how-do-brains-work',
  30             'ext': 'mov',
  31             'title': 'How do Brains Work?',
  32             'upload_date': '20140221',
  33             'timestamp': 1392956024,
  34             'uploader_id': 'CTTXML'
  35         },
  36         'params': {
  37             'skip_download': True,
  38         },
  39     }, {
  40         'url': 'http://closertotruth.com/interviews/1725',
  41         'info_dict': {
  42             'id': '1725',
  43             'title': 'AyaFr-002',
  44         },
  45         'playlist_mincount': 2,
  46     }]
  47
  48     def _real_extract(self, url):
  49         display_id = self._match_id(url)
  50
  51         webpage = self._download_webpage(url, display_id)
  52
  53         partner_id = self._search_regex(
  54             r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
  55             webpage, 'kaltura partner_id')
  56
  57         title = self._search_regex(
  58             r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
  59
  60         select = self._search_regex(
  61             r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
  62             webpage, 'select version', default=None)
  63         if select:
  64             entry_ids = set()
  65             entries = []
  66             for mobj in re.finditer(
  67                     r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
  68                     webpage):
  69                 entry_id = mobj.group('id')
  70                 if entry_id in entry_ids:
  71                     continue
  72                 entry_ids.add(entry_id)
  73                 entries.append({
  74                     '_type': 'url_transparent',
  75                     'url': 'kaltura:%s:%s' % (partner_id, entry_id),
  76                     'ie_key': 'Kaltura',
  77                     'title': mobj.group('title'),
  78                 })
  79             if entries:
  80                 return self.playlist_result(entries, display_id, title)
  81
  82         entry_id = self._search_regex(
  83             r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
  84             webpage, 'kaltura entry_id', group='id')
  85
  86         return {
  87             '_type': 'url_transparent',
  88             'display_id': display_id,
  89             'url': 'kaltura:%s:%s' % (partner_id, entry_id),
  90             'ie_key': 'Kaltura',
  91             'title': title
  92         }