extractor/ted.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import re
   5
   6 from .subtitles import SubtitlesInfoExtractor
   7
   8 from ..utils import (
   9     compat_str,
  10 )
  11
  12
  13 class TEDIE(SubtitlesInfoExtractor):
  14     _VALID_URL = r'''(?x)
  15         (?P<proto>https?://)
  16         (?P<type>www|embed)(?P<urlmain>\.ted\.com/
  17         (
  18             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
  19             |
  20             ((?P<type_talk>talks)) # We have a simple talk
  21             |
  22             (?P<type_watch>watch)/[^/]+/[^/]+
  23         )
  24         (/lang/(.*?))? # The url may contain the language
  25         /(?P<name>[\w-]+) # Here goes the name and then ".html"
  26         .*)$
  27         '''
  28     _TESTS = [{
  29         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  30         'md5': '4ea1dada91e4174b53dac2bb8ace429d',
  31         'info_dict': {
  32             'id': '102',
  33             'ext': 'mp4',
  34             'title': 'The illusion of consciousness',
  35             'description': ('Philosopher Dan Dennett makes a compelling '
  36                 'argument that not only don\'t we understand our own '
  37                 'consciousness, but that half the time our brains are '
  38                 'actively fooling us.'),
  39             'uploader': 'Dan Dennett',
  40         }
  41     }, {
  42         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
  43         'md5': '226f4fb9c62380d11b7995efa4c87994',
  44         'info_dict': {
  45             'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
  46             'ext': 'mp4',
  47             'title': 'Vishal Sikka: The beauty and power of algorithms',
  48             'thumbnail': 're:^https?://.+\.jpg',
  49             'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
  50         }
  51     }]
  52
  53     _FORMATS_PREFERENCE = {
  54         'low': 1,
  55         'medium': 2,
  56         'high': 3,
  57     }
  58
  59     def _extract_info(self, webpage):
  60         info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
  61             webpage, 'info json')
  62         return json.loads(info_json)
  63
  64     def _real_extract(self, url):
  65         m = re.match(self._VALID_URL, url, re.VERBOSE)
  66         if m.group('type') == 'embed':
  67             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
  68             return self.url_result(desktop_url, 'TED')
  69         name = m.group('name')
  70         if m.group('type_talk'):
  71             return self._talk_info(url, name)
  72         elif m.group('type_watch'):
  73             return self._watch_info(url, name)
  74         else:
  75             return self._playlist_videos_info(url, name)
  76
  77     def _playlist_videos_info(self, url, name):
  78         '''Returns the videos of the playlist'''
  79
  80         webpage = self._download_webpage(url, name,
  81             'Downloading playlist webpage')
  82         info = self._extract_info(webpage)
  83         playlist_info = info['playlist']
  84
  85         playlist_entries = [
  86             self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
  87             for talk in info['talks']
  88         ]
  89         return self.playlist_result(
  90             playlist_entries,
  91             playlist_id=compat_str(playlist_info['id']),
  92             playlist_title=playlist_info['title'])
  93
  94     def _talk_info(self, url, video_name):
  95         webpage = self._download_webpage(url, video_name)
  96         self.report_extraction(video_name)
  97
  98         talk_info = self._extract_info(webpage)['talks'][0]
  99
 100         formats = [{
 101             'ext': 'mp4',
 102             'url': format_url,
 103             'format_id': format_id,
 104             'format': format_id,
 105             'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
 106         } for (format_id, format_url) in talk_info['nativeDownloads'].items()]
 107         self._sort_formats(formats)
 108
 109         video_id = compat_str(talk_info['id'])
 110         # subtitles
 111         video_subtitles = self.extract_subtitles(video_id, talk_info)
 112         if self._downloader.params.get('listsubtitles', False):
 113             self._list_available_subtitles(video_id, talk_info)
 114             return
 115
 116         thumbnail = talk_info['thumb']
 117         if not thumbnail.startswith('http'):
 118             thumbnail = 'http://' + thumbnail
 119         return {
 120             'id': video_id,
 121             'title': talk_info['title'],
 122             'uploader': talk_info['speaker'],
 123             'thumbnail': thumbnail,
 124             'description': self._og_search_description(webpage),
 125             'subtitles': video_subtitles,
 126             'formats': formats,
 127         }
 128
 129     def _get_available_subtitles(self, video_id, talk_info):
 130         languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
 131         if languages:
 132             sub_lang_list = {}
 133             for l in languages:
 134                 url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
 135                 sub_lang_list[l] = url
 136             return sub_lang_list
 137         else:
 138             self._downloader.report_warning(u'video doesn\'t have subtitles')
 139             return {}
 140
 141     def _watch_info(self, url, name):
 142         webpage = self._download_webpage(url, name)
 143
 144         config_json = self._html_search_regex(
 145             r"data-config='([^']+)", webpage, 'config')
 146         config = json.loads(config_json)
 147         video_url = config['video']['url']
 148         thumbnail = config.get('image', {}).get('url')
 149
 150         title = self._html_search_regex(
 151             r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
 152         description = self._html_search_regex(
 153             r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
 154             webpage, 'description', fatal=False)
 155
 156         return {
 157             'id': name,
 158             'url': video_url,
 159             'title': title,
 160             'thumbnail': thumbnail,
 161             'description': description,
 162         }