youtube_dl/extractor/ted.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import re
   5
   6 from .subtitles import SubtitlesInfoExtractor
   7
   8 from ..utils import (
   9     compat_str,
  10 )
  11
  12
  13 class TEDIE(SubtitlesInfoExtractor):
  14     _VALID_URL = r'''(?x)
  15         (?P<proto>https?://)
  16         (?P<type>www|embed)(?P<urlmain>\.ted\.com/
  17         (
  18             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
  19             |
  20             ((?P<type_talk>talks)) # We have a simple talk
  21             |
  22             (?P<type_watch>watch)/[^/]+/[^/]+
  23         )
  24         (/lang/(.*?))? # The url may contain the language
  25         /(?P<name>[\w-]+) # Here goes the name and then ".html"
  26         .*)$
  27         '''
  28     _TESTS = [{
  29         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  30         'md5': '4ea1dada91e4174b53dac2bb8ace429d',
  31         'info_dict': {
  32             'id': '102',
  33             'ext': 'mp4',
  34             'title': 'The illusion of consciousness',
  35             'description': ('Philosopher Dan Dennett makes a compelling '
  36                 'argument that not only don\'t we understand our own '
  37                 'consciousness, but that half the time our brains are '
  38                 'actively fooling us.'),
  39             'uploader': 'Dan Dennett',
  40             'width': 854,
  41         }
  42     }, {
  43         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
  44         'md5': '226f4fb9c62380d11b7995efa4c87994',
  45         'info_dict': {
  46             'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
  47             'ext': 'mp4',
  48             'title': 'Vishal Sikka: The beauty and power of algorithms',
  49             'thumbnail': 're:^https?://.+\.jpg',
  50             'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
  51         }
  52     }, {
  53         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  54         'info_dict': {
  55             'id': '1972',
  56             'ext': 'flv',
  57             'title': 'Be passionate. Be courageous. Be your best.',
  58             'uploader': 'Gabby Giffords and Mark Kelly',
  59             'description': 'md5:d89e1d8ebafdac8e55df4c219ecdbfe9',
  60         },
  61         'params': {
  62             # rtmp download
  63             'skip_download': True,
  64         },
  65     }]
  66
  67     _NATIVE_FORMATS = {
  68         'low': {'preference': 1, 'width': 320, 'height': 180},
  69         'medium': {'preference': 2, 'width': 512, 'height': 288},
  70         'high': {'preference': 3, 'width': 854, 'height': 480},
  71     }
  72
  73     def _extract_info(self, webpage):
  74         info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
  75             webpage, 'info json')
  76         return json.loads(info_json)
  77
  78     def _real_extract(self, url):
  79         m = re.match(self._VALID_URL, url, re.VERBOSE)
  80         if m.group('type') == 'embed':
  81             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
  82             return self.url_result(desktop_url, 'TED')
  83         name = m.group('name')
  84         if m.group('type_talk'):
  85             return self._talk_info(url, name)
  86         elif m.group('type_watch'):
  87             return self._watch_info(url, name)
  88         else:
  89             return self._playlist_videos_info(url, name)
  90
  91     def _playlist_videos_info(self, url, name):
  92         '''Returns the videos of the playlist'''
  93
  94         webpage = self._download_webpage(url, name,
  95             'Downloading playlist webpage')
  96         info = self._extract_info(webpage)
  97         playlist_info = info['playlist']
  98
  99         playlist_entries = [
 100             self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
 101             for talk in info['talks']
 102         ]
 103         return self.playlist_result(
 104             playlist_entries,
 105             playlist_id=compat_str(playlist_info['id']),
 106             playlist_title=playlist_info['title'])
 107
 108     def _talk_info(self, url, video_name):
 109         webpage = self._download_webpage(url, video_name)
 110         self.report_extraction(video_name)
 111
 112         talk_info = self._extract_info(webpage)['talks'][0]
 113
 114         formats = [{
 115             'url': format_url,
 116             'format_id': format_id,
 117             'format': format_id,
 118         } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
 119         if formats:
 120             for f in formats:
 121                 finfo = self._NATIVE_FORMATS.get(f['format_id'])
 122                 if finfo:
 123                     f.update(finfo)
 124         else:
 125             # Use rtmp downloads
 126             formats = [{
 127                 'format_id': f['name'],
 128                 'url': talk_info['streamer'],
 129                 'play_path': f['file'],
 130                 'ext': 'flv',
 131                 'width': f['width'],
 132                 'height': f['height'],
 133                 'tbr': f['bitrate'],
 134             } for f in talk_info['resources']['rtmp']]
 135         self._sort_formats(formats)
 136
 137         video_id = compat_str(talk_info['id'])
 138         # subtitles
 139         video_subtitles = self.extract_subtitles(video_id, talk_info)
 140         if self._downloader.params.get('listsubtitles', False):
 141             self._list_available_subtitles(video_id, talk_info)
 142             return
 143
 144         thumbnail = talk_info['thumb']
 145         if not thumbnail.startswith('http'):
 146             thumbnail = 'http://' + thumbnail
 147         return {
 148             'id': video_id,
 149             'title': talk_info['title'],
 150             'uploader': talk_info['speaker'],
 151             'thumbnail': thumbnail,
 152             'description': self._og_search_description(webpage),
 153             'subtitles': video_subtitles,
 154             'formats': formats,
 155         }
 156
 157     def _get_available_subtitles(self, video_id, talk_info):
 158         languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
 159         if languages:
 160             sub_lang_list = {}
 161             for l in languages:
 162                 url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
 163                 sub_lang_list[l] = url
 164             return sub_lang_list
 165         else:
 166             self._downloader.report_warning(u'video doesn\'t have subtitles')
 167             return {}
 168
 169     def _watch_info(self, url, name):
 170         webpage = self._download_webpage(url, name)
 171
 172         config_json = self._html_search_regex(
 173             r"data-config='([^']+)", webpage, 'config')
 174         config = json.loads(config_json)
 175         video_url = config['video']['url']
 176         thumbnail = config.get('image', {}).get('url')
 177
 178         title = self._html_search_regex(
 179             r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
 180         description = self._html_search_regex(
 181             r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
 182             webpage, 'description', fatal=False)
 183
 184         return {
 185             'id': name,
 186             'url': video_url,
 187             'title': title,
 188             'thumbnail': thumbnail,
 189             'description': description,
 190         }