youtube_dl/extractor/ted.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import re
   5
   6 from .subtitles import SubtitlesInfoExtractor
   7
   8 from ..utils import (
   9     compat_str,
  10 )
  11
  12
  13 class TEDIE(SubtitlesInfoExtractor):
  14     _VALID_URL = r'''(?x)
  15         (?P<proto>https?://)
  16         (?P<type>www|embed)(?P<urlmain>\.ted\.com/
  17         (
  18             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
  19             |
  20             ((?P<type_talk>talks)) # We have a simple talk
  21             |
  22             (?P<type_watch>watch)/[^/]+/[^/]+
  23         )
  24         (/lang/(.*?))? # The url may contain the language
  25         /(?P<name>[\w-]+) # Here goes the name and then ".html"
  26         .*)$
  27         '''
  28     _TESTS = [{
  29         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  30         'md5': 'fc94ac279feebbce69f21c0c6ee82810',
  31         'info_dict': {
  32             'id': '102',
  33             'ext': 'mp4',
  34             'title': 'The illusion of consciousness',
  35             'description': ('Philosopher Dan Dennett makes a compelling '
  36                 'argument that not only don\'t we understand our own '
  37                 'consciousness, but that half the time our brains are '
  38                 'actively fooling us.'),
  39             'uploader': 'Dan Dennett',
  40             'width': 854,
  41         }
  42     }, {
  43         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
  44         'md5': '226f4fb9c62380d11b7995efa4c87994',
  45         'info_dict': {
  46             'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
  47             'ext': 'mp4',
  48             'title': 'Vishal Sikka: The beauty and power of algorithms',
  49             'thumbnail': 're:^https?://.+\.jpg',
  50             'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
  51         }
  52     }, {
  53         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  54         'md5': '49144e345a899b8cb34d315f3b9cfeeb',
  55         'info_dict': {
  56             'id': '1972',
  57             'ext': 'mp4',
  58             'title': 'Be passionate. Be courageous. Be your best.',
  59             'uploader': 'Gabby Giffords and Mark Kelly',
  60             'description': 'md5:5174aed4d0f16021b704120360f72b92',
  61         },
  62     }]
  63
  64     _NATIVE_FORMATS = {
  65         'low': {'preference': 1, 'width': 320, 'height': 180},
  66         'medium': {'preference': 2, 'width': 512, 'height': 288},
  67         'high': {'preference': 3, 'width': 854, 'height': 480},
  68     }
  69
  70     def _extract_info(self, webpage):
  71         info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
  72             webpage, 'info json')
  73         return json.loads(info_json)
  74
  75     def _real_extract(self, url):
  76         m = re.match(self._VALID_URL, url, re.VERBOSE)
  77         if m.group('type') == 'embed':
  78             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
  79             return self.url_result(desktop_url, 'TED')
  80         name = m.group('name')
  81         if m.group('type_talk'):
  82             return self._talk_info(url, name)
  83         elif m.group('type_watch'):
  84             return self._watch_info(url, name)
  85         else:
  86             return self._playlist_videos_info(url, name)
  87
  88     def _playlist_videos_info(self, url, name):
  89         '''Returns the videos of the playlist'''
  90
  91         webpage = self._download_webpage(url, name,
  92             'Downloading playlist webpage')
  93         info = self._extract_info(webpage)
  94         playlist_info = info['playlist']
  95
  96         playlist_entries = [
  97             self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
  98             for talk in info['talks']
  99         ]
 100         return self.playlist_result(
 101             playlist_entries,
 102             playlist_id=compat_str(playlist_info['id']),
 103             playlist_title=playlist_info['title'])
 104
 105     def _talk_info(self, url, video_name):
 106         webpage = self._download_webpage(url, video_name)
 107         self.report_extraction(video_name)
 108
 109         talk_info = self._extract_info(webpage)['talks'][0]
 110
 111         formats = [{
 112             'url': format_url,
 113             'format_id': format_id,
 114             'format': format_id,
 115         } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
 116         if formats:
 117             for f in formats:
 118                 finfo = self._NATIVE_FORMATS.get(f['format_id'])
 119                 if finfo:
 120                     f.update(finfo)
 121         else:
 122             # Use rtmp downloads
 123             formats = [{
 124                 'format_id': f['name'],
 125                 'url': talk_info['streamer'],
 126                 'play_path': f['file'],
 127                 'ext': 'flv',
 128                 'width': f['width'],
 129                 'height': f['height'],
 130                 'tbr': f['bitrate'],
 131             } for f in talk_info['resources']['rtmp']]
 132         self._sort_formats(formats)
 133
 134         video_id = compat_str(talk_info['id'])
 135         # subtitles
 136         video_subtitles = self.extract_subtitles(video_id, talk_info)
 137         if self._downloader.params.get('listsubtitles', False):
 138             self._list_available_subtitles(video_id, talk_info)
 139             return
 140
 141         thumbnail = talk_info['thumb']
 142         if not thumbnail.startswith('http'):
 143             thumbnail = 'http://' + thumbnail
 144         return {
 145             'id': video_id,
 146             'title': talk_info['title'],
 147             'uploader': talk_info['speaker'],
 148             'thumbnail': thumbnail,
 149             'description': self._og_search_description(webpage),
 150             'subtitles': video_subtitles,
 151             'formats': formats,
 152         }
 153
 154     def _get_available_subtitles(self, video_id, talk_info):
 155         languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
 156         if languages:
 157             sub_lang_list = {}
 158             for l in languages:
 159                 url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
 160                 sub_lang_list[l] = url
 161             return sub_lang_list
 162         else:
 163             self._downloader.report_warning('video doesn\'t have subtitles')
 164             return {}
 165
 166     def _watch_info(self, url, name):
 167         webpage = self._download_webpage(url, name)
 168
 169         config_json = self._html_search_regex(
 170             r"data-config='([^']+)", webpage, 'config')
 171         config = json.loads(config_json)
 172         video_url = config['video']['url']
 173         thumbnail = config.get('image', {}).get('url')
 174
 175         title = self._html_search_regex(
 176             r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
 177         description = self._html_search_regex(
 178             [
 179                 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
 180                 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
 181             ],
 182             webpage, 'description', fatal=False)
 183
 184         return {
 185             'id': name,
 186             'url': video_url,
 187             'title': title,
 188             'thumbnail': thumbnail,
 189             'description': description,
 190         }