youtube_dl/extractor/ted.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import re
   5
   6 from .common import InfoExtractor
   7
   8 from ..compat import compat_str
   9 from ..utils import int_or_none
  10
  11
  12 class TEDIE(InfoExtractor):
  13     _VALID_URL = r'''(?x)
  14         (?P<proto>https?://)
  15         (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
  16         (
  17             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
  18             |
  19             ((?P<type_talk>talks)) # We have a simple talk
  20             |
  21             (?P<type_watch>watch)/[^/]+/[^/]+
  22         )
  23         (/lang/(.*?))? # The url may contain the language
  24         /(?P<name>[\w-]+) # Here goes the name and then ".html"
  25         .*)$
  26         '''
  27     _TESTS = [{
  28         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  29         'md5': 'fc94ac279feebbce69f21c0c6ee82810',
  30         'info_dict': {
  31             'id': '102',
  32             'ext': 'mp4',
  33             'title': 'The illusion of consciousness',
  34             'description': ('Philosopher Dan Dennett makes a compelling '
  35                             'argument that not only don\'t we understand our own '
  36                             'consciousness, but that half the time our brains are '
  37                             'actively fooling us.'),
  38             'uploader': 'Dan Dennett',
  39             'width': 854,
  40             'duration': 1308,
  41         }
  42     }, {
  43         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
  44         'md5': '226f4fb9c62380d11b7995efa4c87994',
  45         'info_dict': {
  46             'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
  47             'ext': 'mp4',
  48             'title': 'Vishal Sikka: The beauty and power of algorithms',
  49             'thumbnail': 're:^https?://.+\.jpg',
  50             'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
  51         }
  52     }, {
  53         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  54         'info_dict': {
  55             'id': '1972',
  56             'ext': 'mp4',
  57             'title': 'Be passionate. Be courageous. Be your best.',
  58             'uploader': 'Gabby Giffords and Mark Kelly',
  59             'description': 'md5:5174aed4d0f16021b704120360f72b92',
  60             'duration': 1128,
  61         },
  62     }, {
  63         'url': 'http://www.ted.com/playlists/who_are_the_hackers',
  64         'info_dict': {
  65             'id': '10',
  66             'title': 'Who are the hackers?',
  67         },
  68         'playlist_mincount': 6,
  69     }, {
  70         # contains a youtube video
  71         'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
  72         'add_ie': ['Youtube'],
  73         'info_dict': {
  74             'id': '_ZG8HBuDjgc',
  75             'ext': 'mp4',
  76             'title': 'Douglas Adams: Parrots the Universe and Everything',
  77             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
  78             'uploader': 'University of California Television (UCTV)',
  79             'uploader_id': 'UCtelevision',
  80             'upload_date': '20080522',
  81         },
  82         'params': {
  83             'skip_download': True,
  84         },
  85     }, {
  86         # YouTube video
  87         'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
  88         'add_ie': ['Youtube'],
  89         'info_dict': {
  90             'id': 'aFBIPO-P7LM',
  91             'ext': 'mp4',
  92             'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
  93             'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
  94             'uploader': 'TEDx Talks',
  95             'uploader_id': 'TEDxTalks',
  96             'upload_date': '20111216',
  97         },
  98         'params': {
  99             'skip_download': True,
 100         },
 101     }]
 102
 103     _NATIVE_FORMATS = {
 104         'low': {'preference': 1, 'width': 320, 'height': 180},
 105         'medium': {'preference': 2, 'width': 512, 'height': 288},
 106         'high': {'preference': 3, 'width': 854, 'height': 480},
 107     }
 108
 109     def _extract_info(self, webpage):
 110         info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
 111                                        webpage, 'info json')
 112         return json.loads(info_json)
 113
 114     def _real_extract(self, url):
 115         m = re.match(self._VALID_URL, url, re.VERBOSE)
 116         if m.group('type').startswith('embed'):
 117             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
 118             return self.url_result(desktop_url, 'TED')
 119         name = m.group('name')
 120         if m.group('type_talk'):
 121             return self._talk_info(url, name)
 122         elif m.group('type_watch'):
 123             return self._watch_info(url, name)
 124         else:
 125             return self._playlist_videos_info(url, name)
 126
 127     def _playlist_videos_info(self, url, name):
 128         '''Returns the videos of the playlist'''
 129
 130         webpage = self._download_webpage(url, name,
 131                                          'Downloading playlist webpage')
 132         info = self._extract_info(webpage)
 133         playlist_info = info['playlist']
 134
 135         playlist_entries = [
 136             self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
 137             for talk in info['talks']
 138         ]
 139         return self.playlist_result(
 140             playlist_entries,
 141             playlist_id=compat_str(playlist_info['id']),
 142             playlist_title=playlist_info['title'])
 143
 144     def _talk_info(self, url, video_name):
 145         webpage = self._download_webpage(url, video_name)
 146         self.report_extraction(video_name)
 147
 148         talk_info = self._extract_info(webpage)['talks'][0]
 149
 150         external = talk_info.get('external')
 151         if external:
 152             service = external['service']
 153             self.to_screen('Found video from %s' % service)
 154             ext_url = None
 155             if service.lower() == 'youtube':
 156                 ext_url = external.get('code')
 157             return {
 158                 '_type': 'url',
 159                 'url': ext_url or external['uri'],
 160             }
 161
 162         formats = [{
 163             'url': format_url,
 164             'format_id': format_id,
 165             'format': format_id,
 166         } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
 167         if formats:
 168             for f in formats:
 169                 finfo = self._NATIVE_FORMATS.get(f['format_id'])
 170                 if finfo:
 171                     f.update(finfo)
 172
 173         for format_id, resources in talk_info['resources'].items():
 174             if format_id == 'h264':
 175                 for resource in resources:
 176                     bitrate = int_or_none(resource.get('bitrate'))
 177                     formats.append({
 178                         'url': resource['file'],
 179                         'format_id': '%s-%sk' % (format_id, bitrate),
 180                         'tbr': bitrate,
 181                     })
 182             elif format_id == 'rtmp':
 183                 streamer = talk_info.get('streamer')
 184                 if not streamer:
 185                     continue
 186                 for resource in resources:
 187                     formats.append({
 188                         'format_id': '%s-%s' % (format_id, resource.get('name')),
 189                         'url': streamer,
 190                         'play_path': resource['file'],
 191                         'ext': 'flv',
 192                         'width': int_or_none(resource.get('width')),
 193                         'height': int_or_none(resource.get('height')),
 194                         'tbr': int_or_none(resource.get('bitrate')),
 195                     })
 196             elif format_id == 'hls':
 197                 hls_formats = self._extract_m3u8_formats(
 198                     resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
 199                 for f in hls_formats:
 200                     f['acodec'] = 'none'
 201                 formats.extend(hls_formats)
 202
 203         audio_download = talk_info.get('audioDownload')
 204         if audio_download:
 205             formats.append({
 206                 'url': audio_download,
 207                 'format_id': 'audio',
 208                 'vcodec': 'none',
 209             })
 210
 211         self._sort_formats(formats)
 212
 213         video_id = compat_str(talk_info['id'])
 214
 215         thumbnail = talk_info['thumb']
 216         if not thumbnail.startswith('http'):
 217             thumbnail = 'http://' + thumbnail
 218         return {
 219             'id': video_id,
 220             'title': talk_info['title'].strip(),
 221             'uploader': talk_info['speaker'],
 222             'thumbnail': thumbnail,
 223             'description': self._og_search_description(webpage),
 224             'subtitles': self._get_subtitles(video_id, talk_info),
 225             'formats': formats,
 226             'duration': talk_info.get('duration'),
 227         }
 228
 229     def _get_subtitles(self, video_id, talk_info):
 230         languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
 231         if languages:
 232             sub_lang_list = {}
 233             for l in languages:
 234                 sub_lang_list[l] = [
 235                     {
 236                         'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
 237                         'ext': ext,
 238                     }
 239                     for ext in ['ted', 'srt']
 240                 ]
 241             return sub_lang_list
 242         else:
 243             return {}
 244
 245     def _watch_info(self, url, name):
 246         webpage = self._download_webpage(url, name)
 247
 248         config_json = self._html_search_regex(
 249             r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
 250             webpage, 'config')
 251         config = json.loads(config_json)['config']
 252         video_url = config['video']['url']
 253         thumbnail = config.get('image', {}).get('url')
 254
 255         title = self._html_search_regex(
 256             r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
 257         description = self._html_search_regex(
 258             [
 259                 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
 260                 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
 261             ],
 262             webpage, 'description', fatal=False)
 263
 264         return {
 265             'id': name,
 266             'url': video_url,
 267             'title': title,
 268             'thumbnail': thumbnail,
 269             'description': description,
 270         }