youtube_dl/extractor/ted.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import re
   5
   6 from .subtitles import SubtitlesInfoExtractor
   7
   8 from ..compat import (
   9     compat_str,
  10 )
  11
  12
  13 class TEDIE(SubtitlesInfoExtractor):
  14     _VALID_URL = r'''(?x)
  15         (?P<proto>https?://)
  16         (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
  17         (
  18             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
  19             |
  20             ((?P<type_talk>talks)) # We have a simple talk
  21             |
  22             (?P<type_watch>watch)/[^/]+/[^/]+
  23         )
  24         (/lang/(.*?))? # The url may contain the language
  25         /(?P<name>[\w-]+) # Here goes the name and then ".html"
  26         .*)$
  27         '''
  28     _TESTS = [{
  29         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  30         'md5': 'fc94ac279feebbce69f21c0c6ee82810',
  31         'info_dict': {
  32             'id': '102',
  33             'ext': 'mp4',
  34             'title': 'The illusion of consciousness',
  35             'description': ('Philosopher Dan Dennett makes a compelling '
  36                             'argument that not only don\'t we understand our own '
  37                             'consciousness, but that half the time our brains are '
  38                             'actively fooling us.'),
  39             'uploader': 'Dan Dennett',
  40             'width': 854,
  41             'duration': 1308,
  42         }
  43     }, {
  44         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
  45         'md5': '226f4fb9c62380d11b7995efa4c87994',
  46         'info_dict': {
  47             'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
  48             'ext': 'mp4',
  49             'title': 'Vishal Sikka: The beauty and power of algorithms',
  50             'thumbnail': 're:^https?://.+\.jpg',
  51             'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
  52         }
  53     }, {
  54         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  55         'info_dict': {
  56             'id': '1972',
  57             'ext': 'mp4',
  58             'title': 'Be passionate. Be courageous. Be your best.',
  59             'uploader': 'Gabby Giffords and Mark Kelly',
  60             'description': 'md5:5174aed4d0f16021b704120360f72b92',
  61             'duration': 1128,
  62         },
  63     }, {
  64         'url': 'http://www.ted.com/playlists/who_are_the_hackers',
  65         'info_dict': {
  66             'id': '10',
  67             'title': 'Who are the hackers?',
  68         },
  69         'playlist_mincount': 6,
  70     }, {
  71         # contains a youtube video
  72         'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
  73         'add_ie': ['Youtube'],
  74         'info_dict': {
  75             'id': '_ZG8HBuDjgc',
  76             'ext': 'mp4',
  77             'title': 'Douglas Adams: Parrots the Universe and Everything',
  78             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
  79             'uploader': 'University of California Television (UCTV)',
  80             'uploader_id': 'UCtelevision',
  81             'upload_date': '20080522',
  82         },
  83         'params': {
  84             'skip_download': True,
  85         },
  86     }, {
  87         # YouTube video
  88         'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
  89         'add_ie': ['Youtube'],
  90         'info_dict': {
  91             'id': 'aFBIPO-P7LM',
  92             'ext': 'mp4',
  93             'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
  94             'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
  95             'uploader': 'TEDx Talks',
  96             'uploader_id': 'TEDxTalks',
  97             'upload_date': '20111216',
  98         },
  99         'params': {
 100             'skip_download': True,
 101         },
 102     }]
 103
 104     _NATIVE_FORMATS = {
 105         'low': {'preference': 1, 'width': 320, 'height': 180},
 106         'medium': {'preference': 2, 'width': 512, 'height': 288},
 107         'high': {'preference': 3, 'width': 854, 'height': 480},
 108     }
 109
 110     def _extract_info(self, webpage):
 111         info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
 112                                        webpage, 'info json')
 113         return json.loads(info_json)
 114
 115     def _real_extract(self, url):
 116         m = re.match(self._VALID_URL, url, re.VERBOSE)
 117         if m.group('type').startswith('embed'):
 118             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
 119             return self.url_result(desktop_url, 'TED')
 120         name = m.group('name')
 121         if m.group('type_talk'):
 122             return self._talk_info(url, name)
 123         elif m.group('type_watch'):
 124             return self._watch_info(url, name)
 125         else:
 126             return self._playlist_videos_info(url, name)
 127
 128     def _playlist_videos_info(self, url, name):
 129         '''Returns the videos of the playlist'''
 130
 131         webpage = self._download_webpage(url, name,
 132                                          'Downloading playlist webpage')
 133         info = self._extract_info(webpage)
 134         playlist_info = info['playlist']
 135
 136         playlist_entries = [
 137             self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
 138             for talk in info['talks']
 139         ]
 140         return self.playlist_result(
 141             playlist_entries,
 142             playlist_id=compat_str(playlist_info['id']),
 143             playlist_title=playlist_info['title'])
 144
 145     def _talk_info(self, url, video_name):
 146         webpage = self._download_webpage(url, video_name)
 147         self.report_extraction(video_name)
 148
 149         talk_info = self._extract_info(webpage)['talks'][0]
 150
 151         external = talk_info.get('external')
 152         if external:
 153             service = external['service']
 154             self.to_screen('Found video from %s' % service)
 155             ext_url = None
 156             if service.lower() == 'youtube':
 157                 ext_url = external.get('code')
 158             return {
 159                 '_type': 'url',
 160                 'url': ext_url or external['uri'],
 161             }
 162
 163         formats = [{
 164             'url': format_url,
 165             'format_id': format_id,
 166             'format': format_id,
 167         } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
 168         if formats:
 169             for f in formats:
 170                 finfo = self._NATIVE_FORMATS.get(f['format_id'])
 171                 if finfo:
 172                     f.update(finfo)
 173         else:
 174             # Use rtmp downloads
 175             formats = [{
 176                 'format_id': f['name'],
 177                 'url': talk_info['streamer'],
 178                 'play_path': f['file'],
 179                 'ext': 'flv',
 180                 'width': f['width'],
 181                 'height': f['height'],
 182                 'tbr': f['bitrate'],
 183             } for f in talk_info['resources']['rtmp']]
 184         self._sort_formats(formats)
 185
 186         video_id = compat_str(talk_info['id'])
 187         # subtitles
 188         video_subtitles = self.extract_subtitles(video_id, talk_info)
 189         if self._downloader.params.get('listsubtitles', False):
 190             self._list_available_subtitles(video_id, talk_info)
 191             return
 192
 193         thumbnail = talk_info['thumb']
 194         if not thumbnail.startswith('http'):
 195             thumbnail = 'http://' + thumbnail
 196         return {
 197             'id': video_id,
 198             'title': talk_info['title'].strip(),
 199             'uploader': talk_info['speaker'],
 200             'thumbnail': thumbnail,
 201             'description': self._og_search_description(webpage),
 202             'subtitles': video_subtitles,
 203             'formats': formats,
 204             'duration': talk_info.get('duration'),
 205         }
 206
 207     def _get_available_subtitles(self, video_id, talk_info):
 208         languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
 209         if languages:
 210             sub_lang_list = {}
 211             for l in languages:
 212                 url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
 213                 sub_lang_list[l] = url
 214             return sub_lang_list
 215         else:
 216             self._downloader.report_warning('video doesn\'t have subtitles')
 217             return {}
 218
 219     def _watch_info(self, url, name):
 220         webpage = self._download_webpage(url, name)
 221
 222         config_json = self._html_search_regex(
 223             r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
 224             webpage, 'config')
 225         config = json.loads(config_json)['config']
 226         video_url = config['video']['url']
 227         thumbnail = config.get('image', {}).get('url')
 228
 229         title = self._html_search_regex(
 230             r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
 231         description = self._html_search_regex(
 232             [
 233                 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
 234                 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
 235             ],
 236             webpage, 'description', fatal=False)
 237
 238         return {
 239             'id': name,
 240             'url': video_url,
 241             'title': title,
 242             'thumbnail': thumbnail,
 243             'description': description,
 244         }