1 from __future__ import unicode_literals
6 from .common import InfoExtractor
8 from ..compat import compat_str
9 from ..utils import int_or_none
# Extractor class for ted.com pages. The fragments that follow belong to its
# URL pattern; _real_extract matches that pattern with re.VERBOSE and reads the
# named groups `type`, `urlmain`, `type_talk`, `type_watch` and `name` from it.
# NOTE(review): this listing elides lines, so the start of the pattern (and the
# `proto` group that _real_extract also reads) is not visible here — confirm
# against the full file.
12 class TEDIE(InfoExtractor):
15 (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
17 (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
19 ((?P<type_talk>talks)) # We have a simple talk
21 (?P<type_watch>watch)/[^/]+/[^/]+
23 (/lang/(.*?))? # The url may contain the language
24 /(?P<name>[\w-]+) # Here goes the name and then ".html"
28 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
29 'md5': 'fc94ac279feebbce69f21c0c6ee82810',
33 'title': 'The illusion of consciousness',
34 'description': ('Philosopher Dan Dennett makes a compelling '
35 'argument that not only don\'t we understand our own '
36 'consciousness, but that half the time our brains are '
37 'actively fooling us.'),
38 'uploader': 'Dan Dennett',
# Fixture for a /watch/ URL; its expected id equals the last path segment.
43 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
44 'md5': '226f4fb9c62380d11b7995efa4c87994',
46 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
48 'title': 'Vishal Sikka: The beauty and power of algorithms',
# NOTE(review): '\.' inside a non-raw string literal is an unrecognized escape
# sequence (warned about by newer Python versions); a raw string would avoid it.
49 'thumbnail': 're:^https?://.+\.jpg',
50 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
53 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
57 'title': 'Be passionate. Be courageous. Be your best.',
58 'uploader': 'Gabby Giffords and Mark Kelly',
59 'description': 'md5:5174aed4d0f16021b704120360f72b92',
# Fixture for a /playlists/ URL; presumably `playlist_mincount` is the minimum
# number of entries the test harness expects — confirm against the harness.
63 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
66 'title': 'Who are the hackers?',
68 'playlist_mincount': 6,
70 # contains a youtube video
71 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
72 'add_ie': ['Youtube'],
76 'title': 'Douglas Adams: Parrots the Universe and Everything',
77 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
78 'uploader': 'University of California Television (UCTV)',
79 'uploader_id': 'UCtelevision',
80 'upload_date': '20080522',
83 'skip_download': True,
87 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
88 'add_ie': ['Youtube'],
92 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
93 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
94 'uploader': 'TEDx Talks',
95 'uploader_id': 'TEDxTalks',
96 'upload_date': '20111216',
99 'skip_download': True,
# Preference/width/height metadata keyed by quality name. Presumably these are
# the entries of the `_NATIVE_FORMATS` mapping that _talk_info queries by
# format id — the assignment line itself is elided, so confirm.
104 'low': {'preference': 1, 'width': 320, 'height': 180},
105 'medium': {'preference': 2, 'width': 512, 'height': 288},
106 'high': {'preference': 3, 'width': 854, 'height': 480},
def _extract_info(self, webpage):
    '''Return the JSON payload embedded in a TED page as Python data.

    The payload is the second argument of a ``q("<name>.init", {...})``
    call that sits directly before a closing ``</script>`` tag.

    webpage -- HTML source of the page to scan.

    Raises ValueError if the captured text is not valid JSON. How a
    missing match is reported is decided by ``_search_regex``, which is
    defined outside this block.
    '''
    # The dot before "init" is escaped so that it only matches a literal
    # '.'; left unescaped it would accept any character in that position.
    info_json = self._search_regex(
        r'q\("\w+\.init",({.+})\)</script>', webpage, 'info json')
    return json.loads(info_json)
def _real_extract(self, url):
    '''Route *url* to the handler that matches its kind of TED page.

    Embed URLs are handed back as a reference to the equivalent ``www``
    URL for this same extractor. Talk and watch URLs go to their
    dedicated handlers; every other matching URL is treated as a playlist.
    '''
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)

    if mobj.group('type').startswith('embed'):
        # Swap the embed host prefix for "www" and keep the rest of the URL.
        desktop_url = ''.join(
            (mobj.group('proto'), 'www', mobj.group('urlmain')))
        return self.url_result(desktop_url, 'TED')

    name = mobj.group('name')
    if mobj.group('type_talk'):
        handler = self._talk_info
    elif mobj.group('type_watch'):
        handler = self._watch_info
    else:
        handler = self._playlist_videos_info
    return handler(url, name)
# Builds the playlist result for a TED playlist page: downloads `url`, decodes
# the page's embedded info with _extract_info, then reads info['playlist'] for
# the playlist id/title and info['talks'] for the individual talk slugs.
# NOTE(review): some lines of this method are elided in this listing (the
# container around the per-talk entries and the first argument passed to
# playlist_result), so the comments below cover only what is visible.
127 def _playlist_videos_info(self, url, name):
128 '''Returns the videos of the playlist'''
# `name` serves as the identifier passed along with the download request.
130 webpage = self._download_webpage(url, name,
131 'Downloading playlist webpage')
132 info = self._extract_info(webpage)
133 playlist_info = info['playlist']
# One URL reference per talk, pointing at its canonical /talks/<slug> page and
# tagged with this extractor's own key.
136 self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
137 for talk in info['talks']
# The playlist id is converted with compat_str before it is handed over.
139 return self.playlist_result(
141 playlist_id=compat_str(playlist_info['id']),
142 playlist_title=playlist_info['title'])
# Extracts a single talk: downloads the talk page, takes the first entry of the
# embedded info's 'talks' list, and assembles formats plus metadata from it.
# NOTE(review): many lines of this method are elided in this listing
# (conditions, dict openers, append calls, the final return), so the comments
# below describe only what the visible lines establish.
144 def _talk_info(self, url, video_name):
145 webpage = self._download_webpage(url, video_name)
146 self.report_extraction(video_name)
148 talk_info = self._extract_info(webpage)['talks'][0]
# Talks hosted on another service carry an 'external' entry.
150 external = talk_info.get('external')
152 service = external['service']
153 self.to_screen('Found video from %s' % service)
# For YouTube the 'code' field is used as the URL when present; otherwise the
# result falls back to external['uri'].
155 if service.lower() == 'youtube':
156 ext_url = external.get('code')
159 'url': ext_url or external['uri'],
# One format per entry in 'nativeDownloads'; entries whose URL is None are
# dropped.
164 'format_id': format_id,
166 } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
# Look up the static metadata registered for this format id, if any.
169 finfo = self._NATIVE_FORMATS.get(f['format_id'])
# Further formats come from 'resources', keyed by delivery type.
173 for format_id, resources in talk_info['resources'].items():
174 if format_id == 'h264':
175 for resource in resources:
176 bitrate = int_or_none(resource.get('bitrate'))
178 'url': resource['file'],
# NOTE(review): if int_or_none yields None for a missing bitrate, this id would
# read 'h264-Nonek' — confirm that is acceptable.
179 'format_id': '%s-%sk' % (format_id, bitrate),
182 elif format_id == 'rtmp':
# RTMP entries rely on the talk-level 'streamer' value.
183 streamer = talk_info.get('streamer')
186 for resource in resources:
188 'format_id': '%s-%s' % (format_id, resource.get('name')),
190 'play_path': resource['file'],
192 'width': int_or_none(resource.get('width')),
193 'height': int_or_none(resource.get('height')),
194 'tbr': int_or_none(resource.get('bitrate')),
196 elif format_id == 'hls':
# HLS: formats are taken from the playlist found under the 'stream' key.
197 hls_formats = self._extract_m3u8_formats(
198 resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
199 for f in hls_formats:
201 formats.extend(hls_formats)
# Optional audio download, exposed under the format id 'audio'.
203 audio_download = talk_info.get('audioDownload')
206 'url': audio_download,
207 'format_id': 'audio',
211 self._sort_formats(formats)
213 video_id = compat_str(talk_info['id'])
# Thumbnail values that do not start with 'http' get an explicit scheme.
215 thumbnail = talk_info['thumb']
216 if not thumbnail.startswith('http'):
217 thumbnail = 'http://' + thumbnail
# Metadata fields of the result; the description is read from the page itself
# (via _og_search_description) rather than from talk_info.
220 'title': talk_info['title'].strip(),
221 'uploader': talk_info['speaker'],
222 'thumbnail': thumbnail,
223 'description': self._og_search_description(webpage),
224 'subtitles': self._get_subtitles(video_id, talk_info),
226 'duration': talk_info.get('duration'),
# Builds subtitle download URLs for a talk from the language codes listed in
# talk_info['languages'] (an empty list is used when that key is absent).
# NOTE(review): most of this method's body is elided in this listing, so the
# shape of the returned value is not visible here.
229 def _get_subtitles(self, video_id, talk_info):
230 languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
# One URL per (video id, language, extension) combination.
236 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
# Two subtitle formats are requested: 'ted' and 'srt'.
239 for ext in ['ted', 'srt']
245 def _watch_info(self, url, name):
246 webpage = self._download_webpage(url, name)
248 config_json = self._html_search_regex(
249 r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
251 config = json.loads(config_json)['config']
252 video_url = config['video']['url']
253 thumbnail = config.get('image', {}).get('url')
255 title = self._html_search_regex(
256 r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
257 description = self._html_search_regex(
259 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
260 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
262 webpage, 'description', fatal=False)
268 'thumbnail': thumbnail,
269 'description': description,