]> gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube_dl/extractor/ceskatelevize.py
[kaltura] sanitize embed URLs
[youtube-dl.git] / youtube_dl / extractor / ceskatelevize.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import (
8     compat_urllib_parse_unquote,
9     compat_urllib_parse_urlparse,
10 )
11 from ..utils import (
12     ExtractorError,
13     float_or_none,
14     sanitized_Request,
15     unescapeHTML,
16     update_url_query,
17     urlencode_postdata,
18     USER_AGENTS,
19 )
20
21
class CeskaTelevizeIE(InfoExtractor):
    # Extractor for ceskatelevize.cz "ivysilani" pages. A page is resolved to
    # a client playlist via an AJAX endpoint; every playlist item becomes one
    # entry of the returned playlist result.
    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
    _TESTS = [{
        'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
        'info_dict': {
            'id': '61924494877246241',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace: Život v Grónsku',
            'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 3350,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
        'info_dict': {
            'id': '61924494877028507',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace: Bonus 01 - En',
            'description': 'English Subtittles',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 81.3,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # live stream
        'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
        'info_dict': {
            'id': 402,
            'ext': 'mp4',
            'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
            'is_live': True,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': 'Georestricted to Czech Republic',
    }, {
        'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
        'only_matching': True,
    }]

    def _extract_item_formats(self, item, playlist_id):
        """Collect the formats of every stream URL of one playlist item.

        URLs containing 'playerType=flash' are parsed as HLS manifests, all
        other URLs as DASH manifests. Manifest failures are non-fatal.
        """
        formats = []
        # 'streamUrls' may be absent or null; treat both as "no streams"
        for format_id, stream_url in (item.get('streamUrls') or {}).items():
            if 'playerType=flash' in stream_url:
                stream_formats = self._extract_m3u8_formats(
                    stream_url, playlist_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls-%s' % format_id, fatal=False)
            else:
                stream_formats = self._extract_mpd_formats(
                    stream_url, playlist_id,
                    mpd_id='dash-%s' % format_id, fatal=False)
            # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031
            if format_id == 'audioDescription':
                for f in stream_formats:
                    f['source_preference'] = -10
            formats.extend(stream_formats)
        return formats

    def _real_extract(self, url):
        """Resolve an ivysilani page into a playlist of its media items.

        Raises ExtractorError (expected) when the page or the playlist
        endpoint reports the content as unavailable in the caller's territory.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
        if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

        type_ = None
        episode_id = None

        # Preferred source: the JSON object passed to getPlaylistUrl([...])
        playlist = self._parse_json(
            self._search_regex(
                r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
                default='{}'), playlist_id)
        if playlist:
            type_ = playlist.get('type')
            episode_id = playlist.get('id')

        # Fallback: stricter regexes that pull each field out individually
        if not type_:
            type_ = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
                webpage, 'type')
        if not episode_id:
            episode_id = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
                webpage, 'episode_id')

        data = {
            'playlist[0][type]': type_,
            'playlist[0][id]': episode_id,
            'requestUrl': compat_urllib_parse_urlparse(url).path,
            'requestSource': 'iVysilani',
        }

        # Both values depend only on the webpage, so compute them once up
        # front. This also guarantees they are bound for the final
        # playlist_result() call even if every playlist request below fails.
        playlist_title = self._og_search_title(webpage, default=None)
        playlist_description = self._og_search_description(webpage, default=None)

        entries = []

        # The playlist is requested twice: with the default User-Agent and
        # with a Safari one. Formats from the second pass are merged into the
        # entries created by the first pass when the item counts match.
        for user_agent in (None, USER_AGENTS['Safari']):
            req = sanitized_Request(
                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
                data=urlencode_postdata(data))

            req.add_header('Content-type', 'application/x-www-form-urlencoded')
            req.add_header('x-addr', '127.0.0.1')
            req.add_header('X-Requested-With', 'XMLHttpRequest')
            if user_agent:
                req.add_header('User-Agent', user_agent)
            req.add_header('Referer', url)

            playlistpage = self._download_json(req, playlist_id, fatal=False)

            if not playlistpage:
                continue

            playlist_url = playlistpage['url']
            if playlist_url == 'error_region':
                raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

            req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
            req.add_header('Referer', url)

            playlist = self._download_json(req, playlist_id, fatal=False)
            if not playlist:
                continue

            playlist = playlist.get('playlist')
            if not isinstance(playlist, list):
                continue

            playlist_len = len(playlist)

            for num, item in enumerate(playlist):
                is_live = item.get('type') == 'LIVE'
                formats = self._extract_item_formats(item, playlist_id)

                # Second pass over an already known playlist: only add the
                # extra formats to the matching existing entry.
                if user_agent and len(entries) == playlist_len:
                    entries[num]['formats'].extend(formats)
                    continue

                item_id = item.get('id') or item['assetId']
                title = item['title']

                duration = float_or_none(item.get('duration'))
                thumbnail = item.get('previewImageUrl')

                subtitles = {}
                if item.get('type') == 'VOD':
                    subs = item.get('subtitles')
                    if subs:
                        subtitles = self.extract_subtitles(episode_id, subs)

                if playlist_len == 1:
                    final_title = playlist_title or title
                    if is_live:
                        final_title = self._live_title(final_title)
                elif playlist_title:
                    final_title = '%s (%s)' % (playlist_title, title)
                else:
                    # No og:title on the page: avoid a literal "None (...)"
                    final_title = title

                entries.append({
                    'id': item_id,
                    'title': final_title,
                    'description': playlist_description if playlist_len == 1 else None,
                    'thumbnail': thumbnail,
                    'duration': duration,
                    'formats': formats,
                    'subtitles': subtitles,
                    'is_live': is_live,
                })

        for e in entries:
            self._sort_formats(e['formats'])

        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    def _get_subtitles(self, episode_id, subs):
        """Download the first subtitle track and return it as Czech SRT."""
        original_subtitles = self._download_webpage(
            subs[0]['url'], episode_id, 'Downloading subtitles')
        srt_subs = self._fix_subtitles(original_subtitles)
        return {
            'cs': [{
                'ext': 'srt',
                'data': srt_subs,
            }]
        }

    @staticmethod
    def _fix_subtitles(subtitles):
        """ Convert millisecond-based subtitles to SRT """

        def _msectotimecode(msec):
            """ Helper utility to convert milliseconds to timecode """
            components = []
            # Peel off milliseconds, seconds, minutes, then hours (mod 100)
            for divider in [1000, 60, 60, 100]:
                components.append(msec % divider)
                msec //= divider
            return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)

        def _fix_subtitle(subtitle):
            for line in subtitle.splitlines():
                # Cue header lines look like "<index>; <start_ms> <stop_ms>"
                m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
                if m:
                    yield m.group(1)
                    start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
                    yield '{0} --> {1}'.format(start, stop)
                else:
                    # Anything else (cue text, blank lines) passes through
                    yield line

        return '\r\n'.join(_fix_subtitle(subtitles))
237
238
class CeskaTelevizePoradyIE(InfoExtractor):
    # Extractor for ceskatelevize.cz "porady" pages: it locates the embedded
    # player URL on the page and hands it over to CeskaTelevizeIE.
    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
    _TESTS = [{
        # video with 18+ caution trailer
        'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
        'info_dict': {
            'id': '215562210900007-bogotart',
            'title': 'Queer: Bogotart',
            'description': 'Alternativní průvodce současným queer světem',
        },
        'playlist': [{
            'info_dict': {
                'id': '61924494876844842',
                'ext': 'mp4',
                'title': 'Queer: Bogotart (Varování 18+)',
                'duration': 10.2,
            },
        }, {
            'info_dict': {
                'id': '61924494877068022',
                'ext': 'mp4',
                'title': 'Queer: Bogotart (Queer)',
                'thumbnail': r're:^https?://.*\.jpg',
                'duration': 1558.3,
            },
        }],
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # iframe embed
        'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Find the player URL on the page and delegate to CeskaTelevizeIE."""
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        # Tried in order: a <span data-url=...> attribute first, then an
        # <iframe src=...> pointing at the iFramePlayer.php embed.
        player_url_patterns = (
            r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
            r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1',
        )
        raw_player_url = self._search_regex(
            player_url_patterns, page, 'iframe player url', group='url')

        # The attribute value is HTML-escaped; unescape it before adding the
        # autoStart query parameter.
        player_url = update_url_query(
            unescapeHTML(raw_player_url), query={'autoStart': 'true'})

        return self.url_result(player_url, ie=CeskaTelevizeIE.ie_key())