gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube_dl/extractor/orf.py
[srgssr] Extend _VALID_URL (closes #26555, closes #26556, closes #26578)
[youtube-dl.git] / youtube_dl / extractor / orf.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_str
8 from ..utils import (
9     clean_html,
10     determine_ext,
11     float_or_none,
12     HEADRequest,
13     int_or_none,
14     orderedSet,
15     remove_end,
16     str_or_none,
17     strip_jsonp,
18     unescapeHTML,
19     unified_strdate,
20     url_or_none,
21 )
22
23
class ORFTVthekIE(InfoExtractor):
    """Extractor for ORF TVthek pages (tvthek.orf.at).

    The page carries a JSON playlist in the ``data-jsb`` attribute of its
    ``VideoPlaylist`` element. Every video listed there that has both an id
    and a title becomes one entry of the returned playlist.
    """

    IE_NAME = 'orf:tvthek'
    IE_DESC = 'ORF TVthek'
    _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
        'playlist': [{
            'md5': '2942210346ed779588f428a92db88712',
            'info_dict': {
                'id': '8896777',
                'ext': 'mp4',
                'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
                'description': 'md5:c1272f0245537812d4e36419c207b67d',
                'duration': 2668,
                'upload_date': '20141208',
            },
        }],
        'skip': 'Blocked outside of Austria / Germany',
    }, {
        'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
        'info_dict': {
            'id': '7982259',
            'ext': 'mp4',
            'title': 'Best of Ingrid Thurnher',
            'upload_date': '20140527',
            'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
        },
        'params': {
            'skip_download': True,  # rtsp downloads
        },
        'skip': 'Blocked outside of Austria / Germany',
    }, {
        'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141',
        'only_matching': True,
    }, {
        'url': 'http://tvthek.orf.at/profile/Universum/35429',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist dict with one video entry per playlist item.

        The playlist id is the numeric id matched from ``url``.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # The attribute value is HTML-escaped JSON, hence unescapeHTML.
        data_jsb = self._parse_json(
            self._search_regex(
                r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2',
                webpage, 'playlist', group='json'),
            playlist_id, transform_source=unescapeHTML)['playlist']['videos']

        entries = []
        for sd in data_jsb:
            video_id, title = sd.get('id'), sd.get('title')
            # Items lacking an id or a title are skipped.
            if not video_id or not title:
                continue
            video_id = compat_str(video_id)
            formats = []
            for fd in sd['sources']:
                src = url_or_none(fd.get('src'))
                if not src:
                    continue
                # Build the format id from whichever descriptors are present.
                format_id_list = []
                for key in ('delivery', 'quality', 'quality_string'):
                    value = fd.get(key)
                    if value:
                        format_id_list.append(value)
                format_id = '-'.join(format_id_list)
                ext = determine_ext(src)
                if ext == 'm3u8':
                    m3u8_formats = self._extract_m3u8_formats(
                        src, video_id, 'mp4', m3u8_id=format_id, fatal=False)
                    # A variant URL containing '/geoprotection' is treated
                    # as a geo block.
                    if any('/geoprotection' in f['url'] for f in m3u8_formats):
                        self.raise_geo_restricted()
                    formats.extend(m3u8_formats)
                elif ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        src, video_id, f4m_id=format_id, fatal=False))
                else:
                    formats.append({
                        'format_id': format_id,
                        'url': src,
                        'protocol': fd.get('protocol'),
                    })

            # Check for geoblocking.
            # There is a property is_geoprotection, but that's always false
            geo_str = sd.get('geoprotection_string')
            if geo_str:
                try:
                    http_url = next(
                        f['url']
                        for f in formats
                        if re.match(r'^https?://.*\.mp4$', f['url']))
                except StopIteration:
                    # No progressive HTTP mp4 format to probe.
                    pass
                else:
                    # Non-fatal HEAD probe; a failure only reports errnote.
                    req = HEADRequest(http_url)
                    self._request_webpage(
                        req, video_id,
                        note='Testing for geoblocking',
                        errnote=((
                            'This video seems to be blocked outside of %s. '
                            'You may want to try the streaming-* formats.')
                            % geo_str),
                        fatal=False)

            self._check_formats(formats, video_id)
            self._sort_formats(formats)

            subtitles = {}
            # 'subtitles' may be present but null in the JSON; `or []` keeps
            # the loop from iterating over None.
            for sub in sd.get('subtitles') or []:
                sub_src = sub.get('src')
                if not sub_src:
                    continue
                subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
                    'url': sub_src,
                })

            upload_date = unified_strdate(sd.get('created_date'))
            entries.append({
                '_type': 'video',
                'id': video_id,
                'title': title,
                'formats': formats,
                'subtitles': subtitles,
                'description': sd.get('description'),
                'duration': int_or_none(sd.get('duration_in_seconds')),
                'upload_date': upload_date,
                'thumbnail': sd.get('image_full_url'),
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': playlist_id,
        }
160
161
class ORFRadioIE(InfoExtractor):
    """Shared implementation for the ORF radio ``/player/<date>/<show>`` pages.

    Subclasses must define:
      * ``_VALID_URL`` with named groups ``date`` and ``show``
      * ``_API_STATION`` - station key used in the audioapi.orf.at URL
      * ``_LOOP_STATION`` - ``channel`` value used in the loopstream URL
    """

    def _real_extract(self, url):
        """Return a playlist with one mp3 entry per stream of the broadcast."""
        mobj = re.match(self._VALID_URL, url)
        show_date = mobj.group('date')
        show_id = mobj.group('show')

        data = self._download_json(
            'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s'
            % (self._API_STATION, show_id, show_date), show_id)

        # Broadcast-level metadata is the same for every stream, so it is
        # computed once instead of once per loop iteration.
        title = str_or_none(data.get('title'))
        description = clean_html(data.get('subtitle'))
        series = data.get('programTitle')

        entries = []
        for info in data['streams']:
            loop_stream_id = str_or_none(info.get('loopStreamId'))
            # Without a stream id or a title no entry is produced.
            if not loop_stream_id or not title:
                continue
            # 'start'/'end' are divided by 1000 (scale) before use.
            start = int_or_none(info.get('start'), scale=1000)
            end = int_or_none(info.get('end'), scale=1000)
            duration = end - start if end and start else None
            entries.append({
                'id': loop_stream_id.replace('.mp3', ''),
                'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
                'title': title,
                'description': description,
                'duration': duration,
                'timestamp': start,
                'ext': 'mp3',
                'series': series,
            })

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': data.get('title'),
            'description': description,
            'entries': entries,
        }
201
202
class ORFFM4IE(ORFRadioIE):
    """ORF radio player extractor for fm4.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'fm4'
    _LOOP_STATION = 'fm4'

    IE_NAME = 'orf:fm4'
    IE_DESC = 'radio FM4'
    # Unlike the other stations, the show code must start with '4'.
    _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)'

    _TEST = {
        'url': 'http://fm4.orf.at/player/20170107/4CC',
        'md5': '2b0be47375432a7ef104453432a19212',
        'info_dict': {
            'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
            'ext': 'mp3',
            'title': 'Solid Steel Radioshow',
            'description': 'Die Mixshow von Coldcut und Ninja Tune.',
            'duration': 3599,
            'timestamp': 1483819257,
            'upload_date': '20170107',
        },
        'skip': 'Shows from ORF radios are only available for 7 days.',
        'only_matching': True,
    }
225
226
class ORFNOEIE(ORFRadioIE):
    """ORF radio player extractor for noe.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'noe'
    _LOOP_STATION = 'oe2n'

    IE_NAME = 'orf:noe'
    IE_DESC = 'Radio Niederösterreich'
    _VALID_URL = r'https?://(?P<station>noe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://noe.orf.at/player/20200423/NGM',
        'only_matching': True,
    }
238
239
class ORFWIEIE(ORFRadioIE):
    """ORF radio player extractor for wien.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'wie'
    _LOOP_STATION = 'oe2w'

    IE_NAME = 'orf:wien'
    IE_DESC = 'Radio Wien'
    _VALID_URL = r'https?://(?P<station>wien)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://wien.orf.at/player/20200423/WGUM',
        'only_matching': True,
    }
251
252
class ORFBGLIE(ORFRadioIE):
    """ORF radio player extractor for burgenland.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'bgl'
    _LOOP_STATION = 'oe2b'

    IE_NAME = 'orf:burgenland'
    IE_DESC = 'Radio Burgenland'
    _VALID_URL = r'https?://(?P<station>burgenland)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://burgenland.orf.at/player/20200423/BGM',
        'only_matching': True,
    }
264
265
class ORFOOEIE(ORFRadioIE):
    """ORF radio player extractor for ooe.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'ooe'
    _LOOP_STATION = 'oe2o'

    IE_NAME = 'orf:oberoesterreich'
    IE_DESC = 'Radio Oberösterreich'
    _VALID_URL = r'https?://(?P<station>ooe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://ooe.orf.at/player/20200423/OGMO',
        'only_matching': True,
    }
277
278
class ORFSTMIE(ORFRadioIE):
    """ORF radio player extractor for steiermark.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'stm'
    _LOOP_STATION = 'oe2st'

    IE_NAME = 'orf:steiermark'
    IE_DESC = 'Radio Steiermark'
    _VALID_URL = r'https?://(?P<station>steiermark)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://steiermark.orf.at/player/20200423/STGMS',
        'only_matching': True,
    }
290
291
class ORFKTNIE(ORFRadioIE):
    """ORF radio player extractor for kaernten.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'ktn'
    _LOOP_STATION = 'oe2k'

    IE_NAME = 'orf:kaernten'
    IE_DESC = 'Radio Kärnten'
    _VALID_URL = r'https?://(?P<station>kaernten)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://kaernten.orf.at/player/20200423/KGUMO',
        'only_matching': True,
    }
303
304
class ORFSBGIE(ORFRadioIE):
    """ORF radio player extractor for salzburg.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'sbg'
    _LOOP_STATION = 'oe2s'

    IE_NAME = 'orf:salzburg'
    IE_DESC = 'Radio Salzburg'
    _VALID_URL = r'https?://(?P<station>salzburg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://salzburg.orf.at/player/20200423/SGUM',
        'only_matching': True,
    }
316
317
class ORFTIRIE(ORFRadioIE):
    """ORF radio player extractor for tirol.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'tir'
    _LOOP_STATION = 'oe2t'

    IE_NAME = 'orf:tirol'
    IE_DESC = 'Radio Tirol'
    _VALID_URL = r'https?://(?P<station>tirol)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://tirol.orf.at/player/20200423/TGUMO',
        'only_matching': True,
    }
329
330
class ORFVBGIE(ORFRadioIE):
    """ORF radio player extractor for vorarlberg.orf.at."""

    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'vbg'
    _LOOP_STATION = 'oe2v'

    IE_NAME = 'orf:vorarlberg'
    IE_DESC = 'Radio Vorarlberg'
    _VALID_URL = r'https?://(?P<station>vorarlberg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://vorarlberg.orf.at/player/20200423/VGUM',
        'only_matching': True,
    }
342
343
class ORFOE3IE(ORFRadioIE):
    """ORF radio player extractor for oe3.orf.at."""

    IE_NAME = 'orf:oe3'
    # Was mojibake ('Ã–sterreich'): UTF-8 bytes of 'Ö' decoded as cp1252.
    IE_DESC = 'Radio Österreich 3'
    _VALID_URL = r'https?://(?P<station>oe3)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'oe3'
    _LOOP_STATION = 'oe3'

    # URL-matching test only; nothing is downloaded.
    _TEST = {
        'url': 'https://oe3.orf.at/player/20200424/3WEK',
        'only_matching': True,
    }
355
356
class ORFOE1IE(ORFRadioIE):
    """ORF radio player extractor for oe1.orf.at."""

    IE_NAME = 'orf:oe1'
    # Was mojibake ('Ã–sterreich'): UTF-8 bytes of 'Ö' decoded as cp1252.
    IE_DESC = 'Radio Österreich 1'
    _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    # Station keys read by ORFRadioIE._real_extract.
    _API_STATION = 'oe1'
    _LOOP_STATION = 'oe1'

    _TEST = {
        'url': 'http://oe1.orf.at/player/20170108/456544',
        'md5': '34d8a6e67ea888293741c86a099b745b',
        'info_dict': {
            'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
            'ext': 'mp3',
            'title': 'Morgenjournal',
            'duration': 609,
            'timestamp': 1483858796,
            'upload_date': '20170108',
        },
        'skip': 'Shows from ORF radios are only available for 7 days.',
    }
377
378
class ORFIPTVIE(InfoExtractor):
    """Extractor for single-video story pages on iptv.orf.at."""

    IE_NAME = 'orf:iptv'
    IE_DESC = 'iptv.ORF.at'
    _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'

    _TEST = {
        'url': 'http://iptv.orf.at/stories/2275236/',
        'md5': 'c8b22af4718a4b4af58342529453e3e5',
        'info_dict': {
            'id': '350612',
            'ext': 'flv',
            'title': 'Weitere Evakuierungen um Vulkan Calbuco',
            'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
            'duration': 68.197,
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20150425',
        },
    }

    def _real_extract(self, url):
        """Resolve the story page to its video and return the info dict."""
        story_id = self._match_id(url)

        # Always fetch the canonical story URL, not the URL as given
        # (which may be the '#/' variant accepted by _VALID_URL).
        story_url = 'http://iptv.orf.at/stories/%s' % story_id
        webpage = self._download_webpage(story_url, story_id)

        video_id = self._search_regex(
            r'data-video(?:id)?="(\d+)"', webpage, 'video id')

        # The API answers with a list; only its first element is used.
        api_response = self._download_json(
            'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
            video_id)
        data = api_response[0]

        duration = float_or_none(data['duration'], 1000)

        video = data['sources']['default']
        load_balancer_url = video['loadBalancerUrl']

        # Properties from the source description, applied to the rtmp format.
        base_format = {
            'abr': int_or_none(video.get('audioBitrate')),
            'vbr': int_or_none(video.get('bitrate')),
            'fps': int_or_none(video.get('videoFps')),
            'width': int_or_none(video.get('videoWidth')),
            'height': int_or_none(video.get('videoHeight')),
        }
        thumbnail = video.get('preview')

        # The load balancer response is JSONP, hence strip_jsonp.
        rendition = self._download_json(
            load_balancer_url, video_id, transform_source=strip_jsonp)

        formats = []
        for format_id, format_url in rendition['redirect'].items():
            if format_id == 'rtmp':
                rtmp_format = base_format.copy()
                rtmp_format['url'] = format_url
                rtmp_format['format_id'] = format_id
                formats.append(rtmp_format)
                continue
            # Anything that is neither rtmp, f4m nor m3u8 is ignored.
            ext = determine_ext(format_url)
            if ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, f4m_id=format_id))
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', m3u8_id=format_id))
        self._sort_formats(formats)

        og_title = self._og_search_title(webpage)
        title = remove_end(og_title, ' - iptv.ORF.at')
        description = self._og_search_description(webpage)
        date_meta = self._html_search_meta('dc.date', webpage, 'upload date')
        upload_date = unified_strdate(date_meta)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'formats': formats,
        }
466
467
class ORFFM4StoryIE(InfoExtractor):
    """Extractor for fm4.orf.at story pages, which may embed several videos.

    Every distinct ``data-video``/``data-videoid`` value found in the page
    becomes one entry of the returned playlist.
    """

    IE_NAME = 'orf:fm4:story'
    IE_DESC = 'fm4.orf.at stories'
    _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)'

    _TEST = {
        'url': 'http://fm4.orf.at/stories/2865738/',
        'playlist': [{
            'md5': 'e1c2c706c45c7b34cf478bbf409907ca',
            'info_dict': {
                'id': '547792',
                'ext': 'flv',
                'title': 'Manu Delago und Inner Tongue live',
                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
                'duration': 1748.52,
                'thumbnail': r're:^https?://.*\.jpg$',
                'upload_date': '20170913',
            },
        }, {
            'md5': 'c6dd2179731f86f4f55a7b49899d515f',
            'info_dict': {
                'id': '547798',
                'ext': 'flv',
                'title': 'Manu Delago und Inner Tongue live (2)',
                'duration': 1504.08,
                'thumbnail': r're:^https?://.*\.jpg$',
                'upload_date': '20170913',
                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
            },
        }],
    }

    def _real_extract(self, url):
        """Return a playlist (id = story id) of all videos on the story page."""
        story_id = self._match_id(url)
        webpage = self._download_webpage(url, story_id)

        entries = []
        # orderedSet drops repeated ids while keeping page order.
        all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage))
        for idx, video_id in enumerate(all_ids):
            # The API answers with a list; only its first element is used.
            data = self._download_json(
                'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
                video_id)[0]

            duration = float_or_none(data['duration'], 1000)

            video = data['sources']['q8c']
            load_balancer_url = video['loadBalancerUrl']
            abr = int_or_none(video.get('audioBitrate'))
            vbr = int_or_none(video.get('bitrate'))
            fps = int_or_none(video.get('videoFps'))
            width = int_or_none(video.get('videoWidth'))
            height = int_or_none(video.get('videoHeight'))
            thumbnail = video.get('preview')

            # The load balancer response is JSONP, hence strip_jsonp.
            rendition = self._download_json(
                load_balancer_url, video_id, transform_source=strip_jsonp)

            # Source-level properties, applied to the rtmp format only.
            f = {
                'abr': abr,
                'vbr': vbr,
                'fps': fps,
                'width': width,
                'height': height,
            }

            formats = []
            for format_id, format_url in rendition['redirect'].items():
                if format_id == 'rtmp':
                    ff = f.copy()
                    ff.update({
                        'url': format_url,
                        'format_id': format_id,
                    })
                    formats.append(ff)
                    continue
                # Anything that is neither rtmp, f4m nor m3u8 is ignored.
                ext = determine_ext(format_url)
                if ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        format_url, video_id, f4m_id=format_id))
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4', m3u8_id=format_id))
            self._sort_formats(formats)

            title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at')
            if idx >= 1:
                # Titles are duplicates, make them unique
                title = '%s (%d)' % (title, idx + 1)
            description = self._og_search_description(webpage)
            upload_date = unified_strdate(self._html_search_meta(
                'dc.date', webpage, 'upload date'))

            entries.append({
                'id': video_id,
                'title': title,
                'description': description,
                'duration': duration,
                'thumbnail': thumbnail,
                'upload_date': upload_date,
                'formats': formats,
            })

        # Give the playlist an id, as the other playlist extractors in this
        # file do; previously it was returned without one.
        return self.playlist_result(entries, playlist_id=story_id)