2 from __future__ import unicode_literals
6 from .common import InfoExtractor
7 from .generic import GenericIE
# Information extractor for ARD Mediathek pages; the URL pattern below accepts
# the ardmediathek.de (optionally www.) and mediathek.daserste.de hosts.
20 class ARDMediathekIE(InfoExtractor):
21 IE_NAME = 'ARD:mediathek'
# The video_id group captures either a run of digits or a path segment that
# starts with a non-digit. _real_extract also looks for a documentId query
# parameter and appears to prefer it when present.
22 _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
# Test fixtures: the first URL is flagged 'only_matching'; the second one is
# marked as skipped because the video is blocked outside of Germany.
25 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
26 'only_matching': True,
28 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
32 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
33 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
35 'skip': 'Blocked outside of Germany',
# Download the media JSON for a video and assemble metadata from it.
#
# Visible steps: fetch the JSON from media_info_url, derive the formats via
# _extract_formats, raise ExtractorError (expected=True) for pages carrying a
# "fsk" marker or JSON flagged '_geoblocked', sort the formats, then read the
# duration, preview image and subtitle URL from the JSON.
# NOTE(review): the outer condition guarding the two error branches and the
# complete returned dict are not visible in this excerpt -- confirm against
# the full file before relying on this summary.
38 def _extract_media_info(self, media_info_url, webpage, video_id):
39 media_info = self._download_json(
40 media_info_url, video_id, 'Downloading media JSON')
42 formats = self._extract_formats(media_info, video_id)
# A '"fsk"' marker in the page is reported as a time-of-day restriction.
45 if '"fsk"' in webpage:
47 'This video is only available after 20:00', expected=True)
48 elif media_info.get('_geoblocked'):
49 raise ExtractorError('This video is not available due to geo restriction', expected=True)
51 self._sort_formats(formats)
# int_or_none is applied to '_duration'; presumably it yields None when the
# key is absent -- confirm against the helper.
53 duration = int_or_none(media_info.get('_duration'))
54 thumbnail = media_info.get('_previewImage')
57 subtitle_url = media_info.get('_subtitleUrl')
67 'thumbnail': thumbnail,
69 'subtitles': subtitles,
# Build format entries from the '_mediaArray' section of the media JSON.
#
# Every media entry's '_mediaStreamArray' is walked. A stream's '_stream'
# value may be a single URL or a list of URLs, so it is wrapped in a list
# before iterating. 'num' (index of the media entry) and the stream's
# '_quality' end up in the generated format ids.
72 def _extract_formats(self, media_info, video_id):
73 type_ = media_info.get('_type')
74 media_array = media_info.get('_mediaArray', [])
76 for num, media in enumerate(media_array):
77 for stream in media.get('_mediaStreamArray', []):
78 stream_urls = stream.get('_stream')
81 if not isinstance(stream_urls, list):
82 stream_urls = [stream_urls]
83 quality = stream.get('_quality')
84 server = stream.get('_server')
85 for stream_url in stream_urls:
86 ext = determine_ext(stream_url)
# Manifest URLs are expanded through the f4m (HDS) and m3u8 (HLS) helpers.
# NOTE(review): the conditions selecting each branch are not visible in this
# excerpt; presumably they test 'ext' -- confirm against the full file.
88 formats.extend(self._extract_f4m_formats(
89 stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
90 video_id, preference=-1, f4m_id='hds'))
92 formats.extend(self._extract_m3u8_formats(
93 stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
# Non-manifest streams: when the server is RTMP the stream URL is used as the
# 'play_path'; plain HTTP URLs get a format id built from the media index,
# the extension and the quality.
95 if server and server.startswith('rtmp'):
98 'play_path': stream_url,
99 'format_id': 'a%s-rtmp-%s' % (num, quality),
101 elif stream_url.startswith('http'):
104 'format_id': 'a%s-%s-%s' % (num, ext, quality)
# Width and height are read from a trailing '_<width>x<height>.mp4' in the URL.
108 m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
111 'width': int(m.group('width')),
112 'height': int(m.group('height')),
# Extract one Mediathek video from its page URL.
#
# Visible flow: resolve the video id, download the page, raise ExtractorError
# (expected=True) for the "no longer available" and age-restriction notices,
# delegate RSS URLs to GenericIE, and otherwise collect title, description,
# thumbnail and formats -- either from stream URLs embedded in the page or
# from the JSON document under /play/media/<video_id>.
119 def _real_extract(self, url):
120 # determine video id from url
121 m = re.match(self._VALID_URL, url)
# A numeric documentId query parameter, when found, supplies the id; the
# video_id group of _VALID_URL is the other source.
# NOTE(review): the if/else choosing between the two is not visible here.
123 numid = re.search(r'documentId=([0-9]+)', url)
125 video_id = numid.group(1)
127 video_id = m.group('video_id')
129 webpage = self._download_webpage(url, video_id)
# German notice shown by the site when the requested item has been removed.
131 if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
132 raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
# Notice for content rated 12+, which is only served between 20:00 and 06:00.
# NOTE(review): the English message below says "20 pm"; "8 pm" (or "20:00")
# was probably intended. Changing it would alter a user-facing string.
134 if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
135 raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
# URLs carrying an 'rss' query parameter are parsed as XML and handed to the
# generic extractor's RSS handling instead of being scraped as HTML.
137 if re.search(r'[\?&]rss($|[=&])', url):
138 doc = parse_xml(webpage)
140 return GenericIE()._extract_rss(url, video_id, doc)
# Title: several page layouts are tried, in the order listed.
142 title = self._html_search_regex(
143 [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
144 r'<meta name="dcterms.title" content="(.*?)"/>',
145 r'<h4 class="headline">(.*?)</h4>'],
# Description: prefer the 'dcterms.abstract' meta tag, fall back to the plain
# 'description' meta tag when the former is missing.
147 description = self._html_search_meta(
148 'dcterms.abstract', webpage, 'description', default=None)
149 if description is None:
150 description = self._html_search_meta(
151 'description', webpage, 'meta description')
153 # Thumbnail is sometimes not present.
154 # It is in the mobile version, but that seems to use a different URL
155 # structure altogether.
156 thumbnail = self._og_search_thumbnail(webpage, default=None)
# Stream URLs embedded in the page through mediaCollection.addMediaStream(...)
# calls; the captured group is the fourth argument (a quoted string).
158 media_streams = re.findall(r'''(?x)
159 mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
160 "([^"]+)"''', webpage)
# presumably ranks 'lo' < 'hi' < 'hq' -- confirm against utils.qualities
163 QUALITIES = qualities(['lo', 'hi', 'hq'])
# set() drops duplicate stream URLs before they are turned into formats.
165 for furl in set(media_streams):
166 if furl.endswith('.f4m'):
# The format id is the dot-separated component right before the file
# extension; it is None when the URL does not contain two dots.
169 fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
170 fid = fid_m.group(1) if fid_m else None
172 'quality': QUALITIES(fid),
176 self._sort_formats(formats)
180 else: # request JSON file
181 info = self._extract_media_info(
182 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
187 'description': description,
188 'thumbnail': thumbnail,
# Information extractor for daserste.de video pages whose URL has the form
# ".../videos/<display_id>-<id>.html".
194 class ARDIE(InfoExtractor):
# 'mainurl' captures the URL without the ".html" suffix; _real_extract appends
# '~playerXml.xml' to it.
# NOTE(review): this pattern and the 'thumbnail' test regex below contain "\."
# inside non-raw string literals; an r'' prefix would avoid the invalid escape
# sequence warning that newer Python versions emit.
195 _VALID_URL = '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
197 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
198 'md5': 'd216c3a86493f9322545e045ddc3eb35',
200 'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
204 'title': 'Die Story im Ersten: Mission unter falscher Flagge',
205 'upload_date': '20140804',
206 'thumbnail': 're:^https?://.*\.jpg$',
# Extract a daserste.de video from the player XML that belongs to the page.
#
# The XML is fetched from '<mainurl>~playerXml.xml'. Its <video> node supplies
# the upload date (broadcastDate), thumbnail (teaser image variant URL), title
# and duration, and each <asset> element below it becomes one format.
210 def _real_extract(self, url):
211 mobj = re.match(self._VALID_URL, url)
212 display_id = mobj.group('display_id')
214 player_url = mobj.group('mainurl') + '~playerXml.xml'
215 doc = self._download_xml(player_url, display_id)
216 video_node = doc.find('./video')
217 upload_date = unified_strdate(xpath_text(
218 video_node, './broadcastDate'))
219 thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
# One format per <asset>.
# NOTE(review): a.find(...) returns None for a missing child element, so the
# chained .text accesses below would raise AttributeError; this assumes every
# asset carries all of these children -- confirm against real player XML.
222 for a in video_node.findall('.//asset'):
224 'format_id': a.attrib['type'],
225 'width': int_or_none(a.find('./frameWidth').text),
226 'height': int_or_none(a.find('./frameHeight').text),
227 'vbr': int_or_none(a.find('./bitrateVideo').text),
228 'abr': int_or_none(a.find('./bitrateAudio').text),
229 'vcodec': a.find('./codecVideo').text,
230 'tbr': int_or_none(a.find('./totalBitrate').text),
# With a non-empty serverPrefix, the prefix is the URL and fileName the
# playpath; the last assignment uses fileName as the URL itself (presumably
# the else branch, which is not visible in this excerpt).
232 if a.find('./serverPrefix').text:
233 f['url'] = a.find('./serverPrefix').text
234 f['playpath'] = a.find('./fileName').text
236 f['url'] = a.find('./fileName').text
238 self._sort_formats(formats)
# Result fields visible here: numeric id from the URL, display id, title,
# parsed duration, upload date and thumbnail.
241 'id': mobj.group('id'),
243 'display_id': display_id,
244 'title': video_node.find('./title').text,
245 'duration': parse_duration(video_node.find('./duration').text),
246 'upload_date': upload_date,
247 'thumbnail': thumbnail,