]> gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube_dl/extractor/franceculture.py
[generic] Do not download images as videos by accident
[youtube-dl.git] / youtube_dl / extractor / franceculture.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import json
5 import re
6
7 from .common import InfoExtractor
8 from ..utils import (
9     compat_parse_qs,
10     compat_urlparse,
11 )
12
13
class FranceCultureIE(InfoExtractor):
    """Information extractor for franceculture.fr "reecouter" player pages.

    The media URL and part of the metadata are read from the query string
    handed to the Flash loader embedded in the page; the remaining metadata
    (title, uploader, thumbnail, description) is scraped from the markup.
    """

    _VALID_URL = r'(?P<baseurl>http://(?:www\.)?franceculture\.fr/)player/reecouter\?play=(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.franceculture.fr/player/reecouter?play=4795174',
        'info_dict': {
            'id': '4795174',
            'ext': 'mp3',
            'title': 'Rendez-vous au pays des geeks',
            'vcodec': 'none',
            'uploader': 'Colette Fellous',
            'upload_date': '20140301',
            'duration': 3601,
            'thumbnail': r're:^http://www\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$',
            'description': 'Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche des « geeks », une enquête menée aux Etats-Unis dans la S ...',
        }
    }

    @staticmethod
    def _parse_info_data(params):
        """Return the first entry of the ``infoData`` JSON list in *params*.

        *params* is the parsed loader query string, mapping each name to a
        list of values.  Only optional metadata is read from this entry, so
        an empty dict is returned (instead of raising) when the parameter is
        absent, is not valid JSON, or is not a non-empty list whose first
        element is a JSON object.
        """
        raw_values = params.get('infoData')
        if not raw_values:
            return {}
        try:
            entries = json.loads(raw_values[0])
        except ValueError:
            # Malformed JSON: treat as "no extra metadata available".
            return {}
        if isinstance(entries, list) and entries and isinstance(entries[0], dict):
            return entries[0]
        return {}

    def _real_extract(self, url):
        """Build the info dict for the player page at *url*.

        The title and the loader parameters (which carry the media URL) are
        mandatory; every other field is best-effort and is set to None when
        it cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Scheme + host part of the page URL, used to resolve relative links.
        baseurl = mobj.group('baseurl')

        webpage = self._download_webpage(url, video_id)

        # The Flash loader receives its configuration as a query string.
        params_code = self._search_regex(
            r"<param name='movie' value='/sites/all/modules/rf/rf_player/swf/loader.swf\?([^']+)' />",
            webpage, 'parameter code')
        params = compat_parse_qs(params_code)
        video_url = compat_urlparse.urljoin(baseurl, params['urlAOD'][0])

        title = self._html_search_regex(
            r'<h1 class="title[^"]+">(.+?)</h1>', webpage, 'title')
        uploader = self._html_search_regex(
            r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
            webpage, 'uploader', fatal=False)

        thumbnail = None
        thumbnail_part = self._html_search_regex(
            r'(?s)<div id="emission".*?<img src="([^"]+)"', webpage,
            'thumbnail', fatal=False)
        if thumbnail_part is not None:
            thumbnail = compat_urlparse.urljoin(baseurl, thumbnail_part)

        # The description is optional metadata: a page without it must not
        # abort an otherwise successful extraction.
        description = self._html_search_regex(
            r'(?s)<p class="desc">(.*?)</p>', webpage, 'description',
            fatal=False)

        info = self._parse_info_data(params)
        duration = info.get('media_length')

        # Only accept the candidate when it is exactly eight digits.
        upload_date = None
        upload_date_candidate = info.get('media_section5')
        if (upload_date_candidate is not None and
                re.match(r'[0-9]{8}$', upload_date_candidate)):
            upload_date = upload_date_candidate

        return {
            'id': video_id,
            'url': video_url,
            # An .mp3 URL is marked as having no video codec.
            'vcodec': 'none' if video_url.lower().endswith('.mp3') else None,
            'duration': duration,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }