youtube_dl/extractor/francetv.py

   1 # encoding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5 import re
   6 import json
   7
   8 from .common import InfoExtractor
   9 from ..utils import (
  10     compat_urlparse,
  11 )
  12
  13
  14 class FranceTVBaseInfoExtractor(InfoExtractor):
  15     def _extract_video(self, video_id):
  16         info = self._download_xml(
  17             'http://www.francetvinfo.fr/appftv/webservices/video/'
  18             'getInfosOeuvre.php?id-diffusion='
  19             + video_id, video_id, 'Downloading XML config')
  20
  21         manifest_url = info.find('videos/video/url').text
  22         video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8')
  23         video_url = video_url.replace('/z/', '/i/')
  24         thumbnail_path = info.find('image').text
  25
  26         return {'id': video_id,
  27                 'ext': 'flv' if video_url.startswith('rtmp') else 'mp4',
  28                 'url': video_url,
  29                 'title': info.find('titre').text,
  30                 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path),
  31                 'description': info.find('synopsis').text,
  32                 }
  33
  34
  35 class PluzzIE(FranceTVBaseInfoExtractor):
  36     IE_NAME = 'pluzz.francetv.fr'
  37     _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html'
  38
  39     # Can't use tests, videos expire in 7 days
  40
  41     def _real_extract(self, url):
  42         title = re.match(self._VALID_URL, url).group(1)
  43         webpage = self._download_webpage(url, title)
  44         video_id = self._search_regex(
  45             r'data-diffusion="(\d+)"', webpage, 'ID')
  46         return self._extract_video(video_id)
  47
  48
  49 class FranceTvInfoIE(FranceTVBaseInfoExtractor):
  50     IE_NAME = 'francetvinfo.fr'
  51     _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html'
  52
  53     _TEST = {
  54         'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
  55         'file': '84981923.mp4',
  56         'info_dict': {
  57             'title': 'Soir 3',
  58         },
  59         'params': {
  60             'skip_download': True,
  61         },
  62     }
  63
  64     def _real_extract(self, url):
  65         mobj = re.match(self._VALID_URL, url)
  66         page_title = mobj.group('title')
  67         webpage = self._download_webpage(url, page_title)
  68         video_id = self._search_regex(r'id-video=(\d+?)[@"]', webpage, 'video id')
  69         return self._extract_video(video_id)
  70
  71
  72 class FranceTVIE(FranceTVBaseInfoExtractor):
  73     IE_NAME = 'francetv'
  74     IE_DESC = 'France 2, 3, 4, 5 and Ô'
  75     _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/
  76         (?:
  77             emissions/.*?/(videos|emissions)/(?P<id>[^/?]+)
  78         |   (emissions?|jt)/(?P<key>[^/?]+)
  79         )'''
  80
  81     _TESTS = [
  82         # france2
  83         {
  84             'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
  85             'file': '75540104.mp4',
  86             'info_dict': {
  87                 'title': '13h15, le samedi...',
  88                 'description': 'md5:2e5b58ba7a2d3692b35c792be081a03d',
  89             },
  90             'params': {
  91                 # m3u8 download
  92                 'skip_download': True,
  93             },
  94         },
  95         # france3
  96         {
  97             'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575',
  98             'info_dict': {
  99                 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au',
 100                 'ext': 'flv',
 101                 'title': 'Le scandale du prix des médicaments',
 102                 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce',
 103             },
 104             'params': {
 105                 # rtmp download
 106                 'skip_download': True,
 107             },
 108         },
 109         # france4
 110         {
 111             'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
 112             'info_dict': {
 113                 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
 114                 'ext': 'flv',
 115                 'title': 'Hero Corp Making of - Extrait 1',
 116                 'description': 'md5:c87d54871b1790679aec1197e73d650a',
 117             },
 118             'params': {
 119                 # rtmp download
 120                 'skip_download': True,
 121             },
 122         },
 123         # france5
 124         {
 125             'url': 'http://www.france5.fr/emissions/c-a-dire/videos/92837968',
 126             'info_dict': {
 127                 'id': '92837968',
 128                 'ext': 'mp4',
 129                 'title': 'C à dire ?!',
 130                 'description': 'md5:fb1db1cbad784dcce7c7a7bd177c8e2f',
 131             },
 132             'params': {
 133                 # m3u8 download
 134                 'skip_download': True,
 135             },
 136         },
 137         # franceo
 138         {
 139             'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013',
 140             'info_dict': {
 141                 'id': '92327925',
 142                 'ext': 'mp4',
 143                 'title': 'Infô-Afrique',
 144                 'description': 'md5:ebf346da789428841bee0fd2a935ea55',
 145             },
 146             'params': {
 147                 # m3u8 download
 148                 'skip_download': True,
 149             },
 150             'skip': 'The id changes frequently',
 151         },
 152     ]
 153
 154     def _real_extract(self, url):
 155         mobj = re.match(self._VALID_URL, url)
 156         if mobj.group('key'):
 157             webpage = self._download_webpage(url, mobj.group('key'))
 158             id_res = [
 159                 (r'''(?x)<div\s+class="video-player">\s*
 160                     <a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+
 161                     class="francetv-video-player">'''),
 162                 (r'<a id="player_direct" href="http://info\.francetelevisions'
 163                  '\.fr/\?id-video=([^"/&]+)'),
 164                 (r'<a class="video" id="ftv_player_(.+?)"'),
 165             ]
 166             video_id = self._html_search_regex(id_res, webpage, 'video ID')
 167         else:
 168             video_id = mobj.group('id')
 169         return self._extract_video(video_id)
 170
 171
 172 class GenerationQuoiIE(InfoExtractor):
 173     IE_NAME = 'france2.fr:generation-quoi'
 174     _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<name>.*)(\?|$)'
 175
 176     _TEST = {
 177         'url': 'http://generation-quoi.france2.fr/portrait/garde-a-vous',
 178         'file': 'k7FJX8VBcvvLmX4wA5Q.mp4',
 179         'info_dict': {
 180             'title': 'Génération Quoi - Garde à Vous',
 181             'uploader': 'Génération Quoi',
 182         },
 183         'params': {
 184             # It uses Dailymotion
 185             'skip_download': True,
 186         },
 187     }
 188
 189     def _real_extract(self, url):
 190         mobj = re.match(self._VALID_URL, url)
 191         name = mobj.group('name')
 192         info_url = compat_urlparse.urljoin(url, '/medias/video/%s.json' % name)
 193         info_json = self._download_webpage(info_url, name)
 194         info = json.loads(info_json)
 195         return self.url_result('http://www.dailymotion.com/video/%s' % info['id'],
 196             ie='Dailymotion')
 197
 198
 199 class CultureboxIE(FranceTVBaseInfoExtractor):
 200     IE_NAME = 'culturebox.francetvinfo.fr'
 201     _VALID_URL = r'https?://culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)'
 202
 203     _TEST = {
 204         'url': 'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813',
 205         'info_dict': {
 206             'id': 'EV_6785',
 207             'ext': 'mp4',
 208             'title': 'Einstein on the beach au Théâtre du Châtelet',
 209             'description': 'md5:9ce2888b1efefc617b5e58b3f6200eeb',
 210         },
 211         'params': {
 212             # m3u8 download
 213             'skip_download': True,
 214         },
 215     }
 216
 217     def _real_extract(self, url):
 218         mobj = re.match(self._VALID_URL, url)
 219         name = mobj.group('name')
 220         webpage = self._download_webpage(url, name)
 221         video_id = self._search_regex(r'"http://videos\.francetv\.fr/video/(.*?)"', webpage, 'video id')
 222         return self._extract_video(video_id)