youtube_dl/extractor/francetv.py

   1 # encoding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5 import re
   6 import json
   7
   8 from .common import InfoExtractor
   9 from ..utils import (
  10     compat_urlparse,
  11 )
  12
  13
  14 class FranceTVBaseInfoExtractor(InfoExtractor):
  15     def _extract_video(self, video_id):
  16         info = self._download_xml(
  17             'http://www.francetvinfo.fr/appftv/webservices/video/'
  18             'getInfosOeuvre.php?id-diffusion='
  19             + video_id, video_id, 'Downloading XML config')
  20
  21         manifest_url = info.find('videos/video/url').text
  22         manifest_url = manifest_url.replace('/z/', '/i/')
  23
  24         if manifest_url.startswith('rtmp'):
  25             formats = [{'url': manifest_url, 'ext': 'flv'}]
  26         else:
  27             formats = []
  28             available_formats = self._search_regex(r'/[^,]*,(.*?),k\.mp4', manifest_url, 'available formats')
  29             for index, format_descr in enumerate(available_formats.split(',')):
  30                 format_info = {
  31                     'url': manifest_url.replace('manifest.f4m', 'index_%d_av.m3u8' % index),
  32                     'ext': 'mp4',
  33                 }
  34                 m_resolution = re.search(r'(?P<width>\d+)x(?P<height>\d+)', format_descr)
  35                 if m_resolution is not None:
  36                     format_info.update({
  37                         'width': int(m_resolution.group('width')),
  38                         'height': int(m_resolution.group('height')),
  39                     })
  40                 formats.append(format_info)
  41
  42         thumbnail_path = info.find('image').text
  43
  44         return {
  45             'id': video_id,
  46             'title': info.find('titre').text,
  47             'formats': formats,
  48             'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path),
  49             'description': info.find('synopsis').text,
  50         }
  51
  52
  53 class PluzzIE(FranceTVBaseInfoExtractor):
  54     IE_NAME = 'pluzz.francetv.fr'
  55     _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html'
  56
  57     # Can't use tests, videos expire in 7 days
  58
  59     def _real_extract(self, url):
  60         title = re.match(self._VALID_URL, url).group(1)
  61         webpage = self._download_webpage(url, title)
  62         video_id = self._search_regex(
  63             r'data-diffusion="(\d+)"', webpage, 'ID')
  64         return self._extract_video(video_id)
  65
  66
  67 class FranceTvInfoIE(FranceTVBaseInfoExtractor):
  68     IE_NAME = 'francetvinfo.fr'
  69     _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P<title>.+)\.html'
  70
  71     _TESTS = [{
  72         'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
  73         'info_dict': {
  74             'id': '84981923',
  75             'ext': 'mp4',
  76             'title': 'Soir 3',
  77         },
  78         'params': {
  79             'skip_download': True,
  80         },
  81     }, {
  82         'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
  83         'info_dict': {
  84             'id': 'EV_20019',
  85             'ext': 'mp4',
  86             'title': 'Débat des candidats à la Commission européenne',
  87             'description': 'Débat des candidats à la Commission européenne',
  88         },
  89         'params': {
  90             'skip_download': 'HLS (reqires ffmpeg)'
  91         }
  92     }]
  93
  94     def _real_extract(self, url):
  95         mobj = re.match(self._VALID_URL, url)
  96         page_title = mobj.group('title')
  97         webpage = self._download_webpage(url, page_title)
  98         video_id = self._search_regex(r'id-video=((?:[^0-9]*?_)?[0-9]+)[@"]', webpage, 'video id')
  99         return self._extract_video(video_id)
 100
 101
 102 class FranceTVIE(FranceTVBaseInfoExtractor):
 103     IE_NAME = 'francetv'
 104     IE_DESC = 'France 2, 3, 4, 5 and Ô'
 105     _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/
 106         (?:
 107             emissions/.*?/(videos|emissions)/(?P<id>[^/?]+)
 108         |   (emissions?|jt)/(?P<key>[^/?]+)
 109         )'''
 110
 111     _TESTS = [
 112         # france2
 113         {
 114             'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
 115             'file': '75540104.mp4',
 116             'info_dict': {
 117                 'title': '13h15, le samedi...',
 118                 'description': 'md5:2e5b58ba7a2d3692b35c792be081a03d',
 119             },
 120             'params': {
 121                 # m3u8 download
 122                 'skip_download': True,
 123             },
 124         },
 125         # france3
 126         {
 127             'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575',
 128             'info_dict': {
 129                 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au',
 130                 'ext': 'flv',
 131                 'title': 'Le scandale du prix des médicaments',
 132                 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce',
 133             },
 134             'params': {
 135                 # rtmp download
 136                 'skip_download': True,
 137             },
 138         },
 139         # france4
 140         {
 141             'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
 142             'info_dict': {
 143                 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
 144                 'ext': 'flv',
 145                 'title': 'Hero Corp Making of - Extrait 1',
 146                 'description': 'md5:c87d54871b1790679aec1197e73d650a',
 147             },
 148             'params': {
 149                 # rtmp download
 150                 'skip_download': True,
 151             },
 152         },
 153         # france5
 154         {
 155             'url': 'http://www.france5.fr/emissions/c-a-dire/videos/92837968',
 156             'info_dict': {
 157                 'id': '92837968',
 158                 'ext': 'mp4',
 159                 'title': 'C à dire ?!',
 160                 'description': 'md5:fb1db1cbad784dcce7c7a7bd177c8e2f',
 161             },
 162             'params': {
 163                 # m3u8 download
 164                 'skip_download': True,
 165             },
 166         },
 167         # franceo
 168         {
 169             'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013',
 170             'info_dict': {
 171                 'id': '92327925',
 172                 'ext': 'mp4',
 173                 'title': 'Infô-Afrique',
 174                 'description': 'md5:ebf346da789428841bee0fd2a935ea55',
 175             },
 176             'params': {
 177                 # m3u8 download
 178                 'skip_download': True,
 179             },
 180             'skip': 'The id changes frequently',
 181         },
 182     ]
 183
 184     def _real_extract(self, url):
 185         mobj = re.match(self._VALID_URL, url)
 186         if mobj.group('key'):
 187             webpage = self._download_webpage(url, mobj.group('key'))
 188             id_res = [
 189                 (r'''(?x)<div\s+class="video-player">\s*
 190                     <a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+
 191                     class="francetv-video-player">'''),
 192                 (r'<a id="player_direct" href="http://info\.francetelevisions'
 193                  '\.fr/\?id-video=([^"/&]+)'),
 194                 (r'<a class="video" id="ftv_player_(.+?)"'),
 195             ]
 196             video_id = self._html_search_regex(id_res, webpage, 'video ID')
 197         else:
 198             video_id = mobj.group('id')
 199         return self._extract_video(video_id)
 200
 201
 202 class GenerationQuoiIE(InfoExtractor):
 203     IE_NAME = 'france2.fr:generation-quoi'
 204     _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<name>.*)(\?|$)'
 205
 206     _TEST = {
 207         'url': 'http://generation-quoi.france2.fr/portrait/garde-a-vous',
 208         'file': 'k7FJX8VBcvvLmX4wA5Q.mp4',
 209         'info_dict': {
 210             'title': 'Génération Quoi - Garde à Vous',
 211             'uploader': 'Génération Quoi',
 212         },
 213         'params': {
 214             # It uses Dailymotion
 215             'skip_download': True,
 216         },
 217         'skip': 'Only available from France',
 218     }
 219
 220     def _real_extract(self, url):
 221         mobj = re.match(self._VALID_URL, url)
 222         name = mobj.group('name')
 223         info_url = compat_urlparse.urljoin(url, '/medias/video/%s.json' % name)
 224         info_json = self._download_webpage(info_url, name)
 225         info = json.loads(info_json)
 226         return self.url_result('http://www.dailymotion.com/video/%s' % info['id'],
 227             ie='Dailymotion')
 228
 229
 230 class CultureboxIE(FranceTVBaseInfoExtractor):
 231     IE_NAME = 'culturebox.francetvinfo.fr'
 232     _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)'
 233
 234     _TEST = {
 235         'url': 'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813',
 236         'info_dict': {
 237             'id': 'EV_6785',
 238             'ext': 'mp4',
 239             'title': 'Einstein on the beach au Théâtre du Châtelet',
 240             'description': 'md5:9ce2888b1efefc617b5e58b3f6200eeb',
 241         },
 242         'params': {
 243             # m3u8 download
 244             'skip_download': True,
 245         },
 246     }
 247
 248     def _real_extract(self, url):
 249         mobj = re.match(self._VALID_URL, url)
 250         name = mobj.group('name')
 251         webpage = self._download_webpage(url, name)
 252         video_id = self._search_regex(r'"http://videos\.francetv\.fr/video/(.*?)"', webpage, 'video id')
 253         return self._extract_video(video_id)