youtube_dl/extractor/franceculture.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import json
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..utils import (
   9     compat_parse_qs,
  10     compat_urlparse,
  11 )
  12
  13
  14 class FranceCultureIE(InfoExtractor):
  15     _VALID_URL = r'(?P<baseurl>http://(?:www\.)?franceculture\.fr/)player/reecouter\?play=(?P<id>[0-9]+)'
  16     _TEST = {
  17         'url': 'http://www.franceculture.fr/player/reecouter?play=4795174',
  18         'info_dict': {
  19             'id': '4795174',
  20             'ext': 'mp3',
  21             'title': 'Rendez-vous au pays des geeks',
  22             'vcodec': 'none',
  23             'uploader': 'Colette Fellous',
  24             'upload_date': '20140301',
  25             'duration': 3601,
  26             'thumbnail': r're:^http://www\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$',
  27             'description': 'Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche des « geeks », une enquête menée aux Etats-Unis dans la S ...',
  28         }
  29     }
  30
  31     def _real_extract(self, url):
  32         mobj = re.match(self._VALID_URL, url)
  33         video_id = mobj.group('id')
  34         baseurl = mobj.group('baseurl')
  35
  36         webpage = self._download_webpage(url, video_id)
  37         params_code = self._search_regex(
  38             r"<param name='movie' value='/sites/all/modules/rf/rf_player/swf/loader.swf\?([^']+)' />",
  39             webpage, 'parameter code')
  40         params = compat_parse_qs(params_code)
  41         video_url = compat_urlparse.urljoin(baseurl, params['urlAOD'][0])
  42
  43         title = self._html_search_regex(
  44             r'<h1 class="title[^"]+">(.+?)</h1>', webpage, 'title')
  45         uploader = self._html_search_regex(
  46             r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
  47             webpage, 'uploader', fatal=False)
  48         thumbnail_part = self._html_search_regex(
  49             r'(?s)<div id="emission".*?<img src="([^"]+)"', webpage,
  50             'thumbnail', fatal=False)
  51         if thumbnail_part is None:
  52             thumbnail = None
  53         else:
  54             thumbnail = compat_urlparse.urljoin(baseurl, thumbnail_part)
  55         description = self._html_search_regex(
  56             r'(?s)<p class="desc">(.*?)</p>', webpage, 'description')
  57
  58         info = json.loads(params['infoData'][0])[0]
  59         duration = info.get('media_length')
  60         upload_date_candidate = info.get('media_section5')
  61         upload_date = (
  62             upload_date_candidate
  63             if (upload_date_candidate is not None and
  64                 re.match(r'[0-9]{8}$', upload_date_candidate))
  65             else None)
  66
  67         return {
  68             'id': video_id,
  69             'url': video_url,
  70             'vcodec': 'none' if video_url.lower().endswith('.mp3') else None,
  71             'duration': duration,
  72             'uploader': uploader,
  73             'upload_date': upload_date,
  74             'title': title,
  75             'thumbnail': thumbnail,
  76             'description': description,
  77         }