]> gitweb @ CieloNegro.org - youtube-dl.git/commitdiff
Merge remote-tracking branch 'origin/master' into pr-bbcnews
author fnord <fnord@fnord.mobi>
Thu, 25 Jun 2015 05:34:46 +0000 (00:34 -0500)
committer fnord <fnord@fnord.mobi>
Thu, 25 Jun 2015 05:34:46 +0000 (00:34 -0500)
1  2 
youtube_dl/extractor/__init__.py
youtube_dl/extractor/bbc.py

index a48346e60e539797781700163575c92e08df70f2,dc1a302e69c13f00d67230b2920ce50106ea2629..1a9585c92647d2a2cd1fa1a5d27ad11c2bf54dcf
@@@ -4,7 -4,10 +4,10 @@@ from .abc import ABCI
  from .abc7news import Abc7NewsIE
  from .academicearth import AcademicEarthCourseIE
  from .addanime import AddAnimeIE
- from .adobetv import AdobeTVIE
+ from .adobetv import (
+     AdobeTVIE,
+     AdobeTVVideoIE,
+ )
  from .adultswim import AdultSwimIE
  from .aftenposten import AftenpostenIE
  from .aftonbladet import AftonbladetIE
@@@ -35,7 -38,7 +38,7 @@@ from .azubu import AzubuI
  from .baidu import BaiduVideoIE
  from .bambuser import BambuserIE, BambuserChannelIE
  from .bandcamp import BandcampIE, BandcampAlbumIE
 -from .bbccouk import BBCCoUkIE
 +from .bbc import BBCCoUkIE, BBCNewsIE
  from .beeg import BeegIE
  from .behindkink import BehindKinkIE
  from .beatportpro import BeatportProIE
@@@ -103,6 -106,7 +106,7 @@@ from .dailymotion import 
      DailymotionIE,
      DailymotionPlaylistIE,
      DailymotionUserIE,
+     DailymotionCloudIE,
  )
  from .daum import DaumIE
  from .dbtv import DBTVIE
@@@ -401,6 -405,7 +405,7 @@@ from .pbs import PBSI
  from .philharmoniedeparis import PhilharmonieDeParisIE
  from .phoenix import PhoenixIE
  from .photobucket import PhotobucketIE
+ from .pinkbike import PinkbikeIE
  from .planetaplay import PlanetaPlayIE
  from .pladform import PladformIE
  from .played import PlayedIE
@@@ -696,7 -701,10 +701,10 @@@ from .wrzuta import WrzutaI
  from .wsj import WSJIE
  from .xbef import XBefIE
  from .xboxclips import XboxClipsIE
- from .xhamster import XHamsterIE
+ from .xhamster import (
+     XHamsterIE,
+     XHamsterEmbedIE,
+ )
  from .xminus import XMinusIE
  from .xnxx import XNXXIE
  from .xstream import XstreamIE
index bb671d47310675087d9f49b1131e0b6676a1a309,5825d286774fa003d343f41c85e07e35340cb428..471d865d26c5feb03a66e3eca42fae5ccbe21e39
@@@ -5,11 -5,9 +5,11 @@@ import xml.etree.ElementTre
  from .common import InfoExtractor
  from ..utils import (
      ExtractorError,
 +    parse_duration,
      int_or_none,
  )
  from ..compat import compat_HTTPError
 +import re
  
  
  class BBCCoUkIE(InfoExtractor):
@@@ -17,8 -15,6 +17,8 @@@
      IE_DESC = 'BBC iPlayer'
      _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
  
 +    mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
 +
      _TESTS = [
          {
              'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
          for connection in self._extract_connections(media):
              captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
              lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
-             ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
-             srt = ''
-             def _extract_text(p):
-                 if p.text is not None:
-                     stripped_text = p.text.strip()
-                     if stripped_text:
-                         return stripped_text
-                 return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
-             for pos, p in enumerate(ps):
-                 srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
              subtitles[lang] = [
                  {
                      'url': connection.get('href'),
                      'ext': 'ttml',
                  },
-                 {
-                     'data': srt,
-                     'ext': 'srt',
-                 },
              ]
          return subtitles
  
      def _download_media_selector(self, programme_id):
          try:
              media_selection = self._download_xml(
 -                'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
 +                self.mediaselector_url % programme_id,
                  programme_id, 'Downloading media selection XML')
          except ExtractorError as ee:
              if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
              'formats': formats,
              'subtitles': subtitles,
          }
 +
 +
 +class BBCNewsIE(BBCCoUkIE):
 +    IE_NAME = 'bbc.com'
 +    IE_DESC = 'BBC news'
 +    _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P<id>[^/]+)$'
 +
 +    mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
 +
 +    _TESTS = [{
 +        'url': 'http://www.bbc.com/news/world-europe-32668511',
 +        'info_dict': {
 +            'id': 'world-europe-32668511',
 +            'title': 'Russia stages massive WW2 parade despite Western boycott',
 +        },
 +        'playlist_count': 2,
 +    },{
 +        'url': 'http://www.bbc.com/news/business-28299555',
 +        'info_dict': {
 +            'id': 'business-28299555',
 +            'title': 'Farnborough Airshow: Video highlights',
 +        },
 +        'playlist_count': 9,
 +    },{
 +        'url': 'http://www.bbc.com/news/world-europe-32041533',
 +        'note': 'Video',
 +        'info_dict': {
 +            'id': 'p02mprgb',
 +            'ext': 'mp4',
 +            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
 +            'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
 +            'duration': 47,
 +            'upload_date': '20150324',
 +            'uploader': 'BBC News',
 +        },
 +        'params': {
 +            'skip_download': True,
 +        }
 +    },{
 +        'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
 +        'note': 'Video',
 +        'info_dict': {
 +            'id': 'NA',
 +            'ext': 'mp4',
 +            'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
 +            'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
 +            'duration': 47,
 +            'upload_date': '20150615',
 +            'uploader': 'BBC News',
 +        },
 +        'params': {
 +            'skip_download': True,
 +        }
 +    },{
 +        'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
 +        'note': 'Video',
 +        'info_dict': {
 +            'id': '39275083',
 +            'ext': 'mp4',
 +            'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
 +            'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
 +            'duration': 87,
 +            'upload_date': '20150619',
 +            'uploader': 'BBC News',
 +        },
 +        'params': {
 +            'skip_download': True,
 +        }
 +    }]
 +
 +    def _real_extract(self, url):
 +        list_id = self._match_id(url)
 +        webpage = self._download_webpage(url, list_id)
 +
 +        list_title = self._html_search_regex(r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'list title')
 +
 +        pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None)
 +        if pubdate:
 +           pubdate = pubdate.replace('-','')
 +
 +        ret = []
 +        jsent = []
 +
 +        # works with bbc.com/news/something-something-123456 articles
 +        jsent = map(
 +           lambda m: self._parse_json(m,list_id),
 +           re.findall(r"data-media-meta='({[^']+})'", webpage)
 +        )
 +
 +        if len(jsent) == 0:
 +           # http://www.bbc.com/news/video_and_audio/international
 +           # and single-video articles
 +           masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None)
 +           if masset:
 +              jmasset = self._parse_json(masset,list_id)
 +              for key, val in jmasset.get('videos',{}).items():
 +                  for skey, sval in val.items():
 +                      sval['id'] = skey
 +                      jsent.append(sval)
 +
 +        if len(jsent) == 0:
 +           # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
 +           # in http://www.bbc.com/news/video_and_audio/international
 +           # prone to breaking if entries have sourceFiles list
 +           jsent = map(
 +               lambda m: self._parse_json(m,list_id),
 +               re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)
 +           )          
 +
 +        if len(jsent) == 0:
 +           raise ExtractorError('No video found', expected=True)
 +
 +        for jent in jsent:
 +            programme_id = jent.get('externalId')
 +            xml_url = jent.get('href')
 +
 +            title = jent.get('caption',list_title)
 +
 +            duration = parse_duration(jent.get('duration'))
 +            description = list_title
 +            if jent.get('caption'):
 +               description += ' - ' + jent.get('caption')
 +            thumbnail = None
 +            if jent.has_key('image'):
 +               thumbnail=jent['image'].get('href')
 +
 +            formats = []
 +            subtitles = []
 +
 +            if programme_id:
 +               formats, subtitles = self._download_media_selector(programme_id)
 +            elif jent.has_key('sourceFiles'):
 +               # mediaselector not used at
 +               # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu
 +               for key, val in jent['sourceFiles'].items():
 +                  formats.append( {
 +                     'ext': val.get('encoding'),
 +                     'url': val.get('url'),
 +                     'filesize': int(val.get('filesize')),
 +                     'format_id': key
 +                  } )
 +            elif xml_url:
 +               # Cheap fallback
 +               # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
 +               xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)')
 +               programme_id = self._search_regex(r'<mediator [^>]*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)')
 +               formats, subtitles = self._download_media_selector(programme_id)
 +
 +            if len(formats) == 0:
 +               raise ExtractorError('unsupported json media entry.\n    '+str(jent)+'\n')
 +               
 +            self._sort_formats(formats)
 +
 +            id = jent.get('id') if programme_id == None else programme_id
 +            if id == None:
 +               id = 'NA'
 +
 +            ret.append( {
 +                'id': id,
 +                'uploader': 'BBC News',
 +                'upload_date': pubdate,
 +                'title': title,
 +                'description': description,
 +                'thumbnail': thumbnail,
 +                'duration': duration,
 +                'formats': formats,
 +                'subtitles': subtitles,
 +            } )
 +
 +        if len(ret) > 0:
 +           return self.playlist_result(ret, list_id, list_title)
 +        raise ExtractorError('No video found', expected=True)