Merge remote-tracking branch 'origin/master' into pr-bbcnews

author fnord <fnord@fnord.mobi>

Thu, 25 Jun 2015 05:34:46 +0000 (00:34 -0500)

committer fnord <fnord@fnord.mobi>

Thu, 25 Jun 2015 05:34:46 +0000 (00:34 -0500)
author fnord <fnord@fnord.mobi>
Thu, 25 Jun 2015 05:34:46 +0000 (00:34 -0500)
committer fnord <fnord@fnord.mobi>
Thu, 25 Jun 2015 05:34:46 +0000 (00:34 -0500)
diff --combined youtube_dl/extractor/__init__.py

index a48346e60e539797781700163575c92e08df70f2,dc1a302e69c13f00d67230b2920ce50106ea2629..1a9585c92647d2a2cd1fa1a5d27ad11c2bf54dcf
--- 1/youtube_dl/extractor/__init__.py
--- 2/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@@ -4,7 -4,10 +4,10 @@@ from .abc import ABCI
   from .abc7news import Abc7NewsIE
   from .academicearth import AcademicEarthCourseIE
   from .addanime import AddAnimeIE
- from .adobetv import AdobeTVIE
+ from .adobetv import (
+     AdobeTVIE,
+     AdobeTVVideoIE,
+ )
   from .adultswim import AdultSwimIE
   from .aftenposten import AftenpostenIE
   from .aftonbladet import AftonbladetIE
@@@ -35,7 -38,7 +38,7 @@@ from .azubu import AzubuI
   from .baidu import BaiduVideoIE
   from .bambuser import BambuserIE, BambuserChannelIE
   from .bandcamp import BandcampIE, BandcampAlbumIE
- -from .bbccouk import BBCCoUkIE
+ +from .bbc import BBCCoUkIE, BBCNewsIE
   from .beeg import BeegIE
   from .behindkink import BehindKinkIE
   from .beatportpro import BeatportProIE
@@@ -103,6 -106,7 +106,7 @@@ from .dailymotion import 
       DailymotionIE,
       DailymotionPlaylistIE,
       DailymotionUserIE,
+     DailymotionCloudIE,
   )
   from .daum import DaumIE
   from .dbtv import DBTVIE
@@@ -401,6 -405,7 +405,7 @@@ from .pbs import PBSI
   from .philharmoniedeparis import PhilharmonieDeParisIE
   from .phoenix import PhoenixIE
   from .photobucket import PhotobucketIE
+ from .pinkbike import PinkbikeIE
   from .planetaplay import PlanetaPlayIE
   from .pladform import PladformIE
   from .played import PlayedIE
@@@ -696,7 -701,10 +701,10 @@@ from .wrzuta import WrzutaI
   from .wsj import WSJIE
   from .xbef import XBefIE
   from .xboxclips import XboxClipsIE
- from .xhamster import XHamsterIE
+ from .xhamster import (
+     XHamsterIE,
+     XHamsterEmbedIE,
+ )
   from .xminus import XMinusIE
   from .xnxx import XNXXIE
   from .xstream import XstreamIE
diff --combined youtube_dl/extractor/bbc.py

index bb671d47310675087d9f49b1131e0b6676a1a309,5825d286774fa003d343f41c85e07e35340cb428..471d865d26c5feb03a66e3eca42fae5ccbe21e39
--- 1/youtube_dl/extractor/bbc.py
--- 2/youtube_dl/extractor/bbccouk.py
+++ b/youtube_dl/extractor/bbc.py
@@@ -5,11 -5,9 +5,11 @@@ import xml.etree.ElementTre
   from .common import InfoExtractor
   from ..utils import (
       ExtractorError,
+ +    parse_duration,
       int_or_none,
   )
   from ..compat import compat_HTTPError
+ +import re
   
   
   class BBCCoUkIE(InfoExtractor):
@@@ -17,8 -15,6 +17,8 @@@
       IE_DESC = 'BBC iPlayer'
       _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
   
+ +    mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
+ +
       _TESTS = [
           {
               'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
@@@ -255,33 -251,18 +255,18 @@@
           for connection in self._extract_connections(media):
               captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
               lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
-             ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
-             srt = ''
- 
-             def _extract_text(p):
-                 if p.text is not None:
-                     stripped_text = p.text.strip()
-                     if stripped_text:
-                         return stripped_text
-                 return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
-             for pos, p in enumerate(ps):
-                 srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
               subtitles[lang] = [
                   {
                       'url': connection.get('href'),
                       'ext': 'ttml',
                   },
-                 {
-                     'data': srt,
-                     'ext': 'srt',
-                 },
               ]
           return subtitles
   
       def _download_media_selector(self, programme_id):
           try:
               media_selection = self._download_xml(
- -                'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
+ +                self.mediaselector_url % programme_id,
                   programme_id, 'Downloading media selection XML')
           except ExtractorError as ee:
               if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
@@@ -396,175 -377,3 +381,175 @@@
               'formats': formats,
               'subtitles': subtitles,
           }
+ +
+ +
+ +class BBCNewsIE(BBCCoUkIE):
+ +    IE_NAME = 'bbc.com'
+ +    IE_DESC = 'BBC news'
+ +    _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P<id>[^/]+)$'
+ +
+ +    mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
+ +
+ +    _TESTS = [{
+ +        'url': 'http://www.bbc.com/news/world-europe-32668511',
+ +        'info_dict': {
+ +            'id': 'world-europe-32668511',
+ +            'title': 'Russia stages massive WW2 parade despite Western boycott',
+ +        },
+ +        'playlist_count': 2,
+ +    },{
+ +        'url': 'http://www.bbc.com/news/business-28299555',
+ +        'info_dict': {
+ +            'id': 'business-28299555',
+ +            'title': 'Farnborough Airshow: Video highlights',
+ +        },
+ +        'playlist_count': 9,
+ +    },{
+ +        'url': 'http://www.bbc.com/news/world-europe-32041533',
+ +        'note': 'Video',
+ +        'info_dict': {
+ +            'id': 'p02mprgb',
+ +            'ext': 'mp4',
+ +            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
+ +            'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
+ +            'duration': 47,
+ +            'upload_date': '20150324',
+ +            'uploader': 'BBC News',
+ +        },
+ +        'params': {
+ +            'skip_download': True,
+ +        }
+ +    },{
+ +        'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
+ +        'note': 'Video',
+ +        'info_dict': {
+ +            'id': 'NA',
+ +            'ext': 'mp4',
+ +            'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
+ +            'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
+ +            'duration': 47,
+ +            'upload_date': '20150615',
+ +            'uploader': 'BBC News',
+ +        },
+ +        'params': {
+ +            'skip_download': True,
+ +        }
+ +    },{
+ +        'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
+ +        'note': 'Video',
+ +        'info_dict': {
+ +            'id': '39275083',
+ +            'ext': 'mp4',
+ +            'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
+ +            'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
+ +            'duration': 87,
+ +            'upload_date': '20150619',
+ +            'uploader': 'BBC News',
+ +        },
+ +        'params': {
+ +            'skip_download': True,
+ +        }
+ +    }]
+ +
+ +    def _real_extract(self, url):
+ +        list_id = self._match_id(url)
+ +        webpage = self._download_webpage(url, list_id)
+ +
+ +        list_title = self._html_search_regex(r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'list title')
+ +
+ +        pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None)
+ +        if pubdate:
+ +           pubdate = pubdate.replace('-','')
+ +
+ +        ret = []
+ +        jsent = []
+ +
+ +        # works with bbc.com/news/something-something-123456 articles
+ +        jsent = map(
+ +           lambda m: self._parse_json(m,list_id),
+ +           re.findall(r"data-media-meta='({[^']+})'", webpage)
+ +        )
+ +
+ +        if len(jsent) == 0:
+ +           # http://www.bbc.com/news/video_and_audio/international
+ +           # and single-video articles
+ +           masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None)
+ +           if masset:
+ +              jmasset = self._parse_json(masset,list_id)
+ +              for key, val in jmasset.get('videos',{}).items():
+ +                  for skey, sval in val.items():
+ +                      sval['id'] = skey
+ +                      jsent.append(sval)
+ +
+ +        if len(jsent) == 0:
+ +           # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
+ +           # in http://www.bbc.com/news/video_and_audio/international
+ +           # prone to breaking if entries have sourceFiles list
+ +           jsent = map(
+ +               lambda m: self._parse_json(m,list_id),
+ +               re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)
+ +           )          
+ +
+ +        if len(jsent) == 0:
+ +           raise ExtractorError('No video found', expected=True)
+ +
+ +        for jent in jsent:
+ +            programme_id = jent.get('externalId')
+ +            xml_url = jent.get('href')
+ +
+ +            title = jent.get('caption',list_title)
+ +
+ +            duration = parse_duration(jent.get('duration'))
+ +            description = list_title
+ +            if jent.get('caption'):
+ +               description += ' - ' + jent.get('caption')
+ +            thumbnail = None
+ +            if jent.has_key('image'):
+ +               thumbnail=jent['image'].get('href')
+ +
+ +            formats = []
+ +            subtitles = []
+ +
+ +            if programme_id:
+ +               formats, subtitles = self._download_media_selector(programme_id)
+ +            elif jent.has_key('sourceFiles'):
+ +               # mediaselector not used at
+ +               # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu
+ +               for key, val in jent['sourceFiles'].items():
+ +                  formats.append( {
+ +                     'ext': val.get('encoding'),
+ +                     'url': val.get('url'),
+ +                     'filesize': int(val.get('filesize')),
+ +                     'format_id': key
+ +                  } )
+ +            elif xml_url:
+ +               # Cheap fallback
+ +               # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
+ +               xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)')
+ +               programme_id = self._search_regex(r'<mediator [^>]*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)')
+ +               formats, subtitles = self._download_media_selector(programme_id)
+ +
+ +            if len(formats) == 0:
+ +               raise ExtractorError('unsupported json media entry.\n    '+str(jent)+'\n')
+ +               
+ +            self._sort_formats(formats)
+ +
+ +            id = jent.get('id') if programme_id == None else programme_id
+ +            if id == None:
+ +               id = 'NA'
+ +
+ +            ret.append( {
+ +                'id': id,
+ +                'uploader': 'BBC News',
+ +                'upload_date': pubdate,
+ +                'title': title,
+ +                'description': description,
+ +                'thumbnail': thumbnail,
+ +                'duration': duration,
+ +                'formats': formats,
+ +                'subtitles': subtitles,
+ +            } )
+ +
+ +        if len(ret) > 0:
+ +           return self.playlist_result(ret, list_id, list_title)
+ +        raise ExtractorError('No video found', expected=True)
author	fnord <fnord@fnord.mobi>
	Thu, 25 Jun 2015 05:34:46 +0000 (00:34 -0500)
committer	fnord <fnord@fnord.mobi>
	Thu, 25 Jun 2015 05:34:46 +0000 (00:34 -0500)
		1	2
youtube_dl/extractor/__init__.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/extractor/bbc.py	patch \|	diff1 \|	diff2 \|	blob \| history