X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fbbc.py;h=8a2ed0ab6851e3ee128aea4fbb1254c0d37cdd7b;hb=3f2ce6896acfd487ce915788255ddee1e04794b3;hp=83e6d024c7fd4c6ee2adde607e4f1cd203910805;hpb=522f6c066da93b6baec3399b7098556e5ec55f43;p=youtube-dl.git diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 83e6d024c..8a2ed0ab6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor from ..utils import ( @@ -17,6 +18,7 @@ from ..utils import ( from ..compat import ( compat_etree_fromstring, compat_HTTPError, + compat_urlparse, ) @@ -223,6 +225,8 @@ class BBCCoUkIE(InfoExtractor): } ] + _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' + class MediaSelectionError(Exception): def __init__(self, id): self.id = id @@ -334,6 +338,15 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) + if re.search(self._USP_RE, href): + usp_formats = self._extract_m3u8_formats( + re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), + programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for f in usp_formats: + if f.get('height') and f['height'] > 720: + continue + formats.append(f) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -1026,7 +1039,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' @@ -1056,19 +1069,35 @@ class BBCCoUkArticleIE(InfoExtractor): class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _entries(self, webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r']+class=(["\'])pagination_+next\1[^>]*>]+href=(["\'])(?P(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) + def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = [ - self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) - for video_id in re.findall( - self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)] - title, description = self._extract_title_and_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + self._entries(webpage, url, playlist_id), + playlist_id, title, description) class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): @@ -1117,6 +1146,24 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'French thriller serial about a missing teenager.', }, 'playlist_mincount': 7, + }, { + # multipage playlist, explicit page + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 24, + }, { + # multipage playlist, all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 142, }, { 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', 'only_matching': True,