X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fbilibili.py;h=d8eb718212b3d8482f7ac7cf479217da2391de6d;hb=794e5dcd7e24784c05e042e7e0655c584347f5c3;hp=85156ce49cca1b99f9e534f48c270f5e34621339;hpb=55af2b26e0f169bef2f10a7b5f6ec8e34c6dbb6d;p=youtube-dl.git diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 85156ce49..d8eb71821 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -1,113 +1,189 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import xml.etree.ElementTree as ET +import calendar +import datetime +import re from .common import InfoExtractor +from ..compat import ( + compat_etree_fromstring, + compat_str, + compat_parse_qs, + compat_xml_parse_error, +) from ..utils import ( - int_or_none, ExtractorError, + int_or_none, + float_or_none, + xpath_text, ) class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>[0-9]+)/' + _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', - 'md5': '2c301e4dab317596e837c3e7633e7d86', + 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', 'info_dict': { 'id': '1554319', - 'ext': 'flv', + 'ext': 'mp4', 'title': '【金坷垃】金泡沫', - 'duration': 308313, + 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', + 'duration': 308.315, + 'timestamp': 1398012660, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', - 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'timestamp': 1397983878, 'uploader': '菊子桑', + 'uploader_id': '156160', }, }, { 'url': 'http://www.bilibili.com/video/av1041170/', 'info_dict': { - 'id': '1041170', + 'id': '1507019', + 'ext': 'mp4', 'title': '【BD1080P】刀语【诸神&异域】', + 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', + 'timestamp': 1396530060, + 'upload_date': '20140403', + 'uploader': '枫叶逝去', + 'uploader_id': '520116', }, - 'playlist_count': 12, + }, { + 'url': 
'http://www.bilibili.com/video/av4808130/', + 'info_dict': { + 'id': '7802182', + 'ext': 'mp4', + 'title': '【长篇】哆啦A梦443【钉铛】', + 'description': '(2016.05.27)来组合客人的脸吧&寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&illust_id=56912929', + 'timestamp': 1464564180, + 'upload_date': '20160529', + 'uploader': '喜欢拉面', + 'uploader_id': '151066', + }, + }, { + # Missing upload time + 'url': 'http://www.bilibili.com/video/av1867637/', + 'info_dict': { + 'id': '2880301', + 'ext': 'mp4', + 'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】', + 'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】', + 'uploader': '黑夜为猫', + 'uploader_id': '610729', + }, + 'params': { + # Just to test metadata extraction + 'skip_download': True, + }, + 'expected_warnings': ['upload time'], }] - def _extract_video_info(self, cid, view_data, page_num=1, num_pages=1): - title = view_data['title'] + # BiliBili blocks keys from time to time. 
The current key is extracted from + # the Android client + # TODO: find the sign algorithm used in the flash player + _APP_KEY = '86385cdc024c0f6c' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + params = compat_parse_qs(self._search_regex( + [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], + webpage, 'player parameters')) + cid = params['cid'][0] + + info_xml_str = self._download_webpage( + 'http://interface.bilibili.com/v_cdn_play', + cid, query={'appkey': self._APP_KEY, 'cid': cid}, + note='Downloading video info page') - page = self._download_webpage( - 'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid, - cid, - 'Downloading page %d/%d' % (page_num, num_pages) - ) + err_msg = None + durls = None + info_xml = None try: - err_info = json.loads(page) - raise ExtractorError( - 'BiliBili said: ' + err_info['error_text'], expected=True) - except ValueError: - pass + info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8')) + except compat_xml_parse_error: + info_json = self._parse_json(info_xml_str, video_id, fatal=False) + err_msg = (info_json or {}).get('error_text') + else: + err_msg = xpath_text(info_xml, './message') - doc = ET.fromstring(page) - durls = doc.findall('./durl') + if info_xml is not None: + durls = info_xml.findall('./durl') + if not durls: + if err_msg: + raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True) + else: + raise ExtractorError('No videos found!') entries = [] for durl in durls: - formats = [] - backup_url = durl.find('./backup_url') - if backup_url is not None: - formats.append({'url': backup_url.find('./url').text}) - size = durl.find('./filesize|./size') - formats.append({ + size = xpath_text(durl, ['./filesize', './size']) + formats = [{ 'url': durl.find('./url').text, - 'filesize': int_or_none(size.text) if size 
else None, - 'ext': 'flv', - }) + 'filesize': int_or_none(size), + }] + for backup_url in durl.findall('./backup_url/url'): + formats.append({ + 'url': backup_url.text, + # backup URLs have lower priorities + 'preference': -2 if 'hd.mp4' in backup_url.text else -3, + }) + + self._sort_formats(formats) + entries.append({ - 'id': '%s_part%s' % (cid, durl.find('./order').text), - 'title': title, - 'duration': int_or_none(durl.find('./length').text) // 1000, + 'id': '%s_part%s' % (cid, xpath_text(durl, './order')), + 'duration': int_or_none(xpath_text(durl, './length'), 1000), 'formats': formats, }) + title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') + description = self._html_search_meta('description', webpage) + datetime_str = self._html_search_regex( + r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False) + timestamp = None + if datetime_str: + timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple()) + + # TODO 'view_count' requires deobfuscating Javascript info = { - 'id': cid, + 'id': compat_str(cid), 'title': title, - 'description': view_data.get('description'), - 'thumbnail': view_data.get('pic'), - 'uploader': view_data.get('author'), - 'timestamp': int_or_none(view_data.get('created')), - 'view_count': view_data.get('play'), - 'duration': int_or_none(doc.find('./timelength').text), + 'description': description, + 'timestamp': timestamp, + 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), + 'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000), } + uploader_mobj = re.search( + r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"', + webpage) + if uploader_mobj: + info.update({ + 'uploader': uploader_mobj.group('name'), + 'uploader_id': uploader_mobj.group('id'), + }) + + for entry in entries: + entry.update(info) + if len(entries) == 1: - entries[0].update(info) return entries[0] else: - for idx, entry in 
enumerate(entries): + entry['id'] = '%s_part%d' % (video_id, (idx + 1)) + + return { '_type': 'multi_video', + 'id': video_id, + 'title': title, + 'description': description, 'entries': entries, - }) - return info - - def _real_extract(self, url): - video_id = self._match_id(url) - view_data = self._download_json('http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s' % video_id, video_id) - - num_pages = int_or_none(view_data['pages']) - if num_pages > 1: - play_list_title = view_data['title'] - page_list = self._download_json('http://www.bilibili.com/widget/getPageList?aid=%s' % video_id, video_id, 'Downloading page list metadata') - entries = [] - for page in page_list: - view_data['title'] = page['pagename'] - entries.append(self._extract_video_info(str(page['cid']), view_data, page['page'], num_pages)) - return self.playlist_result(entries, video_id, play_list_title, view_data.get('description')) - else: - return self._extract_video_info(str(view_data['cid']), view_data) + }