youtube_dl/extractor/bilibili.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5 import json
   6
   7 from .common import InfoExtractor
   8 from ..compat import (
   9     compat_etree_fromstring,
  10     compat_str,
  11 )
  12 from ..utils import (
  13     int_or_none,
  14     unescapeHTML,
  15     ExtractorError,
  16     xpath_text,
  17 )
  18
  19
  20 class BiliBiliIE(InfoExtractor):
  21     _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?'
  22
  23     _TESTS = [{
  24         'url': 'http://www.bilibili.tv/video/av1074402/',
  25         'md5': '2c301e4dab317596e837c3e7633e7d86',
  26         'info_dict': {
  27             'id': '1554319',
  28             'ext': 'flv',
  29             'title': '【金坷垃】金泡沫',
  30             'duration': 308313,
  31             'upload_date': '20140420',
  32             'thumbnail': 're:^https?://.+\.jpg',
  33             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
  34             'timestamp': 1397983878,
  35             'uploader': '菊子桑',
  36         },
  37     }, {
  38         'url': 'http://www.bilibili.com/video/av1041170/',
  39         'info_dict': {
  40             'id': '1041170',
  41             'title': '【BD1080P】刀语【诸神&异域】',
  42             'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦！~',
  43             'uploader': '枫叶逝去',
  44             'timestamp': 1396501299,
  45         },
  46         'playlist_count': 9,
  47     }]
  48
  49     def _real_extract(self, url):
  50         mobj = re.match(self._VALID_URL, url)
  51         video_id = mobj.group('id')
  52         page_num = mobj.group('page_num') or '1'
  53
  54         view_data = self._download_json(
  55             'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num),
  56             video_id)
  57         if 'error' in view_data:
  58             raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True)
  59
  60         cid = view_data['cid']
  61         title = unescapeHTML(view_data['title'])
  62
  63         doc = self._download_xml(
  64             'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid,
  65             cid,
  66             'Downloading page %s/%s' % (page_num, view_data['pages'])
  67         )
  68
  69         if xpath_text(doc, './result') == 'error':
  70             raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True)
  71
  72         entries = []
  73
  74         for durl in doc.findall('./durl'):
  75             size = xpath_text(durl, ['./filesize', './size'])
  76             formats = [{
  77                 'url': durl.find('./url').text,
  78                 'filesize': int_or_none(size),
  79                 'ext': 'flv',
  80             }]
  81             backup_urls = durl.find('./backup_url')
  82             if backup_urls is not None:
  83                 for backup_url in backup_urls.findall('./url'):
  84                     formats.append({'url': backup_url.text})
  85             formats.reverse()
  86
  87             entries.append({
  88                 'id': '%s_part%s' % (cid, xpath_text(durl, './order')),
  89                 'title': title,
  90                 'duration': int_or_none(xpath_text(durl, './length'), 1000),
  91                 'formats': formats,
  92             })
  93
  94         info = {
  95             'id': compat_str(cid),
  96             'title': title,
  97             'description': view_data.get('description'),
  98             'thumbnail': view_data.get('pic'),
  99             'uploader': view_data.get('author'),
 100             'timestamp': int_or_none(view_data.get('created')),
 101             'view_count': view_data.get('play'),
 102             'duration': int_or_none(xpath_text(doc, './timelength')),
 103         }
 104
 105         if len(entries) == 1:
 106             entries[0].update(info)
 107             return entries[0]
 108         else:
 109             info.update({
 110                 '_type': 'multi_video',
 111                 'id': video_id,
 112                 'entries': entries,
 113             })
 114             return info