youtube_dl/extractor/iqiyi.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5 from .common import InfoExtractor
   6
   7 from ..compat import compat_urllib_parse
   8
   9 from ..utils import ExtractorError
  10
  11 import re
  12 import time
  13 import uuid
  14 import math
  15 import random
  16 import zlib
  17 import hashlib
  18
  19
  20 class IqiyiIE(InfoExtractor):
  21     IE_NAME = 'iqiyi'
  22
  23     _VALID_URL = r'http://(?:www\.)iqiyi.com/.+?\.html'
  24
  25     _TEST = {
  26         'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
  27         'md5': '2cb594dc2781e6c941a110d8f358118b',
  28         'info_dict': {
  29             'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
  30             'title': '美国德州空中惊现奇异云团 酷似UFO',
  31             'ext': 'f4v',
  32         }
  33     }
  34
  35     def construct_video_urls(self, data, video_id, _uuid, bid):
  36         def do_xor(x, y):
  37             a = y % 3
  38             if a == 1:
  39                 return x ^ 121
  40             if a == 2:
  41                 return x ^ 72
  42             return x ^ 103
  43
  44         def get_encode_code(l):
  45             a = 0
  46             b = l.split('-')
  47             c = len(b)
  48             s = ''
  49             for i in range(c - 1, -1, -1):
  50                 a = do_xor(int(b[c - i - 1], 16), i)
  51                 s += chr(a)
  52             return s[::-1]
  53
  54         def get_path_key(x):
  55             mg = ')(*&^flash@#$%a'
  56             tm = self._download_json(
  57                 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id)['t']
  58             t = str(int(math.floor(int(tm) / (600.0))))
  59             return hashlib.md5((t + mg + x).encode('utf8')).hexdigest()
  60
  61         # get accept format
  62         # getting all format will spend minutes for a big video.
  63         if bid == 'best':
  64             bids = [int(i['bid']) for i in data['vp']['tkl'][0]['vs']
  65                     if 0 < int(i['bid']) <= 10]
  66             bid = str(max(bids))
  67
  68         video_urls_dict = {}
  69         for i in data['vp']['tkl'][0]['vs']:
  70             if 0 < int(i['bid']) <= 10:
  71                 format_id = self.get_format(i['bid'])
  72             else:
  73                 continue
  74
  75             video_urls = []
  76
  77             video_urls_info = i['fs']
  78             if not i['fs'][0]['l'].startswith('/'):
  79                 t = get_encode_code(i['fs'][0]['l'])
  80                 if t.endswith('mp4'):
  81                     video_urls_info = i['flvs']
  82
  83             if int(i['bid']) != int(bid):  # ignore missing match format
  84                 video_urls.extend(
  85                     [('http://example.com/v.flv', ii['b']) for ii in video_urls_info])
  86                 video_urls_dict[format_id] = video_urls
  87                 continue
  88
  89             for ii in video_urls_info:
  90                 vl = ii['l']
  91                 if not vl.startswith('/'):
  92                     vl = get_encode_code(vl)
  93                 key = get_path_key(
  94                     vl.split('/')[-1].split('.')[0])
  95                 filesize = ii['b']
  96                 base_url = data['vp']['du'].split('/')
  97                 base_url.insert(-1, key)
  98                 base_url = '/'.join(base_url)
  99                 param = {
 100                     'su': _uuid,
 101                     'qyid': uuid.uuid4().hex,
 102                     'client': '',
 103                     'z': '',
 104                     'bt': '',
 105                     'ct': '',
 106                     'tn': str(int(time.time()))
 107                 }
 108                 api_video_url = base_url + vl + '?' + \
 109                     compat_urllib_parse.urlencode(param)
 110                 js = self._download_json(api_video_url, video_id)
 111                 video_url = js['l']
 112                 video_urls.append(
 113                     (video_url, filesize))
 114
 115             video_urls_dict[format_id] = video_urls
 116         return video_urls_dict
 117
 118     def get_format(self, bid):
 119         _dict = {
 120             '1': 'h6',
 121             '2': 'h5',
 122             '3': 'h4',
 123             '4': 'h3',
 124             '5': 'h2',
 125             '10': 'h1'
 126         }
 127         return _dict.get(str(bid), None)
 128
 129     def get_bid(self, format_id):
 130         _dict = {
 131             'h6': '1',
 132             'h5': '2',
 133             'h4': '3',
 134             'h3': '4',
 135             'h2': '5',
 136             'h1': '10',
 137             'best': 'best'
 138         }
 139         return _dict.get(format_id, None)
 140
 141     def get_raw_data(self, tvid, video_id, enc_key, _uuid):
 142         tm = str(int(time.time()))
 143         param = {
 144             'key': 'fvip',
 145             'src': hashlib.md5(b'youtube-dl').hexdigest(),
 146             'tvId': tvid,
 147             'vid': video_id,
 148             'vinfo': 1,
 149             'tm': tm,
 150             'enc': hashlib.md5(
 151                 (enc_key + tm + tvid).encode('utf8')).hexdigest(),
 152             'qyid': _uuid,
 153             'tn': random.random(),
 154             'um': 0,
 155             'authkey': hashlib.md5(
 156                 (tm + tvid).encode('utf8')).hexdigest()
 157         }
 158
 159         api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
 160             compat_urllib_parse.urlencode(param)
 161         raw_data = self._download_json(api_url, video_id)
 162         return raw_data
 163
 164     def get_enc_key(self, swf_url, video_id):
 165         req = self._request_webpage(
 166             swf_url, video_id, note='download swf content')
 167         cn = req.read()
 168         cn = zlib.decompress(cn[8:])
 169         pt = re.compile(b'MixerRemote\x08(?P<enc_key>.+?)\$&vv')
 170         enc_key = self._search_regex(pt, cn, 'enc_key').decode('utf8')
 171         return enc_key
 172
 173     def _real_extract(self, url):
 174         webpage = self._download_webpage(
 175             url, 'temp_id', note='download video page')
 176         tvid = self._search_regex(
 177             r'tvId ?= ?(\'|\")(?P<tvid>\d+)', webpage, 'tvid', flags=re.I, group='tvid')
 178         video_id = self._search_regex(
 179             r'videoId ?= ?(\'|\")(?P<video_id>[a-z\d]+)',
 180             webpage, 'video_id', flags=re.I, group='video_id')
 181         swf_url = self._search_regex(
 182             r'(?P<swf>http://.+?MainPlayer.+?\.swf)', webpage, 'swf')
 183         _uuid = uuid.uuid4().hex
 184
 185         enc_key = self.get_enc_key(swf_url, video_id)
 186
 187         raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
 188         assert raw_data['code'] == 'A000000'
 189         if not raw_data['data']['vp']['tkl']:
 190             raise ExtractorError('No support iQiqy VIP video')
 191
 192         data = raw_data['data']
 193
 194         title = data['vi']['vn']
 195
 196         format = self._downloader.params.get('format', None)
 197         bid = self.get_bid(format) if format else 'best'
 198         if not bid:
 199             raise ExtractorError('Can\'t get format.')
 200
 201         # generate video_urls_dict
 202         video_urls_dict = self.construct_video_urls(
 203             data, video_id, _uuid, bid)
 204
 205         # construct info
 206         entries = []
 207         for format_id in video_urls_dict:
 208             video_urls = video_urls_dict[format_id]
 209             for i, video_url_info in enumerate(video_urls):
 210                 if len(entries) < i + 1:
 211                     entries.append({'formats': []})
 212                 entries[i]['formats'].append(
 213                     {
 214                         'url': video_url_info[0],
 215                         'filesize': video_url_info[-1],
 216                         'format_id': format_id,
 217                         'preference': int(self.get_bid(format_id))
 218                     }
 219                 )
 220
 221         for i in range(len(entries)):
 222             self._sort_formats(entries[i]['formats'])
 223             entries[i].update(
 224                 {
 225                     'id': '_part%d' % (i + 1),
 226                     'title': title,
 227                 }
 228             )
 229
 230         if len(entries) > 1:
 231             info = {
 232                 '_type': 'multi_video',
 233                 'id': video_id,
 234                 'title': title,
 235                 'entries': entries,
 236             }
 237         else:
 238             info = entries[0]
 239             info['id'] = video_id
 240             info['title'] = title
 241
 242         return info