youtube_dl/extractor/youku.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import itertools
   5 import random
   6 import re
   7 import string
   8 import time
   9
  10 from .common import InfoExtractor
  11 from ..utils import (
  12     ExtractorError,
  13     get_element_by_attribute,
  14 )
  15
  16
  17 class YoukuIE(InfoExtractor):
  18     IE_NAME = 'youku'
  19     IE_DESC = '优酷'
  20     _VALID_URL = r'''(?x)
  21         (?:
  22             http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
  23             youku:)
  24         (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
  25     '''
  26
  27     _TESTS = [{
  28         # MD5 is unstable
  29         'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
  30         'info_dict': {
  31             'id': 'XMTc1ODE5Njcy',
  32             'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
  33             'ext': 'mp4',
  34         }
  35     }, {
  36         'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
  37         'only_matching': True,
  38     }, {
  39         'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
  40         'info_dict': {
  41             'id': 'XODgxNjg1Mzk2',
  42             'ext': 'mp4',
  43             'title': '武媚娘传奇 85',
  44         },
  45     }, {
  46         'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
  47         'info_dict': {
  48             'id': 'XMTI1OTczNDM5Mg',
  49             'ext': 'mp4',
  50             'title': '花千骨 04',
  51         },
  52     }, {
  53         'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html',
  54         'note': 'Video protected with password',
  55         'info_dict': {
  56             'id': 'XNjA1NzA2Njgw',
  57             'ext': 'mp4',
  58             'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起',
  59         },
  60         'params': {
  61             'videopassword': '100600',
  62         },
  63     }, {
  64         # /play/get.json contains streams with "channel_type":"tail"
  65         'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html',
  66         'info_dict': {
  67             'id': 'XOTUxMzg4NDMy',
  68             'ext': 'mp4',
  69             'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft',
  70         },
  71     }]
  72
  73     @staticmethod
  74     def get_ysuid():
  75         return '%d%s' % (int(time.time()), ''.join([
  76             random.choice(string.ascii_letters) for i in range(3)]))
  77
  78     def get_format_name(self, fm):
  79         _dict = {
  80             '3gp': 'h6',
  81             '3gphd': 'h5',
  82             'flv': 'h4',
  83             'flvhd': 'h4',
  84             'mp4': 'h3',
  85             'mp4hd': 'h3',
  86             'mp4hd2': 'h4',
  87             'mp4hd3': 'h4',
  88             'hd2': 'h2',
  89             'hd3': 'h1',
  90         }
  91         return _dict.get(fm)
  92
  93     def _real_extract(self, url):
  94         video_id = self._match_id(url)
  95
  96         self._set_cookie('youku.com', '__ysuid', self.get_ysuid())
  97         self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com')
  98
  99         _, urlh = self._download_webpage_handle(
 100             'https://log.mmstat.com/eg.js', video_id, 'Retrieving cna info')
 101         # The etag header is '"foobar"'; let's remove the double quotes
 102         cna = urlh.headers['etag'][1:-1]
 103
 104         # request basic data
 105         basic_data_params = {
 106             'vid': video_id,
 107             'ccode': '0401',
 108             'client_ip': '192.168.1.1',
 109             'utid': cna,
 110             'client_ts': time.time() / 1000,
 111         }
 112
 113         video_password = self._downloader.params.get('videopassword')
 114         if video_password:
 115             basic_data_params['password'] = video_password
 116
 117         headers = {
 118             'Referer': url,
 119         }
 120         headers.update(self.geo_verification_headers())
 121         data = self._download_json(
 122             'https://ups.youku.com/ups/get.json', video_id,
 123             'Downloading JSON metadata',
 124             query=basic_data_params, headers=headers)['data']
 125
 126         error = data.get('error')
 127         if error:
 128             error_note = error.get('note')
 129             if error_note is not None and '因版权原因无法观看此视频' in error_note:
 130                 raise ExtractorError(
 131                     'Youku said: Sorry, this video is available in China only', expected=True)
 132             elif error_note and '该视频被设为私密' in error_note:
 133                 raise ExtractorError(
 134                     'Youku said: Sorry, this video is private', expected=True)
 135             else:
 136                 msg = 'Youku server reported error %i' % error.get('code')
 137                 if error_note is not None:
 138                     msg += ': ' + error_note
 139                 raise ExtractorError(msg)
 140
 141         # get video title
 142         title = data['video']['title']
 143
 144         formats = [{
 145             'url': stream['m3u8_url'],
 146             'format_id': self.get_format_name(stream.get('stream_type')),
 147             'ext': 'mp4',
 148             'protocol': 'm3u8_native',
 149             'filesize': int(stream.get('size')),
 150             'width': stream.get('width'),
 151             'height': stream.get('height'),
 152         } for stream in data['stream'] if stream.get('channel_type') != 'tail']
 153         self._sort_formats(formats)
 154
 155         return {
 156             'id': video_id,
 157             'title': title,
 158             'formats': formats,
 159         }
 160
 161
 162 class YoukuShowIE(InfoExtractor):
 163     _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html'
 164     IE_NAME = 'youku:show'
 165
 166     _TEST = {
 167         'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html',
 168         'info_dict': {
 169             'id': 'zc7c670be07ff11e48b3f',
 170             'title': '花千骨 未删减版',
 171             'description': 'md5:578d4f2145ae3f9128d9d4d863312910',
 172         },
 173         'playlist_count': 50,
 174     }
 175
 176     _PAGE_SIZE = 40
 177
 178     def _find_videos_in_page(self, webpage):
 179         videos = re.findall(
 180             r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage)
 181         return [
 182             self.url_result(video_url, YoukuIE.ie_key(), title)
 183             for video_url, title in videos]
 184
 185     def _real_extract(self, url):
 186         show_id = self._match_id(url)
 187         webpage = self._download_webpage(url, show_id)
 188
 189         entries = self._find_videos_in_page(webpage)
 190
 191         playlist_title = self._html_search_regex(
 192             r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False)
 193         detail_div = get_element_by_attribute('class', 'detail', webpage) or ''
 194         playlist_description = self._html_search_regex(
 195             r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>',
 196             detail_div, 'playlist description', fatal=False)
 197
 198         for idx in itertools.count(1):
 199             episodes_page = self._download_webpage(
 200                 'http://www.youku.com/show_episode/id_%s.html' % show_id,
 201                 show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)},
 202                 note='Downloading episodes page %d' % idx)
 203             new_entries = self._find_videos_in_page(episodes_page)
 204             entries.extend(new_entries)
 205             if len(new_entries) < self._PAGE_SIZE:
 206                 break
 207
 208         return self.playlist_result(entries, show_id, playlist_title, playlist_description)