youtube_dl/extractor/twitch.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import itertools
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..utils import (
   9     compat_urllib_parse,
  10     compat_urllib_request,
  11     ExtractorError,
  12     parse_iso8601,
  13 )
  14
  15
  16 class TwitchIE(InfoExtractor):
  17     # TODO: One broadcast may be split into multiple videos. The key
  18     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
  19     # starts at 1 and increases. Can we treat all parts as one video?
  20     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
  21         (?:
  22             (?P<channelid>[^/]+)|
  23             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
  24             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
  25         )
  26         /?(?:\#.*)?$
  27         """
  28     _PAGE_LIMIT = 100
  29     _API_BASE = 'https://api.twitch.tv'
  30     _LOGIN_URL = 'https://secure.twitch.tv/user/login'
  31     _TESTS = [{
  32         'url': 'http://www.twitch.tv/riotgames/b/577357806',
  33         'info_dict': {
  34             'id': 'a577357806',
  35             'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
  36         },
  37         'playlist_mincount': 12,
  38     }, {
  39         'url': 'http://www.twitch.tv/acracingleague/c/5285812',
  40         'info_dict': {
  41             'id': 'c5285812',
  42             'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
  43         },
  44         'playlist_mincount': 3,
  45     }, {
  46         'url': 'http://www.twitch.tv/vanillatv',
  47         'info_dict': {
  48             'id': 'vanillatv',
  49             'title': 'VanillaTV',
  50         },
  51         'playlist_mincount': 412,
  52     }]
  53
  54     def _handle_error(self, response):
  55         if not isinstance(response, dict):
  56             return
  57         error = response.get('error')
  58         if error:
  59             raise ExtractorError(
  60                 '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
  61                 expected=True)
  62
  63     def _download_json(self, url, video_id, note='Downloading JSON metadata'):
  64         response = super(TwitchIE, self)._download_json(url, video_id, note)
  65         self._handle_error(response)
  66         return response
  67
  68     def _extract_media(self, item, item_id):
  69         ITEMS = {
  70             'a': 'video',
  71             'c': 'chapter',
  72         }
  73         info = self._extract_info(self._download_json(
  74             '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
  75             'Downloading %s info JSON' % ITEMS[item]))
  76         response = self._download_json(
  77             '%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
  78             'Downloading %s playlist JSON' % ITEMS[item])
  79         entries = []
  80         chunks = response['chunks']
  81         qualities = list(chunks.keys())
  82         for num, fragment in enumerate(zip(*chunks.values()), start=1):
  83             formats = []
  84             for fmt_num, fragment_fmt in enumerate(fragment):
  85                 format_id = qualities[fmt_num]
  86                 fmt = {
  87                     'url': fragment_fmt['url'],
  88                     'format_id': format_id,
  89                     'quality': 1 if format_id == 'live' else 0,
  90                 }
  91                 m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
  92                 if m:
  93                     fmt['height'] = int(m.group('height'))
  94                 formats.append(fmt)
  95             self._sort_formats(formats)
  96             entry = dict(info)
  97             entry['id'] = '%s_%d' % (entry['id'], num)
  98             entry['title'] = '%s part %d' % (entry['title'], num)
  99             entry['formats'] = formats
 100             entries.append(entry)
 101         return self.playlist_result(entries, info['id'], info['title'])
 102
 103     def _extract_info(self, info):
 104         return {
 105             'id': info['_id'],
 106             'title': info['title'],
 107             'description': info['description'],
 108             'duration': info['length'],
 109             'thumbnail': info['preview'],
 110             'uploader': info['channel']['display_name'],
 111             'uploader_id': info['channel']['name'],
 112             'timestamp': parse_iso8601(info['recorded_at']),
 113             'view_count': info['views'],
 114         }
 115
    def _real_initialize(self):
        """Initialization hook; the only work done here is the login attempt.

        NOTE(review): presumably invoked by the base extractor before any
        extraction starts -- confirm against InfoExtractor.
        """
        self._login()
 118
 119     def _login(self):
 120         (username, password) = self._get_login_info()
 121         if username is None:
 122             return
 123
 124         login_page = self._download_webpage(
 125             self._LOGIN_URL, None, 'Downloading login page')
 126
 127         authenticity_token = self._search_regex(
 128             r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
 129             login_page, 'authenticity token')
 130
 131         login_form = {
 132             'utf8': '✓'.encode('utf-8'),
 133             'authenticity_token': authenticity_token,
 134             'redirect_on_login': '',
 135             'embed_form': 'false',
 136             'mp_source_action': '',
 137             'follow': '',
 138             'user[login]': username,
 139             'user[password]': password,
 140         }
 141
 142         request = compat_urllib_request.Request(
 143             self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
 144         request.add_header('Referer', self._LOGIN_URL)
 145         response = self._download_webpage(
 146             request, None, 'Logging in as %s' % username)
 147
 148         m = re.search(
 149             r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
 150         if m:
 151             raise ExtractorError(
 152                 'Unable to login: %s' % m.group('msg').strip(), expected=True)
 153
 154     def _real_extract(self, url):
 155         mobj = re.match(self._VALID_URL, url)
 156         if mobj.group('chapterid'):
 157             return self._extract_media('c', mobj.group('chapterid'))
 158
 159             """
 160             webpage = self._download_webpage(url, chapter_id)
 161             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
 162             if not m:
 163                 raise ExtractorError('Cannot find archive of a chapter')
 164             archive_id = m.group(1)
 165
 166             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
 167             doc = self._download_xml(
 168                 api, chapter_id,
 169                 note='Downloading chapter information',
 170                 errnote='Chapter information download failed')
 171             for a in doc.findall('.//archive'):
 172                 if archive_id == a.find('./id').text:
 173                     break
 174             else:
 175                 raise ExtractorError('Could not find chapter in chapter information')
 176
 177             video_url = a.find('./video_file_url').text
 178             video_ext = video_url.rpartition('.')[2] or 'flv'
 179
 180             chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
 181             chapter_info = self._download_json(
 182                 chapter_api_url, 'c' + chapter_id,
 183                 note='Downloading chapter metadata',
 184                 errnote='Download of chapter metadata failed')
 185
 186             bracket_start = int(doc.find('.//bracket_start').text)
 187             bracket_end = int(doc.find('.//bracket_end').text)
 188
 189             # TODO determine start (and probably fix up file)
 190             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
 191             #video_url += '?start=' + TODO:start_timestamp
 192             # bracket_start is 13290, but we want 51670615
 193             self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
 194                                             'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
 195
 196             info = {
 197                 'id': 'c' + chapter_id,
 198                 'url': video_url,
 199                 'ext': video_ext,
 200                 'title': chapter_info['title'],
 201                 'thumbnail': chapter_info['preview'],
 202                 'description': chapter_info['description'],
 203                 'uploader': chapter_info['channel']['display_name'],
 204                 'uploader_id': chapter_info['channel']['name'],
 205             }
 206             return info
 207             """
 208         elif mobj.group('videoid'):
 209             return self._extract_media('a', mobj.group('videoid'))
 210         elif mobj.group('channelid'):
 211             channel_id = mobj.group('channelid')
 212             info = self._download_json(
 213                 '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
 214                 channel_id, 'Downloading channel info JSON')
 215             channel_name = info.get('display_name') or info.get('name')
 216             entries = []
 217             offset = 0
 218             limit = self._PAGE_LIMIT
 219             for counter in itertools.count(1):
 220                 response = self._download_json(
 221                     '%s/kraken/channels/%s/videos/?offset=%d&limit=%d'
 222                     % (self._API_BASE, channel_id, offset, limit),
 223                     channel_id, 'Downloading channel videos JSON page %d' % counter)
 224                 videos = response['videos']
 225                 if not videos:
 226                     break
 227                 entries.extend([self.url_result(video['url'], 'Twitch') for video in videos])
 228                 offset += limit
 229             return self.playlist_result(entries, channel_id, channel_name)