gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube_dl/extractor/twitch.py
[twitch] Move URL matching tests into extractor
[youtube-dl.git] / youtube_dl / extractor / twitch.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import itertools
5 import re
6
7 from .common import InfoExtractor
8 from ..compat import (
9     compat_urllib_parse,
10     compat_urllib_request,
11 )
12 from ..utils import (
13     ExtractorError,
14     parse_iso8601,
15 )
16
17
class TwitchBaseIE(InfoExtractor):
    """Shared plumbing for the Twitch extractors.

    Provides the common URL prefix, API error detection on downloaded JSON
    and an optional account login performed during initialization.
    """

    # Accept both plain and TLS links; the concrete extractors build their
    # _VALID_URL patterns on top of this prefix.
    _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'

    _API_BASE = 'https://api.twitch.tv'
    _LOGIN_URL = 'https://secure.twitch.tv/user/login'

    def _handle_error(self, response):
        """Raise ExtractorError when *response* is a dict carrying a truthy 'error'.

        Anything that is not a dict is ignored.
        """
        if not isinstance(response, dict):
            return
        error = response.get('error')
        if error:
            raise ExtractorError(
                '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
                expected=True)

    def _download_json(self, url, video_id, note='Downloading JSON metadata', *args, **kwargs):
        """Download JSON through the parent class and check it for an API error.

        Any extra positional or keyword arguments are passed to the parent
        implementation untouched, so callers are not restricted to the
        three arguments named here.

        Returns the parent's result; raises ExtractorError (via
        _handle_error) if that result is a dict with a truthy 'error'.
        """
        response = super(TwitchBaseIE, self)._download_json(
            url, video_id, note, *args, **kwargs)
        self._handle_error(response)
        return response

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Log in with the credentials from _get_login_info(), if any.

        Returns without any network access when the username is None.
        Raises ExtractorError when the response page contains an element
        with id "login_error_message".
        """
        (username, password) = self._get_login_info()
        if username is None:
            return

        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        # The hidden token from the login page is sent back with the form.
        authenticity_token = self._search_regex(
            r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
            login_page, 'authenticity token')

        login_form = {
            'utf8': '✓'.encode('utf-8'),
            'authenticity_token': authenticity_token,
            'redirect_on_login': '',
            'embed_form': 'false',
            'mp_source_action': '',
            'follow': '',
            'user[login]': username,
            'user[password]': password,
        }

        request = compat_urllib_request.Request(
            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
        request.add_header('Referer', self._LOGIN_URL)
        response = self._download_webpage(
            request, None, 'Logging in as %s' % username)

        # A failed login is detected from the error element in the returned page.
        m = re.search(
            r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
        if m:
            raise ExtractorError(
                'Unable to login: %s' % m.group('msg').strip(), expected=True)
75
76
class TwitchItemBaseIE(TwitchBaseIE):
    """Common logic for single Twitch items.

    Subclasses are expected to define _ITEM_TYPE (used in progress notes)
    and _ITEM_SHORTCUT (the prefix placed in front of the item id in API
    paths).
    """

    def _download_info(self, item, item_id):
        """Fetch the kraken metadata of an item and convert it with _extract_info()."""
        return self._extract_info(self._download_json(
            '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
            'Downloading %s info JSON' % self._ITEM_TYPE))

    def _extract_media(self, item_id):
        """Build a playlist with one entry per fragment of the item.

        The playlist JSON's 'chunks' value is used as a mapping from a
        quality name to a list of fragments; fragments at the same position
        across qualities become the formats of one entry.
        """
        info = self._download_info(self._ITEM_SHORTCUT, item_id)
        response = self._download_json(
            '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
            'Downloading %s playlist JSON' % self._ITEM_TYPE)
        entries = []
        chunks = response['chunks']
        # keys() and values() of the same unmodified dict iterate in the same
        # order, so position i in a zipped fragment belongs to qualities[i].
        qualities = list(chunks.keys())
        for num, fragment in enumerate(zip(*chunks.values()), start=1):
            formats = []
            for fmt_num, fragment_fmt in enumerate(fragment):
                format_id = qualities[fmt_num]
                fmt = {
                    'url': fragment_fmt['url'],
                    'format_id': format_id,
                    # The 'live' rendition gets the higher quality value.
                    'quality': 1 if format_id == 'live' else 0,
                }
                # Quality names such as "480p" carry the height up front.
                m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
                if m:
                    fmt['height'] = int(m.group('height'))
                formats.append(fmt)
            self._sort_formats(formats)
            entry = dict(info)
            entry['id'] = '%s_%d' % (entry['id'], num)
            entry['title'] = '%s part %d' % (entry['title'], num)
            entry['formats'] = formats
            entries.append(entry)
        return self.playlist_result(entries, info['id'], info['title'])

    def _extract_info(self, info):
        """Map an item metadata dict onto an info dict.

        '_id' and 'title' are required (a KeyError is raised without
        them). Every other field is optional and comes out as None when
        the metadata lacks it, so incomplete metadata does not abort the
        extraction.
        """
        # 'channel' may be absent or null; fall back to an empty mapping.
        channel = info.get('channel') or {}
        recorded_at = info.get('recorded_at')
        return {
            'id': info['_id'],
            'title': info['title'],
            'description': info.get('description'),
            'duration': info.get('length'),
            'thumbnail': info.get('preview'),
            'uploader': channel.get('display_name'),
            'uploader_id': channel.get('name'),
            'timestamp': parse_iso8601(recorded_at) if recorded_at else None,
            'view_count': info.get('views'),
        }

    def _real_extract(self, url):
        return self._extract_media(self._match_id(url))
127
128
class TwitchVideoIE(TwitchItemBaseIE):
    """Extractor for "/<channel>/b/<id>" pages."""

    IE_NAME = 'twitch:video'
    _ITEM_TYPE = 'video'
    _ITEM_SHORTCUT = 'a'

    # Shared site prefix, any channel segment, then "/b/" and the item id.
    _VALID_URL = TwitchBaseIE._VALID_URL_BASE + r'/[^/]+/b/(?P<id>[^/]+)'

    _TEST = {
        'url': 'http://www.twitch.tv/riotgames/b/577357806',
        'playlist_mincount': 12,
        'info_dict': {
            'id': 'a577357806',
            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
        },
    }
143
144
class TwitchChapterIE(TwitchItemBaseIE):
    """Extractor for "/<channel>/c/<id>" pages."""

    IE_NAME = 'twitch:chapter'
    _ITEM_TYPE = 'chapter'
    _ITEM_SHORTCUT = 'c'

    # Shared site prefix, any channel segment, then "/c/" and the item id.
    _VALID_URL = TwitchBaseIE._VALID_URL_BASE + r'/[^/]+/c/(?P<id>[^/]+)'

    _TESTS = [
        {
            'url': 'http://www.twitch.tv/acracingleague/c/5285812',
            'playlist_mincount': 3,
            'info_dict': {
                'id': 'c5285812',
                'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
            },
        },
        {
            # URL matching only.
            'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
            'only_matching': True,
        },
    ]
162
163
class TwitchVodIE(TwitchItemBaseIE):
    """Extractor for "/<channel>/v/<id>" pages, served as an m3u8 playlist."""

    IE_NAME = 'twitch:vod'
    _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
    _ITEM_TYPE = 'vod'
    _ITEM_SHORTCUT = 'v'

    _TEST = {
        'url': 'http://www.twitch.tv/ksptv/v/3622000',
        'info_dict': {
            'id': 'v3622000',
            'ext': 'mp4',
            'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
            # Raw literal: "\." is a regex escape, not a string escape.
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 6951,
            'timestamp': 1419028564,
            'upload_date': '20141219',
            'uploader': 'KSPTV',
            'uploader_id': 'ksptv',
            'view_count': int,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Return the item's info dict with formats taken from its m3u8 playlist.

        The access token and its signature are fetched from the API and
        sent as the 'nauth' / 'nauthsig' query parameters of the usher
        playlist URL.
        """
        item_id = self._match_id(url)
        info = self._download_info(self._ITEM_SHORTCUT, item_id)
        access_token = self._download_json(
            '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
            'Downloading %s access token' % self._ITEM_TYPE)
        # Percent-encode the values so that characters with a meaning inside
        # a query string ('&', '#', '+', spaces, ...) cannot corrupt the URL.
        # A list of pairs keeps the parameter order fixed.
        query = compat_urllib_parse.urlencode([
            ('nauth', access_token['token']),
            ('nauthsig', access_token['sig']),
        ])
        formats = self._extract_m3u8_formats(
            'http://usher.twitch.tv/vod/%s?%s' % (item_id, query),
            item_id, 'mp4')
        info['formats'] = formats
        return info
202
203
class TwitchPlaylistBaseIE(TwitchBaseIE):
    """Common logic for extractors that list a channel's videos page by page.

    Subclasses are expected to define _PLAYLIST_TYPE (used in progress notes).
    """

    _PAGE_LIMIT = 100
    # Placeholders: channel id, offset, limit.
    _PLAYLIST_URL = TwitchBaseIE._API_BASE + '/kraken/channels/%s/videos/?offset=%d&limit=%d'

    def _extract_playlist(self, channel_id):
        """Collect every video of *channel_id* into one playlist result.

        Pages of _PAGE_LIMIT videos are requested until one comes back with
        an empty 'videos' list.
        """
        channel = self._download_json(
            '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
            channel_id, 'Downloading channel info JSON')
        playlist_title = channel.get('display_name') or channel.get('name')

        page_size = self._PAGE_LIMIT
        results = []
        for page_index in itertools.count():
            page_url = self._PLAYLIST_URL % (channel_id, page_index * page_size, page_size)
            page_note = 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, page_index + 1)
            page_videos = self._download_json(page_url, channel_id, page_note)['videos']
            if not page_videos:
                break
            for video in page_videos:
                results.append(self.url_result(video['url']))
        return self.playlist_result(results, channel_id, playlist_title)

    def _real_extract(self, url):
        channel_id = self._match_id(url)
        return self._extract_playlist(channel_id)
229
230
class TwitchProfileIE(TwitchPlaylistBaseIE):
    """Playlist extractor for "/<channel>/profile" pages."""

    IE_NAME = 'twitch:profile'
    _PLAYLIST_TYPE = 'profile'

    # Optional trailing slash and optional "#fragment", anchored at the end.
    _VALID_URL = TwitchBaseIE._VALID_URL_BASE + r'/(?P<id>[^/]+)/profile/?(?:\#.*)?$'

    _TEST = {
        'url': 'http://www.twitch.tv/vanillatv/profile',
        'playlist_mincount': 412,
        'info_dict': {
            'id': 'vanillatv',
            'title': 'VanillaTV',
        },
    }
244
245
class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
    """Playlist extractor for "/<channel>/profile/past_broadcasts" pages."""

    IE_NAME = 'twitch:past_broadcasts'
    _PLAYLIST_TYPE = 'past broadcasts'

    # Same paged endpoint as the parent class, with the broadcasts flag added.
    # Joined rather than %-formatted because the parent URL still contains
    # its own placeholders.
    _PLAYLIST_URL = ''.join((TwitchPlaylistBaseIE._PLAYLIST_URL, '&broadcasts=true'))

    # Optional trailing slash and optional "#fragment", anchored at the end.
    _VALID_URL = TwitchBaseIE._VALID_URL_BASE + r'/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$'

    _TEST = {
        'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
        'playlist_mincount': 54,
        'info_dict': {
            'id': 'spamfish',
            'title': 'Spamfish',
        },
    }