]> gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube_dl/InfoExtractors.py
don't catch YT user URLs in YoutubePlaylistIE (fix #754, fix #763)
[youtube-dl.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; _ready is shadowed per-instance in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """IE name, derived from the class name minus the trailing 'IE'.

        Subclasses may shadow this property with a plain class attribute.
        """
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Second argument is the current traceback; NOTE(review): `sys` is
            # not imported at the top of this file -- presumably it reaches this
            # scope via `from .utils import *`; verify.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Sniff the charset out of the Content-Type header, e.g.
        # "text/html; charset=iso-8859-1"; fall back to UTF-8 when absent.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction going even on undecodable bytes.
        return webpage_bytes.decode(encoding, 'replace')
137
138
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension; formats absent here default to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string used by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the base-class IE_NAME property with a plain attribute.
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Defer playlist URLs to YoutubePlaylistIE, which claims them first.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check the available subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download video subtitles for a language."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a {lang_code: lang_name} dict of available subtitles.

        NOTE(review): on failure this returns a (warning_message, None)
        tuple instead of a dict -- callers must cope with both shapes.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Key by lang_code, value is the human-readable track name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        # NOTE(review): if _get_available_subtitles failed, this passes its
        # (warning, None) error tuple to report_video_subtitles_available,
        # whose .keys() call would raise -- confirm intended.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Fetch one subtitle track; return (None, lang, data) or (warning, None)."""
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not sub:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """Return a one-element list with the subtitle tuple for the chosen language."""
        # NOTE(review): if _get_available_subtitles returned its error tuple,
        # the membership test below runs against a tuple and the .keys() call
        # would raise -- confirm upstream failure handling.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        # Language preference: explicit --sub-lang, then English, then first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list with one subtitle tuple per available language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print each itag with its extension and dimensions (for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, if credentials are given, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the GALX and dsh hidden form tokens the login POST requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being present in the response means login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID embedded in url, or None after reporting an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # group(1) is the optional URL prefix; group(2) is the video ID itself.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download the watch page and video info, then build the result dict list."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-style backslash escapes in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        # Try several 'el' variants; stop at the first response containing a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to single spaces before trying date formats.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except deliberately skips non-matching
                    # formats; after a successful parse the remaining formats
                    # simply fail and are ignored (no break).
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                # NOTE(review): on failure _extract_subtitle may yield a
                # 2-tuple (warning, None); this 3-way unpack assumes success.
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict carries a 'sig' field;
            # a missing 'sig' raises KeyError here -- confirm upstream data.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
628
629
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Handles http://www.metacafe.com/watch/<id>/<title>/ pages.  Video ids
    of the form 'yt-<id>' are YouTube imports and are delegated back to
    the downloader so the YouTube extractor can handle them.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        # urlopen() requires bytes for POST data on Python 3; the encode is
        # a no-op on Python 2 where the urlencode result is ASCII bytes.
        post_data = compat_urllib_parse.urlencode(disclaimer_form).encode('utf-8')
        request = compat_urllib_request.Request(self._FILTER_POST, post_data)
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: .read() returns bytes on Python 3 and the
            # str regular expressions below would otherwise raise TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the player markup.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # webpage is already text, so the former .decode('utf-8') calls are
        # gone: str has no decode() method on Python 3.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
755
756
class DailymotionIE(InfoExtractor):
    """Extractor for dailymotion.com video pages."""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        message = u'[dailymotion] %s: Extracting information' % video_id
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        # Pull the bare video id out of the URL (drop title slug and query).
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = url_match.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The player configuration lives in a JS "flashvars" assignment.
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Pick the best quality present, scanning from highest to lowest.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        media_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if media_match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(media_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Try the regular owner span first, then fall back to official users.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; convert to the YYYYMMDD convention.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
844
845
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Matches media-page URLs whose query string carries a 'current=<name>.flv'
    parameter and resolves the direct FLV URL from the page markup.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: .read() returns bytes on Python 3 and the
            # str regular expressions below would otherwise raise TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # All extracted values are already text; the former .decode('utf-8')
        # calls would raise AttributeError on Python 3 (str has no decode).
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
909
910
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first rewritten to their canonical English
    /watch/<vid>/<id> form (one recursive call), then the flash playlist
    service is queried for the actual media URL.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode immediately: .read() returns bytes on Python 3 and
                # the str regular expressions below need text.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) is the (people|profile) alternation; the uploader's name
        # is captured by the second group.
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        # Values are already text; the former .decode('utf-8') calls would
        # raise AttributeError on Python 3.
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1052
1053
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Reads the player config JSON embedded in the watch page, then builds a
    play_redirect URL for the best available codec/quality combination.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize protocol-less and direct-link URLs to the watch page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is absent from the page;
            # ValueError: the extracted blob is not valid JSON.  A bare
            # except here would also swallow KeyboardInterrupt/SystemExit.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer HD, then SD, then whatever quality the codec offers.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1172
1173
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Distinguishes live streams (URLs ending in index-<n>.html) from regular
    "Plus 7" catch-up videos and scrapes each through a chain of
    grep_webpage() lookups.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body as text, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # Decode immediately: .read() returns bytes on Python 3 and the
            # str regular expressions used by grep_webpage() need text.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and collect the capture groups listed in matchTuples.

        matchTuples is a list of (group_index, key, error_message) tuples;
        returns a dict mapping each key to its captured text, or None if the
        page could not be fetched or did not match.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # The download failure was already reported; bail out instead of
            # passing None to re.search(), which would raise TypeError.
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # err already carries an 'ERROR: ' prefix, so trouble() is
                # used instead of report_error() (which would prefix again).
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream URL.

        NOTE: the computed video_url is currently discarded and
        _real_extract() returns nothing for live streams; this pre-existing
        limitation is kept as-is (TODO: return an info dict).
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract a regular "Plus 7" video and return its info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # fetch_webpage() already decoded the page, so the title is text
            # and must not be .decode()d again (str has no decode on py3).
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams are resolved but not yet returned (see
            # extractLiveStream).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1308
1309
1310 class GenericIE(InfoExtractor):
1311     """Generic last-resort information extractor."""
1312
1313     _VALID_URL = r'.*'
1314     IE_NAME = u'generic'
1315
1316     def __init__(self, downloader=None):
1317         InfoExtractor.__init__(self, downloader)
1318
1319     def report_download_webpage(self, video_id):
1320         """Report webpage download."""
1321         if not self._downloader.params.get('test', False):
1322             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1323         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1324
1325     def report_extraction(self, video_id):
1326         """Report information extraction."""
1327         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1328
1329     def report_following_redirect(self, new_url):
1330         """Report information extraction."""
1331         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1332
    def _test_redirect(self, url):
        """Check whether *url* merely redirects elsewhere (e.g. a URL
        shortener).  If so, hand the final URL back to the downloader and
        return True so the extraction chain restarts; otherwise return False.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET so redirects
            # can be followed without downloading any response bodies.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Escape spaces; some servers emit unencoded Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers: a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other redirect-ish status is treated as an error.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying as GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened; let normal extraction proceed on this URL.
        if url == new_url:
            return False

        # Redirect detected: restart the processing chain on the final URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1387
1388     def _real_extract(self, url):
1389         if self._test_redirect(url): return
1390
1391         video_id = url.split('/')[-1]
1392         try:
1393             webpage = self._download_webpage(url, video_id)
1394         except ValueError as err:
1395             # since this is the last-resort InfoExtractor, if
1396             # this error is thrown, it'll be thrown here
1397             self._downloader.report_error(u'Invalid URL: %s' % url)
1398             return
1399
1400         self.report_extraction(video_id)
1401         # Start with something easy: JW Player in SWFObject
1402         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1403         if mobj is None:
1404             # Broaden the search a little bit
1405             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1406         if mobj is None:
1407             # Broaden the search a little bit: JWPlayer JS loader
1408             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1409         if mobj is None:
1410             self._downloader.report_error(u'Invalid URL: %s' % url)
1411             return
1412
1413         # It's possible that one of the regexes
1414         # matched, but returned an empty group:
1415         if mobj.group(1) is None:
1416             self._downloader.report_error(u'Invalid URL: %s' % url)
1417             return
1418
1419         video_url = compat_urllib_parse.unquote(mobj.group(1))
1420         video_id = os.path.basename(video_url)
1421
1422         # here's a fun little line of code for you:
1423         video_extension = os.path.splitext(video_id)[1][1:]
1424         video_id = os.path.splitext(video_id)[0]
1425
1426         # it's tempting to parse this further, but you would
1427         # have to take into account all the variations like
1428         #   Video Title - Site Name
1429         #   Site Name | Video Title
1430         #   Video Title - Tagline | Site Name
1431         # and so on and so forth; it's just not practical
1432         mobj = re.search(r'<title>(.*)</title>', webpage)
1433         if mobj is None:
1434             self._downloader.report_error(u'unable to extract title')
1435             return
1436         video_title = mobj.group(1)
1437
1438         # video uploader is domain name
1439         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1440         if mobj is None:
1441             self._downloader.report_error(u'unable to extract title')
1442             return
1443         video_uploader = mobj.group(1)
1444
1445         return [{
1446             'id':       video_id,
1447             'url':      video_url,
1448             'uploader': video_uploader,
1449             'upload_date':  None,
1450             'title':    video_title,
1451             'ext':      video_extension,
1452         }]
1453
1454
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearch[N|all]:...)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix ('', 'all' or a count) and fetch results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # Fixed: use report_error like every other extractor instead of
            # the deprecated trouble() method, and the idiomatic "not in".
            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # The API may report fewer total results than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1532         return
1533
1534
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Dispatch on the gvsearch prefix: '', 'all' or a result count."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                count = self._max_google_results
            self._download_n_results(query, count)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        page_index = 0

        while True:
            self.report_download_page(query, page_index)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_index*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect video identifiers, keeping first-seen order
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in collected:
                    continue
                collected.append(candidate)
                if len(collected) == n:
                    # Specified n videos reached
                    for vid in collected:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            # No "next page" marker: queue what we have and stop
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in collected:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            page_index += 1
1615
1616
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Split the yvsearch prefix from the query and fetch the results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            wanted = int(prefix)
            if wanted <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (wanted, query))
                return
            if wanted > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, wanted))
                wanted = self._max_yahoo_results
            self._download_n_results(query, wanted)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        found = []
        seen = set()
        page_no = 1

        while True:
            self.report_download_page(query, page_no)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_no)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect video identifiers; first occurrence wins
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in seen:
                    continue
                found.append(vid)
                seen.add(vid)
                if len(found) == n:
                    # Specified n videos reached
                    for vid in found:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            # Last page: queue everything collected so far and stop
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in found:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            page_no += 1
1701
1702
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # must be overridden to pass that flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video URLs of a playlist via the gdata API and queue them."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            # Fixed: build the API URL in its own variable instead of
            # clobbering the `url` parameter.
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(api_url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            # Fixed: idiomatic membership tests ("x not in y", not "not x in y")
            if 'feed' not in response or 'entry' not in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Entries without 'content' correspond to deleted/private videos.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means we have reached the end of the playlist.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1793
1794
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of a channel by walking its paginated video list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        collected_ids = []
        page_no = 1

        # Walk channel pages until the "next page" marker disappears
        while True:
            self.report_download_page(channel_id, page_no)
            page_url = self._TEMPLATE_URL % (channel_id, page_no)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Pull out video ids, dropping duplicates within the page
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            collected_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_no += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected_ids)))

        for video_id in collected_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1845
1846
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue all uploads of a user, page by page, via the gdata API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The gdata API caps each response at _GDATA_PAGE_SIZE entries, so
        # keep requesting successive pages until one comes back short.
        collected = []
        page_index = 0

        while True:
            first = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicated ids found on this page, in order of appearance
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            collected.extend(page_ids)

            # A page shorter than _GDATA_PAGE_SIZE is the last one; stop
            # querying further pages.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_index += 1

        all_ids_count = len(collected)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            collected = collected[playliststart:]
        else:
            collected = collected[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(collected)))

        for vid in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
1928
1929
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue all videos of a blip.tv user via the mobile AJAX endpoint."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Fixed: a page without data-users-id used to raise an uncaught
        # AttributeError (mobj.group on None); report a proper error instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv user id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fixed: use compat_str like every other extractor
                # (str(err) breaks on non-ASCII messages under Python 2)
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Fixed: deduplicate on the unescaped id; the old code tested
                # the raw match but stored the unescaped value, so escaped
                # duplicates slipped through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page shorter than _PAGE_SIZE is the last one - there are
            # no more ids on further pages, so no need to query again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2019
2020
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles link to the real file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Fixed: decode the response so the str regexes below work on
            # Python 3 (re.search with a str pattern crashed on bytes).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Fixed: raw string for the regex so \s is not an (invalid)
                # string escape.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # Fixed: the page is already text here; the old .decode('utf-8')
        # calls on str objects raised AttributeError on Python 3.
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2079
2080
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data.

        Logging in is optional; without credentials extraction proceeds
        anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials: skip login, public videos may still work.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail.

        The page embeds the stream URLs JS-escaped inside the SWF
        parameter list; we locate that list by its literal surroundings
        and JSON-decode it.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Use .get(): params['hd_src'] raised KeyError when no HD stream
        # exists, instead of falling back to the SD stream.
        video_url = params.get('hd_src')
        if not video_url:
            video_url = params.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            # thumbnail is optional; missing key must not abort extraction
            'thumbnail': params.get('thumbnail_src'),
        }
        return [info]
2176
2177
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the end of a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the server answered with the media file directly."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Three cases are handled:
        1. /play/ embed URLs: follow the HTTP redirect, recover the file
           id from the redirect URL's fragment and recurse on the
           canonical http://blip.tv/a/a-<id> URL.
        2. The JSON request is answered with the media itself
           (Content-Type video/*): treat it as a direct download.
        3. Otherwise: parse the returned JSON metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Case 1: embedded-player URL; resolve it and recurse
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            # The file id is the last path component of the 'file' value
            # in the redirect URL's fragment
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the page for JSON metadata instead of HTML
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # NOTE(review): the iTunes User-Agent is presumably required by
        # this endpoint (it is also recorded in the result below) — confirm
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode is Python 2 only — on Python 3
                # this line would raise AttributeError
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # hand the open handle to the downloader so the body
                    # is not fetched twice
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The metadata may be wrapped in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2278
2279
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # NOTE: the boilerplate __init__ that only delegated to
    # InfoExtractor.__init__ has been removed; the base constructor
    # is inherited unchanged.

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title for a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: was self._download.report_error, which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media server base URL; the video
        # itself lives at <base>/<id>.flv
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2328
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates the site offers, in ascending order of quality
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (used by --list-formats)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame dimensions per bitrate (used by --list-formats)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose form, so match with re.VERBOSE
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report the download of one media item's configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report the download of the show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print bitrate, extension and dimensions for each known format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all video parts of a show, episode or clip URL.

        Returns a list of info dicts, one per <item> of the episode's
        MRSS index (multi-part episodes yield several results).
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcuts like :tds / :colbert map to the full-episodes page
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "download the newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the newest episode;
            # re-match the redirected URL to recover its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in an attribute
            # without a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index is one part of the episode
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config XML
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL into a plain HTTP download URL
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2523
2524
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video URL and metadata from an Escapist episode page.

        The page's og: meta tags point to the Flash player, whose
        'config=' query parameter references a JSON-ish playlist file
        containing the actual media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset from the Content-Type header, if present
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Guard every scrape: a missing tag used to crash with
        # AttributeError on .group(1); report a proper error instead.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.report_error(u'unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.report_error(u'unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2598
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Announce the download of the XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the moogaloop metadata and f4m manifest into an info dict."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group('videoid')

        info = {'id': video_id, 'uploader': None, 'upload_date': None}

        # Step 1: fetch the moogaloop metadata XML for this video id
        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Step 2: fetch the f4m manifest referenced by the metadata
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Step 3: assemble the first-fragment URL from the manifest pieces
        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = '%s://%s/z%s/%sSeg1-Frag1' % (parsed.scheme, parsed.netloc, video_id[:-2], node_id)
        info['ext'] = 'f4f'
        return [info]
2669
2670
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL sits url-encoded in the player flashvars
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the page <title>, minus the site suffix
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        # The thumbnail URL appears verbatim in the page source
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2728
2729
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce permalink resolution via the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce stream retrieval."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track permalink and return its mp3 stream info."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both uploader and song slug are encoded in the URL path
        uploader = match.group(1)
        slug_title = match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into an API track object
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the per-format stream URLs for this track id
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2802
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    The set URL is resolved through the public API to a playlist object,
    then each track's stream definitions are fetched to locate its
    128 kbps MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a list of info dicts, one per track of the set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error for consistency with the other extractors;
            # trouble() with an explicit 'ERROR:' prefix is the deprecated form.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also part of the url)
        # (the old 'simple_title' concatenation was computed but never used)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                # NOTE(review): raw API timestamp, not YYYYMMDD — confirm.
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2883
2884
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP URL, title and description of an InfoQ talk."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real id is base64-encoded in the page
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split only on the last dot so extra dots inside the filename do
        # not break the unpacking (split('.') raised ValueError there).
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2938
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; fall through to the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud API for the cloudcast and pick a working URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (match groups are already text on both Python 2 and 3; the old
        # str.decode('utf-8') calls raised AttributeError on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON; decode explicitly since json.loads() only accepts
        # bytes from Python 3.6 on
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3053
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, a whole course, or the root listing.

        The URL shape selects the branch: course+video -> one video,
        course only -> every video of that course (recursively), anything
        else -> every course reachable from the home page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            # ElementTree accepts the raw bytes directly.
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Decode before matching: Python 3's re refuses to apply a
                # str pattern to the bytes that urlopen() returns.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3165
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the video page for metadata, then pick the best rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns decoded text, so no .decode() here
        # (str.decode('iso-8859-1') raised AttributeError on Python 3).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message fixed: it previously read 'unable to mtvn_uri'
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error for consistency; trouble() is the deprecated form.
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3245
3246
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp plus two
        random numbers, as expected by the getFlvPath endpoint."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the seed-shuffled alphabet used to decode file ids.

        Each step advances a linear-congruential seed and moves one
        character from the source alphabet into the result, so the
        output ordering is fully determined by the seed.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index list into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and build one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        # IndexError added: the API may return an empty 'data' list, which
        # previously escaped this handler and crashed the extractor.
        except (UnicodeDecodeError, ValueError, KeyError, IndexError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3356
3357
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[{0}] {1}: Downloading webpage'.format(self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[{0}] {1}: Extracting information'.format(self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the page and pull out the flv URL, title and thumbnail."""
        url_m = re.match(self._VALID_URL, url)
        if url_m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_m.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page contents.
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3420
3421
3422 class GooglePlusIE(InfoExtractor):
3423     """Information extractor for plus.google.com."""
3424
3425     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3426     IE_NAME = u'plus.google'
3427
3428     def __init__(self, downloader=None):
3429         InfoExtractor.__init__(self, downloader)
3430
    def report_extract_entry(self, url):
        """Report that the post entry page is being downloaded."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3434
    def report_date(self, upload_date):
        """Report the upload date extracted from the entry."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3438
    def report_uploader(self, uploader):
        """Report the uploader extracted from the entry."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3442
    def report_title(self, video_title):
        """Report the title extracted from the entry."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3446
    def report_extract_vid_page(self, video_page):
        """Report that the video page is being extracted."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3450
3451     def _real_extract(self, url):
3452         # Extract id from URL
3453         mobj = re.match(self._VALID_URL, url)
3454         if mobj is None:
3455             self._downloader.report_error(u'Invalid URL: %s' % url)
3456             return
3457
3458         post_url = mobj.group(0)
3459         video_id = mobj.group(1)
3460
3461         video_extension = 'flv'
3462
3463         # Step 1, Retrieve post webpage to extract further information
3464         self.report_extract_entry(post_url)
3465         request = compat_urllib_request.Request(post_url)
3466         try:
3467             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3468         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3469             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3470             return
3471
3472         # Extract update date
3473         upload_date = None
3474         pattern = 'title="Timestamp">(.*?)</a>'
3475         mobj = re.search(pattern, webpage)
3476         if mobj:
3477             upload_date = mobj.group(1)
3478             # Convert timestring to a format suitable for filename
3479             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3480             upload_date = upload_date.strftime('%Y%m%d')
3481         self.report_date(upload_date)
3482
3483         # Extract uploader
3484         uploader = None
3485         pattern = r'rel\="author".*?>(.*?)</a>'
3486         mobj = re.search(pattern, webpage)
3487         if mobj:
3488             uploader = mobj.group(1)
3489         self.report_uploader(uploader)
3490
3491         # Extract title
3492         # Get the first line for title
3493         video_title = u'NA'
3494         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3495         mobj = re.search(pattern, webpage)
3496         if mobj:
3497             video_title = mobj.group(1)
3498         self.report_title(video_title)
3499
3500         # Step 2, Stimulate clicking the image box to launch video
3501         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3502         mobj = re.search(pattern, webpage)
3503         if mobj is None:
3504             self._downloader.report_error(u'unable to extract video page URL')
3505
3506         video_page = mobj.group(1)
3507         request = compat_urllib_request.Request(video_page)
3508         try:
3509             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3510         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3511             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3512             return
3513         self.report_extract_vid_page(video_page)
3514
3515
3516         # Extract video links on video page
3517         """Extract video links of all sizes"""
3518         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3519         mobj = re.findall(pattern, webpage)
3520         if len(mobj) == 0:
3521             self._downloader.report_error(u'unable to extract video links')
3522
3523         # Sort in resolution
3524         links = sorted(mobj)
3525
3526         # Choose the lowest of the sort, i.e. highest resolution
3527         video_url = links[-1]
3528         # Only get the url. The resolution part in the tuple has no use anymore
3529         video_url = video_url[-1]
3530         # Treat escaped \u0026 style hex
3531         try:
3532             video_url = video_url.decode("unicode_escape")
3533         except AttributeError: # Python 3
3534             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3535
3536
3537         return [{
3538             'id':       video_id,
3539             'url':      video_url,
3540             'uploader': uploader,
3541             'upload_date':  upload_date,
3542             'title':    video_title,
3543             'ext':      video_extension,
3544         }]
3545
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the video path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first group of rexp in the page (HTML-unescaped),
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date', so the upload date
            # never reached the output template (the documented optional
            # field is 'upload_date').
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3581
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one page of the Justin.tv API.

        Returns a tuple (count of items in the response, list of *valid*
        info dicts).  On error it reports and returns (0, []).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: previously returned None, which made the caller's
            # tuple unpacking raise a TypeError.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # BUGFIX: same as above — keep the (count, list) contract.
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like 'YYYY-MM-DD...'; strip the dashes
                # to obtain the YYYYMMDD format used for upload_date.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the last one.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3668
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUGFIX: raise instead of reporting and then crashing on
            # m.group() below.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # BUGFIX: the deprecated trouble() call did not stop execution
            # either; fail loudly and consistently.
            raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        # Description is optional; fall back to None when absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3705
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains whitespace and comments, so re.VERBOSE is
        # required; the base-class suitable() would not pass that flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always fetch the game's video listing page, whatever URL form
        # was given.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators run in page order, so zip pairs each movie
        # with its title and thumbnail.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # BUGFIX: skip broken entries instead of appending an info
                # dict with an empty URL after reporting the error.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return videos
3746
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The CDN URL is derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            # BUGFIX: fail with a clear message instead of an opaque
            # AttributeError on m.group() when the page layout changes.
            raise ExtractorError(u'Unable to extract video title')
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
3768
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Pick the container format from the URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # BUGFIX: use report_error instead of the deprecated trouble()
            # (which also double-prefixed the message with 'ERROR: ').
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # BUGFIX: fixed the 'World Start' typo in the fallback title.
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3824
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        # The show slug in the URL doubles as the video id.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # All metadata is embedded as a JSON blob in an inline script tag.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3859
3860
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format,
        or None when no such entry exists."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site requires an age-verification cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional — warn and continue when missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional as well)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes resolution and bitrate,
            # e.g. '480p_370k_8004515' -> ['480p', '370k'].
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: the check previously tested the stale variable
            # 'result' instead of 'format', so a nonexistent requested
            # format silently returned [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3977
3978
3979
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both id and title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: this error message wrongly said 'video title'.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4021
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = url_match.group('videoid')

        # The public watch page only carries the title.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream URL lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The player receives the file URL via a flashvars assignment.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4067
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per song)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API wants a (random) session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        # Walk the mix song by song until the API flags the last track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4111
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Both the video file and its thumbnail are addressable on the CDN
        # by id alone.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))

        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4135
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Written for re.VERBOSE: matches either a playlist URL (with a numeric
    # playlist id) or a single-talk URL, followed by the talk/playlist name.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains comments and whitespace, so it must be
        # matched with re.VERBOSE (the generic suitable() would not).
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk yields one info dict, a
        # playlist yields one per contained talk.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk entry on the playlist page carries its id and media
        # slug as data- attributes on an <li> element.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # The matching talk link/title appears in a separate <p> element;
        # both finditer streams run in page order, so zip pairs them up.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Delegate each talk to the single-talk extractor.
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JS object holding the numeric id and the
        # media slug needed to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4208
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (via their metadata XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: fall back to the file extension; 'ext' was an
            # undefined name here and raised a NameError.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata XML
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4264
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Per-video XML document describing the available renditions.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last entry in the document is the rendition we download.
        best = idoc[-1]
        filename = best.findall('./filename')[0].text
        duration = float(best.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4297
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: use report_error like the other extractors instead of
            # the deprecated trouble() call with a hand-rolled ERROR prefix.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Fix: the original reported the problem but fell through and
            # then crashed on m.group('title') with m being None.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4346
4347
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in matching-priority order; instantiate at the end.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]