2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Store the FileDownloader through the public setter so that
        # self._downloader is always defined, even when downloader is None.
        self.set_downloader(downloader)
74 def suitable(self, url):
75 """Receives a URL and returns True if suitable for this IE."""
76 return re.match(self._VALID_URL, url) is not None
        """Getter method for _WORKING."""
        # NOTE(review): the enclosing `def` line for this docstring (the
        # _WORKING getter) appears to be elided from this chunk — confirm
        # against the full file.

        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the enclosing `def initialize(self):` line (and any
        # "already initialized" guard) appears elided here as well.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a preceding `self.initialize()` call appears elided
        # before this delegation — confirm against the full file.
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader is the FileDownloader instance that consumes the
        # info dictionaries produced by _real_extract().
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): the `@property` / `def IE_NAME(self):` header for the
        # line below appears elided; it derives the IE name by stripping the
        # trailing "IE" from the class name (e.g. YoutubeIE -> "Youtube").
        return type(self).__name__[:-2]
    def _download_webpage(self, url, video_id, note=None, errnote=None):
        """Return the webpage at *url* decoded as UTF-8 (errors replaced).

        note/errnote are the progress/failure messages shown to the user;
        a default is substituted when they are None.
        Raises ExtractorError when the download fails.
        """
        # NOTE(review): the `if note is None:` guard and the `try:` opener
        # around urlopen appear elided from this chunk — confirm against the
        # full file.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r'''...'''` opener for the verbose
    # pattern below appears elided from this chunk — the lines that follow
    # are the body of that re.VERBOSE pattern.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    # URL used to force the English interface before scraping.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; NOTE(review): most entries of the two
    # dicts below appear elided from this chunk.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {
184 def suitable(self, url):
185 """Receives a URL and returns True if suitable for this IE."""
186 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
188 def report_lang(self):
189 """Report attempt to set language."""
190 self._downloader.to_screen(u'[youtube] Setting language')
192 def report_login(self):
193 """Report attempt to log in."""
194 self._downloader.to_screen(u'[youtube] Logging in')
196 def report_age_confirmation(self):
197 """Report attempt to confirm age."""
198 self._downloader.to_screen(u'[youtube] Confirming age')
200 def report_video_webpage_download(self, video_id):
201 """Report attempt to download video webpage."""
202 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
204 def report_video_info_webpage_download(self, video_id):
205 """Report attempt to download video info webpage."""
206 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        # (Docstring fixed: it was copy-pasted from the info-webpage method;
        # this helper announces the subtitles download, as the message shows.)
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
212 def report_information_extraction(self, video_id):
213 """Report attempt to extract video information."""
214 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (Docstring fixed: it previously said "Report extracted video URL",
        # a copy-paste error; the message below reports an unavailable format.)
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
220 def report_rtmp_download(self):
221 """Indicate the download will use the RTMP protocol."""
222 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text.

        NOTE(review): this chunk appears to be missing the `srt = ''`
        initializer, the `start = float(start)` conversion (without which
        `start + float(dur)` below would concatenate str+float and fail),
        and the final `return srt` — confirm against the full file.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Captions without an explicit duration default to 4 seconds.
            if not dur: dur = '4'
            end = start + float(dur)
            # Render seconds as HH:MM:SS,mmm SubRip timestamps.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Fetch closed captions for *video_id* and return (error, srt).

        Returns a (warning_message, None) pair on failure, or
        (None, srt_contents) on success.
        NOTE(review): the `try:` openers around the two urlopen calls appear
        elided from this chunk — confirm against the full file.
        """
        self.report_video_subtitles_download(video_id)
        # First fetch the list of available caption tracks.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Pick the user-requested language, then English, then any.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header appears elided
        # from this chunk — confirm against the full file.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set site language, optionally log in, and confirm the age gate.

        Credentials come from --username/--password or, with --netrc, from
        the user's .netrc entry for the 'youtube' machine.
        NOTE(review): several connective lines (`return` after the None
        check, `try:` openers, the `login_form = {` / `age_form = {` dict
        openers) appear elided from this chunk — confirm against the full
        file before editing logic here.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
        # Force the English interface so later regexes match.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
        # No authentication to be performed
                'current_form': 'loginForm',
                'action_login': 'Log In',
                'username': username,
                'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
                'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Return the 11-character video id parsed out of *url*.

        NOTE(review): the `if mobj is None:` guard and the trailing
        `return video_id` appear elided from this chunk.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Group 2 holds the id; group 1 is the optional preceding path part.
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract metadata and direct media URLs for a YouTube watch page.

        Returns a list of info dictionaries, one per selected format.
        NOTE(review): many connective lines (`try:` openers, `if mobj is
        not None:` guards, `return` statements, the `results = []` /
        `results.append({` bookkeeping) appear elided from this chunk —
        confirm every indented region against the full file before editing.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Fetch the watch page (with has_verified to bypass some gates).
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the backslash-escaped URL found in the page JS.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the page and normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed media URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Endpoints used by _real_initialize() to disable the family filter.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)
553 def report_disclaimer(self):
554 """Report disclaimer retrieval."""
555 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
557 def report_age_confirmation(self):
558 """Report attempt to confirm age."""
559 self._downloader.to_screen(u'[metacafe] Confirming age')
561 def report_download_webpage(self, video_id):
562 """Report webpage download."""
563 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
565 def report_extraction(self, video_id):
566 """Report information extraction."""
567 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page and POST past the family filter.

        NOTE(review): the `try:` openers and the `disclaimer_form = {` dict
        opener appear elided from this chunk — confirm against the full file.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract the direct media URL and metadata from a Metacafe page.

        yt- prefixed ids are delegated to the YouTube extractor.
        NOTE(review): `if mobj is None:` guards, `try:` openers, `return`
        statements and the `return [{` opener appear elided from this
        chunk — confirm each indented region against the full file.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: parse the flashvars blob for the media URL and key.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Accepts any 2-3 letter country TLD (dailymotion.com, .fr, ...).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)
678 def report_download_webpage(self, video_id):
679 """Report webpage download."""
680 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
682 def report_extraction(self, video_id):
683 """Report information extraction."""
684 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the best-quality media URL and metadata from Dailymotion.

        NOTE(review): `if mobj is None:` guards, the `if key in flashvars:` /
        `max_quality = key` / `break` lines inside the quality loop, and the
        `return [{` opener appear elided from this chunk — confirm against
        the full file.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Strip title suffix and query string from the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos resolve.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only FLV media referenced via the ?current= query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)
776 def report_download_webpage(self, video_id):
777 """Report webpage download."""
778 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
780 def report_extraction(self, video_id):
781 """Report information extraction."""
782 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the FLV media URL and metadata from a Photobucket page.

        NOTE(review): `if mobj is None:` guards, `try:` openers, the
        `video_url = mediaURL` assignment and the `return [{` opener appear
        elided from this chunk — confirm against the full file.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)
845 def report_download_webpage(self, video_id):
846 """Report webpage download."""
847 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
849 def report_extraction(self, video_id):
850 """Report information extraction."""
851 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract the media URL and metadata from a Yahoo! Video page.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted with new_video=False.
        NOTE(review): `if mobj is None:` guards, `try:` openers, `return`
        statements and the `return [{` opener appear elided from this
        chunk — confirm against the full file.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the (people|profile) alternation, not the
        # uploader name — this looks like it should be group(2); confirm
        # against the full file before changing.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id':       video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
975 class VimeoIE(InfoExtractor):
976 """Information extractor for vimeo.com."""
# NOTE(review): this chunk is a sampled view of the file — guard lines such as
# `try:` / `if mobj is None:` / `return` are elided between the numbered lines,
# so the code below is annotated in place rather than restructured.
978 # _VALID_URL matches Vimeo URLs
979 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
982 def __init__(self, downloader=None):
983 InfoExtractor.__init__(self, downloader)
985 def report_download_webpage(self, video_id):
986 """Report webpage download."""
987 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
989 def report_extraction(self, video_id):
990 """Report information extraction."""
991 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
993 def _real_extract(self, url, new_video=True):
# The numeric video id is the first capture group of _VALID_URL.
994 # Extract ID from URL
995 mobj = re.match(self._VALID_URL, url)
997 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1000 video_id = mobj.group(1)
1002 # Retrieve video webpage to extract further information
1003 request = compat_urllib_request.Request(url, None, std_headers)
1005 self.report_download_webpage(video_id)
1006 webpage_bytes = compat_urllib_request.urlopen(request).read()
1007 webpage = webpage_bytes.decode('utf-8')
1008 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1009 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1012 # Now we begin extracting as much information as we can from what we
1013 # retrieved. First we extract the information common to all extractors,
1014 # and latter we extract those that are Vimeo specific.
1015 self.report_extraction(video_id)
1017 # Extract the config JSON
# The player config is embedded inline as ` = {config:...,assets:...}`;
# slice the JSON object out between those two markers, then parse it.
1019 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1020 config = json.loads(config)
1022 self._downloader.trouble(u'ERROR: unable to extract info section')
1026 video_title = config["video"]["title"]
1028 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1029 video_uploader = config["video"]["owner"]["name"]
1030 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1032 # Extract video thumbnail
1033 video_thumbnail = config["video"]["thumbnail"]
1035 # Extract video description
1036 video_description = get_element_by_attribute("itemprop", "description", webpage)
1037 if video_description: video_description = clean_html(video_description)
1038 else: video_description = ''
1040 # Extract upload date
# Fold the ISO date from <meta itemprop="dateCreated"> into YYYYMMDD.
1041 video_upload_date = None
1042 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1043 if mobj is not None:
1044 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1046 # Vimeo specific: extract request signature and timestamp
1047 sig = config['request']['signature']
1048 timestamp = config['request']['timestamp']
1050 # Vimeo specific: extract video codec and quality information
1051 # First consider quality, then codecs, then take everything
1052 # TODO bind to format param
1053 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1054 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket every available (codec, extension, quality) triple by quality; the
# codecs list above is in preference order, so the first entry of the best
# non-empty bucket wins in the loop below.
1055 for codec_name, codec_extension in codecs:
1056 if codec_name in config["video"]["files"]:
1057 if 'hd' in config["video"]["files"][codec_name]:
1058 files['hd'].append((codec_name, codec_extension, 'hd'))
1059 elif 'sd' in config["video"]["files"][codec_name]:
1060 files['sd'].append((codec_name, codec_extension, 'sd'))
1062 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1064 for quality in ('hd', 'sd', 'other'):
1065 if len(files[quality]) > 0:
1066 video_quality = files[quality][0][2]
1067 video_codec = files[quality][0][0]
1068 video_extension = files[quality][0][1]
1069 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1072 self._downloader.trouble(u'ERROR: no known codec found')
# Build the play_redirect URL from the signature/timestamp extracted above.
1075 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1076 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1081 'uploader': video_uploader,
1082 'uploader_id': video_uploader_id,
1083 'upload_date': video_upload_date,
1084 'title': video_title,
1085 'ext': video_extension,
1086 'thumbnail': video_thumbnail,
1087 'description': video_description,
1091 class ArteTvIE(InfoExtractor):
1092 """arte.tv information extractor."""
# NOTE(review): sampled view — error-handling/return lines are elided between
# the numbered lines; comments added in place only.
1094 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live-stream pages are identified by an index-<n>.html suffix on the id.
1095 _LIVE_URL = r'index-[0-9]+\.html$'
1097 IE_NAME = u'arte.tv'
1099 def __init__(self, downloader=None):
1100 InfoExtractor.__init__(self, downloader)
1102 def report_download_webpage(self, video_id):
1103 """Report webpage download."""
1104 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1106 def report_extraction(self, video_id):
1107 """Report information extraction."""
1108 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1110 def fetch_webpage(self, url):
# Download `url` and return its raw body; network/URL errors are routed
# through self._downloader.trouble().
1111 request = compat_urllib_request.Request(url)
1113 self.report_download_webpage(url)
1114 webpage = compat_urllib_request.urlopen(request).read()
1115 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1116 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1118 except ValueError as err:
1119 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1123 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex`, and collect the requested capture groups into
# an info dict. Each matchTuple is (group_index, dict_key, error_message);
# a missing group reports its error_message via trouble().
1124 page = self.fetch_webpage(url)
1125 mobj = re.search(regex, page, regexFlags)
1129 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1132 for (i, key, err) in matchTuples:
1133 if mobj.group(i) is None:
1134 self._downloader.trouble(err)
1137 info[key] = mobj.group(i)
1141 def extractLiveStream(self, url):
# Language code is the 4th-from-last path component of the live URL.
1142 video_lang = url.split('/')[-4]
1143 info = self.grep_webpage(
1145 r'src="(.*?/videothek_js.*?\.js)',
1148 (1, 'url', u'ERROR: Invalid URL: %s' % url)
# Follow the videothek JS file to locate the geo-gated stream path, the
# SWF player and the stream URL.
1151 http_host = url.split('/')[2]
1152 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1153 info = self.grep_webpage(
1155 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1156 '(http://.*?\.swf).*?' +
1160 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1161 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1162 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1165 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1167 def extractPlus7Stream(self, url):
# "Plus 7" (catch-up) pages: chase videorefFileUrl -> per-language <video>
# ref -> final XML with id/name/date and the HD url.
1168 video_lang = url.split('/')[-3]
1169 info = self.grep_webpage(
1171 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1174 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1177 next_url = compat_urllib_parse.unquote(info.get('url'))
1178 info = self.grep_webpage(
1180 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1183 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1186 next_url = compat_urllib_parse.unquote(info.get('url'))
1188 info = self.grep_webpage(
1190 r'<video id="(.*?)".*?>.*?' +
1191 '<name>(.*?)</name>.*?' +
1192 '<dateVideo>(.*?)</dateVideo>.*?' +
1193 '<url quality="hd">(.*?)</url>',
1196 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1197 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1198 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1199 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1204 'id': info.get('id'),
1205 'url': compat_urllib_parse.unquote(info.get('url')),
1206 'uploader': u'arte.tv',
1207 'upload_date': info.get('date'),
1208 'title': info.get('title').decode('utf-8'),
1214 def _real_extract(self, url):
1215 video_id = url.split('/')[-1]
1216 self.report_extraction(video_id)
# Dispatch on URL shape: live stream vs. regular "Plus 7" page.
1218 if re.search(self._LIVE_URL, video_id) is not None:
1219 self.extractLiveStream(url)
1222 info = self.extractPlus7Stream(url)
1227 class GenericIE(InfoExtractor):
1228 """Generic last-resort information extractor."""
# NOTE(review): sampled view — `try:` / `if mobj is None:` / `return` lines
# are elided between the numbered lines; comments added in place only.
1231 IE_NAME = u'generic'
1233 def __init__(self, downloader=None):
1234 InfoExtractor.__init__(self, downloader)
1236 def report_download_webpage(self, video_id):
1237 """Report webpage download."""
1238 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1239 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1241 def report_extraction(self, video_id):
1242 """Report information extraction."""
1243 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1245 def report_following_redirect(self, new_url):
1246 """Report information extraction."""
1247 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1249 def _test_redirect(self, url):
1250 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Uses a HEAD request so shortener targets are resolved without downloading
# the body; the two handler subclasses below keep the request a HEAD across
# redirects and fall back to GET on 405.
1251 class HeadRequest(compat_urllib_request.Request):
1252 def get_method(self):
1255 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1257 Subclass the HTTPRedirectHandler to make it use our
1258 HeadRequest also on the redirected URL
1260 def redirect_request(self, req, fp, code, msg, headers, newurl):
1261 if code in (301, 302, 303, 307):
1262 newurl = newurl.replace(' ', '%20')
# Drop body-specific headers before re-issuing as HEAD.
1263 newheaders = dict((k,v) for k,v in req.headers.items()
1264 if k.lower() not in ("content-length", "content-type"))
1265 return HeadRequest(newurl,
1267 origin_req_host=req.get_origin_req_host(),
1270 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1272 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1274 Fallback to GET if HEAD is not allowed (405 HTTP error)
1276 def http_error_405(self, req, fp, code, msg, headers):
1280 newheaders = dict((k,v) for k,v in req.headers.items()
1281 if k.lower() not in ("content-length", "content-type"))
1282 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1284 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers we need (no cookies etc.).
1288 opener = compat_urllib_request.OpenerDirector()
1289 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1290 HTTPMethodFallback, HEADRedirectHandler,
1291 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1292 opener.add_handler(handler())
1294 response = opener.open(HeadRequest(url))
1295 new_url = response.geturl()
# If the final URL differs, restart the whole extraction chain on it.
1300 self.report_following_redirect(new_url)
1301 self._downloader.download([new_url])
1304 def _real_extract(self, url):
1305 if self._test_redirect(url): return
1307 video_id = url.split('/')[-1]
1308 request = compat_urllib_request.Request(url)
1310 self.report_download_webpage(video_id)
1311 webpage = compat_urllib_request.urlopen(request).read()
1312 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1313 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1315 except ValueError as err:
1316 # since this is the last-resort InfoExtractor, if
1317 # this error is thrown, it'll be thrown here
1318 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1321 self.report_extraction(video_id)
1322 # Start with something easy: JW Player in SWFObject
1323 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1325 # Broaden the search a little bit
1326 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1328 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1331 # It's possible that one of the regexes
1332 # matched, but returned an empty group:
1333 if mobj.group(1) is None:
1334 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1337 video_url = compat_urllib_parse.unquote(mobj.group(1))
1338 video_id = os.path.basename(video_url)
1340 # here's a fun little line of code for you:
# The id is the media filename without its extension; the extension is kept
# separately for the output template.
1341 video_extension = os.path.splitext(video_id)[1][1:]
1342 video_id = os.path.splitext(video_id)[0]
1344 # it's tempting to parse this further, but you would
1345 # have to take into account all the variations like
1346 # Video Title - Site Name
1347 # Site Name | Video Title
1348 # Video Title - Tagline | Site Name
1349 # and so on and so forth; it's just not practical
1350 mobj = re.search(r'<title>(.*)</title>', webpage)
1352 self._downloader.trouble(u'ERROR: unable to extract title')
1354 video_title = mobj.group(1)
1356 # video uploader is domain name
1357 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1359 self._downloader.trouble(u'ERROR: unable to extract title')
1361 video_uploader = mobj.group(1)
1366 'uploader': video_uploader,
1367 'upload_date': None,
1368 'title': video_title,
1369 'ext': video_extension,
1373 class YoutubeSearchIE(InfoExtractor):
1374 """Information Extractor for YouTube search queries."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
# Queries look like "ytsearch:foo" (1 result), "ytsearchN:foo" (N results)
# or "ytsearchall:foo" (up to _max_youtube_results).
1375 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1376 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1377 _max_youtube_results = 1000
1378 IE_NAME = u'youtube:search'
1380 def __init__(self, downloader=None):
1381 InfoExtractor.__init__(self, downloader)
1383 def report_download_page(self, query, pagenum):
1384 """Report attempt to download search page with given number."""
1385 query = query.decode(preferredencoding())
1386 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1388 def _real_extract(self, query):
1389 mobj = re.match(self._VALID_URL, query)
1391 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the actual query text.
1394 prefix, query = query.split(':')
1396 query = query.encode('utf-8')
1398 self._download_n_results(query, 1)
1400 elif prefix == 'all':
1401 self._download_n_results(query, self._max_youtube_results)
1407 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1409 elif n > self._max_youtube_results:
1410 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1411 n = self._max_youtube_results
1412 self._download_n_results(query, n)
1414 except ValueError: # parsing prefix as integer fails
1415 self._download_n_results(query, 1)
1418 def _download_n_results(self, query, n):
1419 """Downloads a specified number of results for a query"""
# Page through the GData API (50 ids per page) until `limit` is reached,
# then hand each watch URL back to the downloader.
1425 while (50 * pagenum) < limit:
1426 self.report_download_page(query, pagenum+1)
1427 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1428 request = compat_urllib_request.Request(result_url)
1430 data = compat_urllib_request.urlopen(request).read()
1431 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1432 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1434 api_response = json.loads(data)['data']
1436 new_ids = list(video['id'] for video in api_response['items'])
1437 video_ids += new_ids
# Never request more than the API reports as available.
1439 limit = min(n, api_response['totalItems'])
1442 if len(video_ids) > n:
1443 video_ids = video_ids[:n]
1444 for id in video_ids:
1445 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1449 class GoogleSearchIE(InfoExtractor):
1450 """Information Extractor for Google Video search queries."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
# Queries: "gvsearch:foo", "gvsearchN:foo", "gvsearchall:foo".
1451 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1452 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1453 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1454 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1455 _max_google_results = 1000
1456 IE_NAME = u'video.google:search'
1458 def __init__(self, downloader=None):
1459 InfoExtractor.__init__(self, downloader)
1461 def report_download_page(self, query, pagenum):
1462 """Report attempt to download playlist page with given number."""
1463 query = query.decode(preferredencoding())
1464 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1466 def _real_extract(self, query):
1467 mobj = re.match(self._VALID_URL, query)
1469 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split the "gvsearchN" prefix off the query text, as in YoutubeSearchIE.
1472 prefix, query = query.split(':')
1474 query = query.encode('utf-8')
1476 self._download_n_results(query, 1)
1478 elif prefix == 'all':
1479 self._download_n_results(query, self._max_google_results)
1485 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1487 elif n > self._max_google_results:
1488 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1489 n = self._max_google_results
1490 self._download_n_results(query, n)
1492 except ValueError: # parsing prefix as integer fails
1493 self._download_n_results(query, 1)
1496 def _download_n_results(self, query, n):
1497 """Downloads a specified number of results for a query"""
# Scrape result pages (10 per page via start=pagenum*10) until either n ids
# are collected or the "next page" marker disappears.
1503 self.report_download_page(query, pagenum)
1504 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1505 request = compat_urllib_request.Request(result_url)
1507 page = compat_urllib_request.urlopen(request).read()
1508 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1509 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1512 # Extract video identifiers
1513 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1514 video_id = mobj.group(1)
1515 if video_id not in video_ids:
1516 video_ids.append(video_id)
1517 if len(video_ids) == n:
1518 # Specified n videos reached
1519 for id in video_ids:
1520 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1523 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1524 for id in video_ids:
1525 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1528 pagenum = pagenum + 1
1531 class YahooSearchIE(InfoExtractor):
1532 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
# Queries: "yvsearch:foo", "yvsearchN:foo", "yvsearchall:foo".
1535 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1536 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1537 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1538 _MORE_PAGES_INDICATOR = r'\s*Next'
1539 _max_yahoo_results = 1000
1540 IE_NAME = u'video.yahoo:search'
1542 def __init__(self, downloader=None):
1543 InfoExtractor.__init__(self, downloader)
1545 def report_download_page(self, query, pagenum):
1546 """Report attempt to download playlist page with given number."""
1547 query = query.decode(preferredencoding())
1548 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1550 def _real_extract(self, query):
1551 mobj = re.match(self._VALID_URL, query)
1553 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Same prefix-dispatch scheme as the other search IEs.
1556 prefix, query = query.split(':')
1558 query = query.encode('utf-8')
1560 self._download_n_results(query, 1)
1562 elif prefix == 'all':
1563 self._download_n_results(query, self._max_yahoo_results)
1569 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1571 elif n > self._max_yahoo_results:
1572 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1573 n = self._max_yahoo_results
1574 self._download_n_results(query, n)
1576 except ValueError: # parsing prefix as integer fails
1577 self._download_n_results(query, 1)
1580 def _download_n_results(self, query, n):
1581 """Downloads a specified number of results for a query"""
# `already_seen` deduplicates ids across result pages (here the list is
# order-preserving and the set is the O(1) membership check).
1584 already_seen = set()
1588 self.report_download_page(query, pagenum)
1589 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1590 request = compat_urllib_request.Request(result_url)
1592 page = compat_urllib_request.urlopen(request).read()
1593 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1594 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1597 # Extract video identifiers
1598 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1599 video_id = mobj.group(1)
1600 if video_id not in already_seen:
1601 video_ids.append(video_id)
1602 already_seen.add(video_id)
1603 if len(video_ids) == n:
1604 # Specified n videos reached
1605 for id in video_ids:
1606 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1609 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1610 for id in video_ids:
1611 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1614 pagenum = pagenum + 1
1617 class YoutubePlaylistIE(InfoExtractor):
1618 """Information Extractor for YouTube playlists."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
# Group 1: playlist-type letter (p/a/list), group 2: playlist id,
# group 3: an optional trailing video id.
1620 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1621 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1622 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1623 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1624 IE_NAME = u'youtube:playlist'
1626 def __init__(self, downloader=None):
1627 InfoExtractor.__init__(self, downloader)
1629 def report_download_page(self, playlist_id, pagenum):
1630 """Report attempt to download playlist page with given number."""
1631 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1633 def _real_extract(self, url):
1634 # Extract playlist id
1635 mobj = re.match(self._VALID_URL, url)
1637 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# If a single video id trails the playlist URL, download just that video.
1641 if mobj.group(3) is not None:
1642 self._downloader.download([mobj.group(3)])
1645 # Download playlist pages
1646 # prefix is 'p' as default for playlists but there are other types that need extra care
1647 playlist_prefix = mobj.group(1)
1648 if playlist_prefix == 'a':
1649 playlist_access = 'artist'
1651 playlist_prefix = 'p'
1652 playlist_access = 'view_play_list'
1653 playlist_id = mobj.group(2)
# Page through the playlist until the "Next »" marker disappears.
1658 self.report_download_page(playlist_id, pagenum)
1659 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1660 request = compat_urllib_request.Request(url)
1662 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1663 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1664 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1667 # Extract video identifiers
1669 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1670 if mobj.group(1) not in ids_in_page:
1671 ids_in_page.append(mobj.group(1))
1672 video_ids.extend(ids_in_page)
1674 if self._MORE_PAGES_INDICATOR not in page:
1676 pagenum = pagenum + 1
# Apply --playlist-start / --playlist-end slicing before downloading.
1678 total = len(video_ids)
1680 playliststart = self._downloader.params.get('playliststart', 1) - 1
1681 playlistend = self._downloader.params.get('playlistend', -1)
1682 if playlistend == -1:
1683 video_ids = video_ids[playliststart:]
1685 video_ids = video_ids[playliststart:playlistend]
1687 if len(video_ids) == total:
1688 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1690 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1692 for id in video_ids:
1693 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1697 class YoutubeChannelIE(InfoExtractor):
1698 """Information Extractor for YouTube channels."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
1700 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1701 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1702 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1703 IE_NAME = u'youtube:channel'
1705 def report_download_page(self, channel_id, pagenum):
1706 """Report attempt to download channel page with given number."""
1707 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1709 def _real_extract(self, url):
1710 # Extract channel id
1711 mobj = re.match(self._VALID_URL, url)
1713 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1716 # Download channel pages
1717 channel_id = mobj.group(1)
# Page through the channel's video list until the "Next »" marker is gone.
1722 self.report_download_page(channel_id, pagenum)
1723 url = self._TEMPLATE_URL % (channel_id, pagenum)
1724 request = compat_urllib_request.Request(url)
1726 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1727 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1728 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1731 # Extract video identifiers
1733 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1734 if mobj.group(1) not in ids_in_page:
1735 ids_in_page.append(mobj.group(1))
1736 video_ids.extend(ids_in_page)
1738 if self._MORE_PAGES_INDICATOR not in page:
1740 pagenum = pagenum + 1
1742 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1744 for id in video_ids:
1745 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1749 class YoutubeUserIE(InfoExtractor):
1750 """Information Extractor for YouTube users."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
1752 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1753 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1754 _GDATA_PAGE_SIZE = 50
1755 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1756 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1757 IE_NAME = u'youtube:user'
1759 def __init__(self, downloader=None):
1760 InfoExtractor.__init__(self, downloader)
1762 def report_download_page(self, username, start_index):
1763 """Report attempt to download user page."""
1764 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1765 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1767 def _real_extract(self, url):
1769 mobj = re.match(self._VALID_URL, url)
1771 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1774 username = mobj.group(1)
1776 # Download video ids using YouTube Data API. Result size per
1777 # query is limited (currently to 50 videos) so we need to query
1778 # page by page until there are no video ids - it means we got
# GData uses 1-based start indices, hence the +1 below.
1785 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1786 self.report_download_page(username, start_index)
1788 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1791 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1792 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1793 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1796 # Extract video identifiers
1799 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1800 if mobj.group(1) not in ids_in_page:
1801 ids_in_page.append(mobj.group(1))
1803 video_ids.extend(ids_in_page)
1805 # A little optimization - if current page is not
1806 # "full", ie. does not contain PAGE_SIZE video ids then
1807 # we can assume that this page is the last one - there
1808 # are no more ids on further pages - no need to query
1811 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing before downloading.
1816 all_ids_count = len(video_ids)
1817 playliststart = self._downloader.params.get('playliststart', 1) - 1
1818 playlistend = self._downloader.params.get('playlistend', -1)
1820 if playlistend == -1:
1821 video_ids = video_ids[playliststart:]
1823 video_ids = video_ids[playliststart:playlistend]
1825 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1826 (username, all_ids_count, len(video_ids)))
1828 for video_id in video_ids:
1829 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1832 class BlipTVUserIE(InfoExtractor):
1833 """Information Extractor for blip.tv users."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only. `self._PAGE_SIZE` is read
# below but not defined in this view — presumably a class attribute elided
# here; verify in the full file.
1835 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1837 IE_NAME = u'blip.tv:user'
1839 def __init__(self, downloader=None):
1840 InfoExtractor.__init__(self, downloader)
1842 def report_download_page(self, username, pagenum):
1843 """Report attempt to download user page."""
1844 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1845 (self.IE_NAME, username, pagenum))
1847 def _real_extract(self, url):
1849 mobj = re.match(self._VALID_URL, url)
1851 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1854 username = mobj.group(1)
# The numeric users_id needed by the AJAX endpoint is scraped from the
# user's page (data-users-id attribute) before paging through episodes.
1856 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1858 request = compat_urllib_request.Request(url)
1861 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1862 mobj = re.search(r'data-users-id="([^"]+)"', page)
1863 page_base = page_base % mobj.group(1)
1864 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1865 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1869 # Download video ids using BlipTV Ajax calls. Result size per
1870 # query is limited (currently to 12 videos) so we need to query
1871 # page by page until there are no video ids - it means we got
1878 self.report_download_page(username, pagenum)
1880 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1883 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1884 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1885 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1888 # Extract video identifiers
1891 for mobj in re.finditer(r'href="/([^"]+)"', page):
1892 if mobj.group(1) not in ids_in_page:
1893 ids_in_page.append(unescapeHTML(mobj.group(1)))
1895 video_ids.extend(ids_in_page)
1897 # A little optimization - if current page is not
1898 # "full", ie. does not contain PAGE_SIZE video ids then
1899 # we can assume that this page is the last one - there
1900 # are no more ids on further pages - no need to query
1903 if len(ids_in_page) < self._PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing before downloading.
1908 all_ids_count = len(video_ids)
1909 playliststart = self._downloader.params.get('playliststart', 1) - 1
1910 playlistend = self._downloader.params.get('playlistend', -1)
1912 if playlistend == -1:
1913 video_ids = video_ids[playliststart:]
1915 video_ids = video_ids[playliststart:playlistend]
1917 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1918 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1920 for video_id in video_ids:
1921 self._downloader.download([u'http://blip.tv/'+video_id])
1924 class DepositFilesIE(InfoExtractor):
1925 """Information extractor for depositfiles.com"""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
1927 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1929 def report_download_webpage(self, file_id):
1930 """Report webpage download."""
1931 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1933 def report_extraction(self, file_id):
1934 """Report information extraction."""
1935 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1937 def _real_extract(self, url):
1938 file_id = url.split('/')[-1]
1939 # Rebuild url in english locale
1940 url = 'http://depositfiles.com/en/files/' + file_id
1942 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1943 free_download_indication = { 'gateway_result' : '1' }
1944 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1946 self.report_download_webpage(file_id)
1947 webpage = compat_urllib_request.urlopen(request).read()
1948 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1949 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1952 # Search for the real file URL
1953 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1954 if (mobj is None) or (mobj.group(1) is None):
1955 # Try to figure out reason of the error.
# If the download form is absent, surface the site's own restriction
# message (e.g. rate limiting) rather than a generic failure.
1956 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1957 if (mobj is not None) and (mobj.group(1) is not None):
1958 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1959 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1961 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1964 file_url = mobj.group(1)
1965 file_extension = os.path.splitext(file_url)[1][1:]
1967 # Search for file title
1968 mobj = re.search(r'<b title="(.*?)">', webpage)
1970 self._downloader.trouble(u'ERROR: unable to extract title')
1972 file_title = mobj.group(1).decode('utf-8')
1975 'id': file_id.decode('utf-8'),
1976 'url': file_url.decode('utf-8'),
1978 'upload_date': None,
1979 'title': file_title,
1980 'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Numeric video id is captured from the "v" query parameter.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format names ordered best -> worst; used for quality selection below.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes for metadata embedded in the page's JavaScript calls.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Collect one URL per available format name.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook with credentials from options or .netrc."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are a warning, not fatal: login is optional.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = compat_urllib_request.urlopen(request)
        video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        # upload date: RFC 2822 date string -> YYYYMMDD
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            # Restrict to formats at or below the requested quality ceiling.
            format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # Extension usually mp4 for all Facebook formats.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # NOTE(review): these .decode('utf-8') calls assume byte
            # strings; under Python 3 they would fail — confirm target.
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask blip.tv for JSON metadata about the page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # The URL serves the media itself; derive title/ext from the name.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # blip.tv datestamps look like "mm-dd-yy HH:MMPM"; normalize
            # to the YYYYMMDD form used throughout this file.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # NOTE(review): mutates the global std_headers dict so the
        # actual media download is fetched with an iTunes UA — confirm
        # this is intentional for all subsequent downloads.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list of info dicts for the video at url."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: was `self._download.trouble` — the attribute is
            # `_downloader` (as used everywhere else in this class), so
            # every invalid URL raised AttributeError instead of the
            # intended error report.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; the flv lives
        # next to it under the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     """
    IE_NAME = u'comedycentral'

    # Bitrates, worst to best; used for format selection.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        # List every known format with its extension and dimensions.
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Expand ":tds"-style shorthand into the show's full-episodes URL
        # and re-match so the named groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # The site redirects show front pages to a concrete episode;
        # re-match the final URL to get the episode title.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # Resolve redirects to get the canonical player URL.
        urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))

        # Fetch the MRSS index listing every segment of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per segment of the episode.
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-segment config XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                # (bitrate, stream URL) pairs.
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle

                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': None #playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        # Progress message: starting extraction for this show.
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        # Progress message: fetching the player configuration.
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode with the charset from the Content-Type header, utf-8 otherwise.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Scrape metadata out of the page's <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Second playlist entry holds the actual media URL.
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        'upload_date': None,

        self.report_extraction(video_id)
        # First fetch the metadata XML for this video id.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # Then fetch the f4m manifest referenced by the metadata.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Rebuild the direct first-segment URL from the manifest pieces.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the page <title>, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # Whole match is the thumbnail URL itself.
        video_thumbnail = mobj.group(0)

        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's API resource.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for the available streams for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        # 128 kbps MP3 HTTP stream.
        mediaURL = streams['http_mp3_128_url']

            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded media id.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probe the URL; the first one that opens wins.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """List every available format/bitrate pair on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list of info dicts for the cloudcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader & filename from url
        # (bug fix: regex groups are already text — str.decode('utf-8')
        # raised AttributeError under Python 3, which this file targets
        # via its compat_* imports and `except ... as err` syntax)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            # Bug fix: bail out instead of continuing with no data.
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each format until one yields a reachable URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
# NOTE(review): sampled dump — interior line numbers skip (missing try:/
# return/dict-opener lines); comments only, code untouched.
# Handles three URL shapes: a specific video (course+video), a course
# page (course only), and the site root (enumerates all courses).
2991 class StanfordOpenClassroomIE(InfoExtractor):
2992 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): the dots in "openclassroom.stanford.edu" are unescaped
# regex metacharacters — matches any char; harmless but imprecise.
2994 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2995 IE_NAME = u'stanfordoc'
2997 def report_download_webpage(self, objid):
2998 """Report information extraction."""
2999 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3001 def report_extraction(self, video_id):
3002 """Report information extraction."""
3003 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3005 def _real_extract(self, url):
3006 mobj = re.match(self._VALID_URL, url)
3008 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Branch 1: single video — fetch its companion XML for title and file.
3011 if mobj.group('course') and mobj.group('video'): # A specific video
3012 course = mobj.group('course')
3013 video = mobj.group('video')
3015 'id': course + '_' + video,
3017 'upload_date': None,
3020 self.report_extraction(info['id'])
3021 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3022 xmlUrl = baseUrl + video + '.xml'
3024 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3025 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3026 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3028 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# IndexError here (no <title>/<videoFile> elements) is treated as bad XML.
3030 info['title'] = mdoc.findall('./title')[0].text
3031 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3033 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3035 info['ext'] = info['url'].rpartition('.')[2]
# Branch 2: course page — scrape VideoPage links and recurse via extract().
3037 elif mobj.group('course'): # A course page
3038 course = mobj.group('course')
3043 'upload_date': None,
3046 self.report_download_webpage(info['id'])
3048 coursepage = compat_urllib_request.urlopen(url).read()
3049 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3050 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3053 m = re.search('<h1>([^<]+)</h1>', coursepage)
3055 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page has no <h1> title.
3057 info['title'] = info['id']
3059 m = re.search('<description>([^<]+)</description>', coursepage)
3061 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps first occurrence order while deduplicating links.
3063 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3066 'type': 'reference',
3067 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3071 for entry in info['list']:
3072 assert entry['type'] == 'reference'
3073 results += self.extract(entry['url'])
# Branch 3: site root — enumerate all CoursePage links and recurse.
3078 'id': 'Stanford OpenClassroom',
3081 'upload_date': None,
3084 self.report_download_webpage(info['id'])
3085 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3087 rootpage = compat_urllib_request.urlopen(rootURL).read()
3088 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3089 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3092 info['title'] = info['id']
3094 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3097 'type': 'reference',
3098 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3103 for entry in info['list']:
3104 assert entry['type'] == 'reference'
3105 results += self.extract(entry['url'])
# NOTE(review): sampled dump — missing lines between the numbered rows;
# comments only, code untouched.
# Scrapes <meta> tags from the MTV video page, then queries the
# mediaGen XML service for renditions and picks the last (highest quality).
3108 class MTVIE(InfoExtractor):
3109 """Information extractor for MTV.com"""
3111 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3114 def report_extraction(self, video_id):
3115 """Report information extraction."""
3116 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3118 def _real_extract(self, url):
3119 mobj = re.match(self._VALID_URL, url)
3121 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs so the download layer gets an absolute URL.
3123 if not mobj.group('proto'):
3124 url = 'http://' + url
3125 video_id = mobj.group('videoid')
3127 webpage = self._download_webpage(url, video_id)
3129 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3131 self._downloader.trouble(u'ERROR: unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a str is a Python-2 idiom;
# _download_webpage already returns text on py3 — confirm runtime.
3133 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3134 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3136 self._downloader.trouble(u'ERROR: unable to extract performer')
3138 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3139 video_title = performer + ' - ' + song_name
3141 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads "unable to mtvn_uri" — verb missing
# ("unable to extract mtvn_uri"); left as-is (runtime string).
3143 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3145 mtvn_uri = mobj.group(1)
3147 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3149 self._downloader.trouble(u'ERROR: unable to extract content id')
3151 content_id = mobj.group(1)
3153 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3154 self.report_extraction(video_id)
3155 request = compat_urllib_request.Request(videogen_url)
3157 metadataXml = compat_urllib_request.urlopen(request).read()
3158 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3159 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3162 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3163 renditions = mdoc.findall('.//rendition')
3165 # For now, always pick the highest quality.
# Assumes renditions are ordered low→high quality — TODO confirm.
3166 rendition = renditions[-1]
# MIME type like "video/mp4" → extension after the slash.
3169 _,_,ext = rendition.attrib['type'].partition('/')
3170 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3171 video_url = rendition.find('./src').text
3173 self._downloader.trouble('Invalid rendition field.')
3179 'uploader': performer,
3180 'upload_date': None,
3181 'title': video_title,
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Youku serves video in numbered segments; this extractor derives the
# real per-segment file ids from a seed-driven character shuffle.
3189 class YoukuIE(InfoExtractor):
3190 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3192 def report_download_webpage(self, file_id):
3193 """Report webpage download."""
3194 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3196 def report_extraction(self, file_id):
3197 """Report information extraction."""
3198 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Builds a session id: millisecond timestamp + two random components.
3201 nowTime = int(time.time() * 1000)
3202 random1 = random.randint(1000,1998)
3203 random2 = random.randint(1000,9999)
3205 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic PRNG shuffle of the source alphabet, keyed by `seed`
# (linear congruential step: seed = (seed*211 + 30031) % 65536).
3207 def _get_file_ID_mix_string(self, seed):
3209 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3211 for i in range(len(source)):
3212 seed = (seed * 211 + 30031 ) % 65536
3213 index = math.floor(seed / 65536 * len(source) )
3214 mixed.append(source[int(index)])
3215 source.remove(source[int(index)])
3216 #return ''.join(mixed)
# Maps the '*'-separated numeric fileId through the shuffled alphabet.
3219 def _get_file_id(self, fileId, seed):
3220 mixed = self._get_file_ID_mix_string(seed)
3221 ids = fileId.split('*')
3225 realId.append(mixed[int(ch)])
3226 return ''.join(realId)
3228 def _real_extract(self, url):
3229 mobj = re.match(self._VALID_URL, url)
3231 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3233 video_id = mobj.group('ID')
3235 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3237 request = compat_urllib_request.Request(info_url, None, std_headers)
3239 self.report_download_webpage(video_id)
3240 jsondata = compat_urllib_request.urlopen(request).read()
3241 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3242 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3245 self.report_extraction(video_id)
3247 jsonstr = jsondata.decode('utf-8')
3248 config = json.loads(jsonstr)
3250 video_title = config['data'][0]['title']
3251 seed = config['data'][0]['seed']
3253 format = self._downloader.params.get('format', None)
3254 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format selection: prefer 'hd2' for best; 'worst'/explicit handled below
# (intervening lines missing from this dump).
3256 if format is None or format == 'best':
3257 if 'hd2' in supported_format:
3262 elif format == 'worst':
3270 fileid = config['data'][0]['streamfileids'][format]
3271 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3272 except (UnicodeDecodeError, ValueError, KeyError):
3273 self._downloader.trouble(u'ERROR: unable to extract info section')
3277 sid = self._gen_sid()
3278 fileid = self._get_file_id(fileid, seed)
3280 #column 8,9 of fileid represent the segment number
3281 #fileid[7:9] should be changed
3282 for index, key in enumerate(keys):
# Splice the hex segment index into chars 8-9 of the file id.
3284 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3285 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# One info dict per segment; parts are suffixed _part00, _part01, ...
3288 'id': '%s_part%02d' % (video_id, index),
3289 'url': download_url,
3291 'upload_date': None,
3292 'title': video_title,
3295 files_info.append(info)
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Straightforward scrape: pull flv_url, title, and thumbnail with three
# class-level regexes from the video page.
3300 class XNXXIE(InfoExtractor):
3301 """Information extractor for xnxx.com"""
3303 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3305 VIDEO_URL_RE = r'flv_url=(.*?)&'
3306 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3307 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3309 def report_webpage(self, video_id):
3310 """Report information extraction"""
3311 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3313 def report_extraction(self, video_id):
3314 """Report information extraction"""
3315 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3317 def _real_extract(self, url):
3318 mobj = re.match(self._VALID_URL, url)
3320 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3322 video_id = mobj.group(1)
3324 self.report_webpage(video_id)
3326 # Get webpage content
3328 webpage_bytes = compat_urllib_request.urlopen(url).read()
3329 webpage = webpage_bytes.decode('utf-8')
3330 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3331 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3334 result = re.search(self.VIDEO_URL_RE, webpage)
3336 self._downloader.trouble(u'ERROR: unable to extract video url')
# The flv_url value is percent-encoded in the page source.
3338 video_url = compat_urllib_parse.unquote(result.group(1))
3340 result = re.search(self.VIDEO_TITLE_RE, webpage)
3342 self._downloader.trouble(u'ERROR: unable to extract video title')
3344 video_title = result.group(1)
3346 result = re.search(self.VIDEO_THUMB_RE, webpage)
3348 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3350 video_thumbnail = result.group(1)
3356 'upload_date': None,
3357 'title': video_title,
3359 'thumbnail': video_thumbnail,
3360 'description': None,
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Two-step scrape: (1) parse the Google+ post page for metadata,
# (2) follow the photo/video page and collect googlevideo redirector links.
3364 class GooglePlusIE(InfoExtractor):
3365 """Information extractor for plus.google.com."""
3367 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3368 IE_NAME = u'plus.google'
3370 def __init__(self, downloader=None):
3371 InfoExtractor.__init__(self, downloader)
# NOTE(review): "extry" in the next three docstrings is a typo for
# "entry" — docstrings only, no runtime effect.
3373 def report_extract_entry(self, url):
3374 """Report downloading extry"""
3375 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3377 def report_date(self, upload_date):
3378 """Report downloading extry"""
3379 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3381 def report_uploader(self, uploader):
3382 """Report downloading extry"""
3383 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3385 def report_title(self, video_title):
3386 """Report downloading extry"""
3387 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3389 def report_extract_vid_page(self, video_page):
3390 """Report information extraction."""
3391 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3393 def _real_extract(self, url):
3394 # Extract id from URL
3395 mobj = re.match(self._VALID_URL, url)
3397 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3400 post_url = mobj.group(0)
3401 video_id = mobj.group(1)
3403 video_extension = 'flv'
3405 # Step 1, Retrieve post webpage to extract further information
3406 self.report_extract_entry(post_url)
3407 request = compat_urllib_request.Request(post_url)
3409 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3410 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3411 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3414 # Extract update date
3416 pattern = 'title="Timestamp">(.*?)</a>'
3417 mobj = re.search(pattern, webpage)
3419 upload_date = mobj.group(1)
3420 # Convert timestring to a format suitable for filename
# Page date "YYYY-MM-DD" → canonical "YYYYMMDD" upload_date.
3421 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3422 upload_date = upload_date.strftime('%Y%m%d')
3423 self.report_date(upload_date)
3427 pattern = r'rel\="author".*?>(.*?)</a>'
3428 mobj = re.search(pattern, webpage)
3430 uploader = mobj.group(1)
3431 self.report_uploader(uploader)
3434 # Get the first line for title
3436 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3437 mobj = re.search(pattern, webpage)
3439 video_title = mobj.group(1)
3440 self.report_title(video_title)
3442 # Step 2, Stimulate clicking the image box to launch video
3443 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3444 mobj = re.search(pattern, webpage)
3446 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3448 video_page = mobj.group(1)
3449 request = compat_urllib_request.Request(video_page)
3451 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3452 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3453 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3455 self.report_extract_vid_page(video_page)
3458 # Extract video links on video page
3459 """Extract video links of all sizes"""
3460 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3461 mobj = re.findall(pattern, webpage)
3463 self._downloader.trouble(u'ERROR: unable to extract video links')
3465 # Sort in resolution
# Tuples sort by the leading resolution field, ascending.
3466 links = sorted(mobj)
3468 # Choose the lowest of the sort, i.e. highest resolution
3469 video_url = links[-1]
3470 # Only get the url. The resolution part in the tuple has no use anymore
3471 video_url = video_url[-1]
3472 # Treat escaped \u0026 style hex
# str has no .decode on py3 → AttributeError path re-encodes then decodes.
3474 video_url = video_url.decode("unicode_escape")
3475 except AttributeError: # Python 3
3476 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3482 'uploader': uploader,
3483 'upload_date': upload_date,
3484 'title': video_title,
3485 'ext': video_extension,
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Builds the CDN mp4 URL directly from the path id; scrapes metadata
# with a small regex helper.
3488 class NBAIE(InfoExtractor):
3489 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3492 def _real_extract(self, url):
3493 mobj = re.match(self._VALID_URL, url)
3495 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3498 video_id = mobj.group(1)
3499 if video_id.endswith('/index.html'):
3500 video_id = video_id[:-len('/index.html')]
3502 webpage = self._download_webpage(url, video_id)
3504 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure helper: first regex group from `webpage`, HTML-unescaped,
# or `default` when the pattern does not match.
3505 def _findProp(rexp, default=None):
3506 m = re.search(rexp, webpage)
3508 return unescapeHTML(m.group(1))
3512 shortened_video_id = video_id.rpartition('/')[2]
3513 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3515 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for 'upload_date'
# (see class docs' field list) — confirm against consumers before fixing.
3519 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3520 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): sampled dump — missing lines between numbered rows
# (paging loop header and return are absent); comments only, code untouched.
# Uses the justin.tv JSON API; channel archives are fetched in pages of
# _JUSTIN_PAGE_LIMIT, single clips in one request.
3524 class JustinTVIE(InfoExtractor):
3525 """Information extractor for justin.tv and twitch.tv"""
3526 # TODO: One broadcast may be split into multiple videos. The key
3527 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3528 # starts at 1 and increases. Can we treat all parts as one video?
3530 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3531 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3532 _JUSTIN_PAGE_LIMIT = 100
3533 IE_NAME = u'justin.tv'
3535 def report_extraction(self, file_id):
3536 """Report information extraction."""
3537 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3539 def report_download_page(self, channel, offset):
3540 """Report attempt to download a single page of videos."""
3541 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3542 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3544 # Return count of items, list of *valid* items
3545 def _parse_page(self, url):
3547 urlh = compat_urllib_request.urlopen(url)
3548 webpage_bytes = urlh.read()
# 'ignore' drops undecodable bytes rather than failing the whole page.
3549 webpage = webpage_bytes.decode('utf-8', 'ignore')
3550 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3551 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3554 response = json.loads(webpage)
3556 for clip in response:
3557 video_url = clip['video_file_url']
3559 video_extension = os.path.splitext(video_url)[1][1:]
# 'created_on' starts "YYYY-MM-DD..."; strip dashes → "YYYYMMDD".
3560 video_date = re.sub('-', '', clip['created_on'][:10])
3564 'title': clip['title'],
3565 'uploader': clip.get('user_id', clip.get('channel_id')),
3566 'upload_date': video_date,
3567 'ext': video_extension,
3569 return (len(response), info)
3571 def _real_extract(self, url):
3572 mobj = re.match(self._VALID_URL, url)
3574 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3577 api = 'http://api.justin.tv'
# lastindex == 2 when the /b/<id> clip group matched; 1 → whole channel.
3578 video_id = mobj.group(mobj.lastindex)
3580 if mobj.lastindex == 1:
3582 api += '/channel/archives/%s.json'
3584 api += '/clip/show/%s.json'
3585 api = api % (video_id,)
3587 self.report_extraction(video_id)
3591 limit = self._JUSTIN_PAGE_LIMIT
3594 self.report_download_page(video_id, offset)
3595 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3596 page_count, page_info = self._parse_page(page_url)
3597 info.extend(page_info)
# A short page (or unpaged request) means we've reached the end.
3598 if not paged or page_count != limit:
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Scrapes the <video>/<source> tags and og:description meta from the page.
3603 class FunnyOrDieIE(InfoExtractor):
3604 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3606 def _real_extract(self, url):
3607 mobj = re.match(self._VALID_URL, url)
3609 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3612 video_id = mobj.group('id')
3613 webpage = self._download_webpage(url, video_id)
# DOTALL lets the pattern span the whitespace/newlines between tags.
3615 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3617 self._downloader.trouble(u'ERROR: unable to find video information')
3618 video_url = unescapeHTML(m.group('url'))
3620 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3622 self._downloader.trouble(u'Cannot find video title')
3623 title = unescapeHTML(m.group('title'))
3625 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3627 desc = unescapeHTML(m.group('desc'))
3636 'description': desc,
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Resolves a tweetreel short URL to the status id, then builds the
# direct .mov URL on files.tweetreel.com.
3640 class TweetReelIE(InfoExtractor):
3641 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3643 def _real_extract(self, url):
3644 mobj = re.match(self._VALID_URL, url)
3646 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3649 video_id = mobj.group('id')
3650 webpage = self._download_webpage(url, video_id)
3652 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3654 self._downloader.trouble(u'ERROR: Cannot find status ID')
3655 status_id = m.group(1)
3657 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3659 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip anchor tags from the tweet text before unescaping.
3660 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3662 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3664 self._downloader.trouble(u'ERROR: Cannot find uploader')
3665 uploader = unescapeHTML(m.group('uploader'))
3666 uploader_id = unescapeHTML(m.group('uploader_id'))
3668 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3670 self._downloader.trouble(u'ERROR: Cannot find upload date')
# NOTE(review): fromtimestamp uses local time, so upload_date can vary
# by the machine's timezone — confirm if UTC was intended.
3671 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3674 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3681 'description': desc,
3682 'uploader': uploader,
3683 'uploader_id': uploader_id,
3684 'internal_id': status_id,
3685 'upload_date': upload_date
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# The store page embeds 'movie_<id>' JS blobs; each is paired positionally
# with a <span class="title"> for its name.
3689 class SteamIE(InfoExtractor):
3690 _VALID_URL = r"""http://store.steampowered.com/
3691 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3693 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
3696 def suitable(self, url):
3697 """Receives a URL and returns True if suitable for this IE."""
3698 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3700 def _real_extract(self, url):
3701 m = re.match(self._VALID_URL, url, re.VERBOSE)
3702 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3703 gameID = m.group('gameID')
3704 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3705 webpage = self._download_webpage(videourl, gameID)
3706 mweb = re.finditer(urlRE, webpage)
3707 namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
3708 titles = list(re.finditer(namesRE, webpage))
# zip pairs movies with titles by document order — assumes both lists
# appear in the same order on the page (TODO confirm).
3710 for vid,vtitle in zip(mweb,titles):
3711 video_id = vid.group('videoID')
3712 title = vtitle.group('videoName')
3713 video_url = vid.group('videoURL')
3715 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Recorded-video URL maps directly to the tcdn.ustream.tv CDN path;
# title and uploader are scraped from data- attributes on the page.
3725 class UstreamIE(InfoExtractor):
3726 _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3727 IE_NAME = u'ustream'
3729 def _real_extract(self, url):
# No "invalid URL" guard here — suitable() already matched _VALID_URL.
3730 m = re.match(self._VALID_URL, url)
3731 video_id = m.group('videoID')
3732 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3733 webpage = self._download_webpage(url, video_id)
3734 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3735 title = m.group('title')
3736 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3737 uploader = m.group('uploader')
3743 'uploader': uploader
3748 def gen_extractors():
3749 """ Return a list of an instance of every supported extractor.
3750 The order does matter; the first extractor matched is the one handling the URL.
3753 YoutubePlaylistIE(),
3777 StanfordOpenClassroomIE(),