2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
16 __license__ = 'Public Domain'
17 __version__ = '2011.08.28-phihag'
19 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
47 except ImportError: # Python 2.4
50 import cStringIO as StringIO
54 # parse_qs was moved from the cgi module to the urlparse module recently.
56 from urlparse import parse_qs
58 from cgi import parse_qs
# Default HTTP request headers sent with every request (impersonates
# Firefox 5 on Linux so sites serve the normal desktop pages).
# NOTE(review): this listing is elided — the std_headers dict braces
# around these entries are missing from this copy.
66 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
67 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
68 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
69 'Accept-Encoding': 'gzip, deflate',
70 'Accept-Language': 'en-us,en;q=0.5',
# Character set allowed in "simplified" titles (ASCII letters + digits).
73 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# Fallback pure-Python JSON parser ("trivialjson") used when the stdlib
# json module is unavailable (Python < 2.6). It is a recursive-descent
# parser closing over the input string `s`.
# NOTE(review): this listing is heavily elided — the enclosing `try:
# import json` block, the `def loads(s):` line and many body lines are
# missing from this copy; recover them from upstream before editing.
77 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# Raise a ValueError carrying position and the remaining input tail.
83 def raiseError(msg, i):
84 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
# Advance past JSON whitespace; errors out on premature end when expectMore.
85 def skipSpace(i, expectMore=True):
86 while i < len(s) and s[i] in ' \t\r\n':
90 raiseError('Premature end', i)
# Decode one backslash escape; \uXXXX pairs are combined as UTF-16
# surrogates into a single code point.
92 def decodeEscape(match):
108 return unichr(int(esc[1:5], 16))
109 if len(esc) == 5+6 and esc[5:7] == '\\u':
110 hi = int(esc[1:5], 16)
111 low = int(esc[7:11], 16)
112 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
113 raise ValueError('Unknown escape ' + str(esc))
# Count trailing backslashes to find the true (unescaped) closing quote.
120 while s[e-bslashes-1] == '\\':
122 if bslashes % 2 == 1:
126 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
127 stri = rexp.sub(decodeEscape, s[i:e])
133 if s[i] == '}': # Empty dictionary
137 raiseError('Expected a string object key', i)
138 i,key = parseString(i)
140 if i >= len(s) or s[i] != ':':
141 raiseError('Expected a colon', i)
148 raiseError('Expected comma or closing curly brace', i)
153 if s[i] == ']': # Empty array
158 i = skipSpace(i) # Raise exception if premature end
162 raiseError('Expected a comma or closing bracket', i)
# true/false/null literals.
164 def parseDiscrete(i):
165 for k,v in {'true': True, 'false': False, 'null': None}.items():
166 if s.startswith(k, i):
168 raiseError('Not a boolean (or null)', i)
# Numbers: int unless a fraction or exponent marker is present.
170 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
172 raiseError('Not a number', i)
174 if '.' in nums or 'e' in nums or 'E' in nums:
175 return (i+len(nums), float(nums))
176 return (i+len(nums), int(nums))
# Dispatch on the first significant character; default is number parsing.
177 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
180 i,res = CHARMAP.get(s[i], parseNumber)(i)
181 i = skipSpace(i, False)
185 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
# Determine the encoding used when printing to the console.
# NOTE(review): this listing is elided — the try/except guarding
# locale.getpreferredencoding() and the generator's yield lines are
# missing from this copy.
188 def preferredencoding():
189 """Get preferred encoding.
191 Returns the best encoding scheme for the system, based on
192 locale.getpreferredencoding() and some further tweaks.
194 def yield_preferredencoding():
196 pref = locale.getpreferredencoding()
# .next() is the Python 2 generator protocol (next() builtin in py3).
202 return yield_preferredencoding().next()
# re.sub callback: decode one HTML entity match to a unicode character.
# NOTE(review): elided listing — the docstring closer and the branch
# selecting base 16 vs 10 for numeric entities are missing from this copy.
204 def htmlentity_transform(matchobj):
205 """Transforms an HTML entity to a Unicode character.
207 This function receives a match object and is intended to be used with
208 the re.sub() function.
210 entity = matchobj.group(1)
212 # Known non-numeric HTML entity
213 if entity in htmlentitydefs.name2codepoint:
214 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: #nnn (decimal) or #xnnn (hex).
217 mobj = re.match(ur'(?u)#(x?\d+)', entity)
219 numstr = mobj.group(1)
220 if numstr.startswith(u'x'):
# '0x...' so the int/long conversion below parses it as hexadecimal.
222 numstr = u'0%s' % numstr
225 return unichr(long(numstr, base))
227 # Unknown entity in name, return its literal representation
228 return (u'&%s;' % entity)
230 def sanitize_title(utitle):
231 """Sanitizes a video title so it could be used as part of a filename."""
232 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
233 return utitle.replace(unicode(os.sep), u'%')
# Open a file for writing, retrying with a cleaned-up name on failure.
# NOTE(review): elided listing — the initial try:, the '-' (stdout)
# check and the import msvcrt line are missing from this copy.
235 def sanitize_open(filename, open_mode):
236 """Try to open the given filename, and slightly tweak it if this fails.
238 Attempts to open the given filename. If this fails, it tries to change
239 the filename slightly, step by step, until it's either able to open it
240 or it fails and raises a final exception, like the standard open()
243 It returns the tuple (stream, definitive_file_name).
# On Windows, put stdout into binary mode so video data is not mangled.
247 if sys.platform == 'win32':
249 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
250 return (sys.stdout, filename)
251 stream = open(filename, open_mode)
252 return (stream, filename)
253 except (IOError, OSError), err:
254 # In case of error, try to remove win32 forbidden chars
255 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
257 # An exception here should be caught in the caller
258 stream = open(filename, open_mode)
259 return (stream, filename)
# NOTE(review): elided listing — the `timestamp = None` initialisation
# and the final `return timestamp` are missing from this copy.
261 def timeconvert(timestr):
262 """Convert RFC 2822 defined time string into system timestamp"""
264 timetuple = email.utils.parsedate_tz(timestr)
# parsedate_tz returns None on unparseable input; only convert on success.
265 if timetuple is not None:
266 timestamp = email.utils.mktime_tz(timetuple)
# Exception hierarchy used throughout the downloader.
# NOTE(review): elided listing — the closing docstring quotes and
# `pass` bodies of these classes are missing from this copy.
269 class DownloadError(Exception):
270 """Download Error exception.
272 This exception may be thrown by FileDownloader objects if they are not
273 configured to continue on errors. They will contain the appropriate
278 class SameFileError(Exception):
279 """Same File exception.
281 This exception will be thrown by FileDownloader objects if they detect
282 multiple files would have to be downloaded to the same file on disk.
286 class PostProcessingError(Exception):
287 """Post Processing exception.
289 This exception may be raised by PostProcessor's .run() method to
290 indicate an error in the postprocessing task.
294 class UnavailableVideoError(Exception):
295 """Unavailable Format exception.
297 This exception will be thrown when a video is requested
298 in a format that is not available for that video.
302 class ContentTooShortError(Exception):
303 """Content Too Short exception.
305 This exception may be raised by FileDownloader objects when a file they
306 download is too small for what the server announced first, indicating
307 the connection was probably interrupted.
# downloaded: bytes actually received; expected: bytes announced by server.
313 def __init__(self, downloaded, expected):
314 self.downloaded = downloaded
315 self.expected = expected
# urllib2 handler adding std_headers and transparently decompressing
# gzip/deflate responses.
# NOTE(review): elided listing — the @staticmethod decorators, the
# deflate() fallback try/except and several return statements are
# missing from this copy.
317 class YoutubeDLHandler(urllib2.HTTPHandler):
318 """Handler for HTTP requests and responses.
320 This class, when installed with an OpenerDirector, automatically adds
321 the standard headers to every HTTP request and handles gzipped and
322 deflated responses from web servers. If compression is to be avoided in
323 a particular request, the original request in the program code only has
324 to include the HTTP header "Youtubedl-No-Compression", which will be
325 removed before making the real request.
327 Part of this code was copied from:
329 http://techknack.net/python-urllib2-handlers/
331 Andrew Rowls, the author of that code, agreed to release it to the
# Raw deflate first (-MAX_WBITS = no zlib header), then zlib-wrapped.
338 return zlib.decompress(data, -zlib.MAX_WBITS)
340 return zlib.decompress(data)
# Older Pythons' addinfourl has no code argument/getcode; emulate it.
343 def addinfourl_wrapper(stream, headers, url, code):
344 if hasattr(urllib2.addinfourl, 'getcode'):
345 return urllib2.addinfourl(stream, headers, url, code)
346 ret = urllib2.addinfourl(stream, headers, url)
# Add each default header unless the caller already set it; the
# Youtubedl-no-compression marker strips Accept-encoding and is removed.
350 def http_request(self, req):
351 for h in std_headers:
354 req.add_header(h, std_headers[h])
355 if 'Youtubedl-no-compression' in req.headers:
356 if 'Accept-encoding' in req.headers:
357 del req.headers['Accept-encoding']
358 del req.headers['Youtubedl-no-compression']
# Wrap compressed bodies in a decompressing file object, preserving
# the original response metadata.
361 def http_response(self, req, resp):
364 if resp.headers.get('Content-encoding', '') == 'gzip':
365 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
366 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
367 resp.msg = old_resp.msg
369 if resp.headers.get('Content-encoding', '') == 'deflate':
370 gz = StringIO.StringIO(self.deflate(resp.read()))
371 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
372 resp.msg = old_resp.msg
375 class FileDownloader(object):
376 """File Downloader class.
378 File downloader objects are the ones responsible of downloading the
379 actual video file and writing it to disk if the user has requested
380 it, among some other tasks. In most cases there should be one per
381 program. As, given a video URL, the downloader doesn't know how to
382 extract all the needed information, task that InfoExtractors do, it
383 has to pass the URL to one of them.
385 For this, file downloader objects have a method that allows
386 InfoExtractors to be registered in a given order. When it is passed
387 a URL, the file downloader handles it to the first InfoExtractor it
388 finds that reports being able to handle it. The InfoExtractor extracts
389 all the information about the video or videos the URL refers to, and
390 asks the FileDownloader to process the video information, possibly
391 downloading the video.
393 File downloaders accept a lot of parameters. In order not to saturate
394 the object constructor with arguments, it receives a dictionary of
395 options instead. These options are available through the params
396 attribute for the InfoExtractors to use. The FileDownloader also
397 registers itself as the downloader in charge for the InfoExtractors
398 that are added to it, so this is a "mutual registration".
402 username: Username for authentication purposes.
403 password: Password for authentication purposes.
404 usenetrc: Use netrc for authentication instead.
405 quiet: Do not print messages to stdout.
406 forceurl: Force printing final URL.
407 forcetitle: Force printing title.
408 forcethumbnail: Force printing thumbnail URL.
409 forcedescription: Force printing description.
410 forcefilename: Force printing final filename.
411 simulate: Do not download the video files.
412 format: Video format code.
413 format_limit: Highest quality format to try.
414 outtmpl: Template for output names.
415 ignoreerrors: Do not stop on download errors.
416 ratelimit: Download speed limit, in bytes/sec.
417 nooverwrites: Prevent overwriting files.
418 retries: Number of times to retry for HTTP error 5xx
419 continuedl: Try to continue downloads if possible.
420 noprogress: Do not print the progress bar.
421 playliststart: Playlist item to start at.
422 playlistend: Playlist item to end at.
423 logtostderr: Log messages to stderr instead of stdout.
424 consoletitle: Display progress in console window's titlebar.
425 nopart: Do not use temporary .part files.
426 updatetime: Use the Last-modified header to set output file timestamps.
427 writedescription: Write the video description to a .description file
428 writeinfojson: Write the video description to a .info.json file
# Class-level defaults; real values are set per-instance in __init__.
434 _download_retcode = None
435 _num_downloads = None
# NOTE(review): elided listing — the lines storing params and the
# _ies/_pps list initialisations are missing from this copy.
438 def __init__(self, params):
439 """Create a FileDownloader object with the given options."""
442 self._download_retcode = 0
443 self._num_downloads = 0
# logtostderr selects index 1 (stderr) of the two-element list.
444 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# NOTE(review): elided listing — the os.mkdir(dir) call inside the
# loop is missing from this copy.
448 def pmkdir(filename):
449 """Create directory components in filename. Similar to Unix "mkdir -p"."""
450 components = filename.split(os.sep)
# Build every ancestor path: components[0:1], [0:2], ... (excludes the
# final component, which is the file itself).
451 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
452 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
453 for dir in aggregate:
454 if not os.path.exists(dir):
# Render a byte count as a human-readable string, e.g. 1536 -> '1.50k'.
# NOTE(review): elided listing — the None/0 guards between these lines
# are missing from this copy.
458 def format_bytes(bytes):
461 if type(bytes) is str:
# Exponent of 1024 picks the suffix: b, k, M, G, ...
466 exponent = long(math.log(bytes, 1024.0))
467 suffix = 'bkMGTPEZY'[exponent]
468 converted = float(bytes) / float(1024**exponent)
469 return '%.2f%s' % (converted, suffix)
# Progress percentage, right-aligned in 6 chars (e.g. ' 42.0%').
# NOTE(review): elided listing — the data_len is None guard is missing
# from this copy.
472 def calc_percent(byte_counter, data_len):
475 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# Estimated time remaining as 'MM:SS', based on average rate so far.
# NOTE(review): elided listing — the dif computation, the '--:--'
# fallbacks and the eta cap are missing from this copy.
478 def calc_eta(start, now, total, current):
482 if current == 0 or dif < 0.001: # One millisecond
484 rate = float(current) / dif
485 eta = long((float(total) - float(current)) / rate)
486 (eta_mins, eta_secs) = divmod(eta, 60)
489 return '%02d:%02d' % (eta_mins, eta_secs)
# Average download speed as a 10-char right-aligned string.
# NOTE(review): elided listing — the `dif = now - start` line is
# missing from this copy.
492 def calc_speed(start, now, bytes):
494 if bytes == 0 or dif < 0.001: # One millisecond
495 return '%10s' % '---b/s'
496 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Adapt the next read size to recent throughput, clamped to at most 2x
# growth / 0.5x shrink per step and an absolute 4 MB ceiling.
# NOTE(review): elided listing — the return statements choosing between
# new_min/new_max/rate are missing from this copy.
499 def best_block_size(elapsed_time, bytes):
500 new_min = max(bytes / 2.0, 1.0)
501 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
502 if elapsed_time < 0.001:
504 rate = bytes / elapsed_time
# NOTE(review): elided listing — the `if matchobj is None: return None`
# guard is missing from this copy.
512 def parse_bytes(bytestr):
513 """Parse a string indicating a byte quantity into a long integer."""
# Optional decimal number followed by an optional binary-suffix letter.
514 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
517 number = float(matchobj.group(1))
# index() of the (lowercased) suffix in 'bkmgtpezy' gives the power of
# 1024; an empty suffix would raise here — assumed handled by caller.
518 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
519 return long(round(number * multiplier))
# Mutual registration: the downloader keeps the IE/PP, and the IE/PP
# gets a back-reference to the downloader.
# NOTE(review): elided listing — the list .append() calls are missing
# from this copy.
521 def add_info_extractor(self, ie):
522 """Add an InfoExtractor object to the end of the list."""
524 ie.set_downloader(self)
526 def add_post_processor(self, pp):
527 """Add a PostProcessor object to the end of the chain."""
529 pp.set_downloader(self)
# NOTE(review): elided listing — the try: line and the re-raise in the
# except branch are missing from this copy.
531 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
532 """Print message to stdout if not in quiet mode."""
534 if not self.params.get('quiet', False):
# skip_eol selects u'' (index 1) instead of the newline terminator.
535 terminator = [u'\n', u''][skip_eol]
# Trailing comma suppresses print's own newline (Python 2 syntax).
536 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
537 self._screen_file.flush()
538 except (UnicodeEncodeError), err:
539 if not ignore_encoding_errors:
def to_stderr(self, message):
	"""Write *message* to stderr, encoded for the current locale."""
	encoded = message.encode(preferredencoding())
	print >>sys.stderr, encoded
# NOTE(review): elided listing — the early `return` after the
# consoletitle check is missing from this copy.
546 def to_cons_title(self, message):
547 """Set console/terminal window title to message."""
548 if not self.params.get('consoletitle', False):
# Windows console: use the kernel32 API directly.
550 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
551 # c_wchar_p() might not be necessary if `message` is
552 # already of type unicode()
553 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
# Other terminals: xterm OSC escape sequence (ESC ] 0 ; title BEL).
554 elif 'TERM' in os.environ:
555 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
557 def fixed_template(self):
558 """Checks if the output template is fixed."""
559 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
# NOTE(review): elided listing — the docstring closer is missing from
# this copy.
561 def trouble(self, message=None):
562 """Determine action to take when a download problem appears.
564 Depending on if the downloader has been configured to ignore
565 download errors or not, this method may throw an exception or
566 not when errors are found, after printing the message.
568 if message is not None:
569 self.to_stderr(message)
# Without ignoreerrors the run aborts; otherwise we only record a
# non-zero exit code and continue.
570 if not self.params.get('ignoreerrors', False):
571 raise DownloadError(message)
572 self._download_retcode = 1
# NOTE(review): elided listing — the early return, `now = time.time()`
# and the elapsed guard are missing from this copy.
574 def slow_down(self, start_time, byte_counter):
575 """Sleep if the download speed is over the rate limit."""
576 rate_limit = self.params.get('ratelimit', None)
577 if rate_limit is None or byte_counter == 0:
580 elapsed = now - start_time
583 speed = float(byte_counter) / elapsed
584 if speed > rate_limit:
# Sleep exactly long enough that the average speed drops back to the limit.
585 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
# NOTE(review): elided listing — the `return filename` for the
# no-part case is missing from this copy.
587 def temp_name(self, filename):
588 """Returns a temporary filename for the given filename."""
# Use the real name when .part files are disabled, when writing to
# stdout ('-'), or when the path exists but is not a regular file.
589 if self.params.get('nopart', False) or filename == u'-' or \
590 (os.path.exists(filename) and not os.path.isfile(filename)):
592 return filename + u'.part'
# Strip a trailing '.part' suffix, if present.
# NOTE(review): elided listing — the fallback `return filename`, the
# try:, and try_rename's early return are missing from this copy.
594 def undo_temp_name(self, filename):
595 if filename.endswith(u'.part'):
596 return filename[:-len(u'.part')]
# Rename the finished .part file onto its final name; failures are
# reported through self.trouble rather than raised.
599 def try_rename(self, old_filename, new_filename):
601 if old_filename == new_filename:
603 os.rename(old_filename, new_filename)
604 except (IOError, OSError), err:
605 self.trouble(u'ERROR: unable to rename file')
# NOTE(review): elided listing — the early returns and the try/except
# around os.utime are missing from this copy.
607 def try_utime(self, filename, last_modified_hdr):
608 """Try to set the last-modified time of the given file."""
609 if last_modified_hdr is None:
611 if not os.path.isfile(filename):
613 timestr = last_modified_hdr
# timeconvert parses the RFC 2822 Last-modified header value.
616 filetime = timeconvert(timestr)
# Keep atime current; set mtime to the server-reported time.
620 os.utime(filename,(time.time(), filetime))
def report_writedescription(self, descfn):
	"""Announce that the video description is being written to *descfn*."""
	msg = u'[info] Writing video description to: %s' % descfn
	self.to_screen(msg, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
	"""Announce that the JSON metadata file *infofn* has been written."""
	msg = u'[info] Video description metadata as JSON to: %s' % infofn
	self.to_screen(msg, ignore_encoding_errors=True)
def report_destination(self, filename):
	"""Announce the file the download will be written to."""
	msg = u'[download] Destination: %s' % filename
	self.to_screen(msg, ignore_encoding_errors=True)
# NOTE(review): elided listing — the `return` after the noprogress
# check is missing from this copy.
636 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
637 """Report download progress."""
638 if self.params.get('noprogress', False):
# \r + skip_eol redraws the same console line on every update.
640 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
641 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
# Mirror the progress into the terminal title bar when enabled.
642 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
643 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
	"""Announce that the download resumes at byte offset *resume_len*."""
	msg = u'[download] Resuming download at byte %s' % resume_len
	self.to_screen(msg)
def report_retry(self, count, retries):
	"""Announce retry *count* of *retries* after a server 5xx error."""
	msg = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_screen(msg)
# NOTE(review): elided listing — the try: line is missing from this copy.
653 def report_file_already_downloaded(self, file_name):
654 """Report file has already been fully downloaded."""
656 self.to_screen(u'[download] %s has already been downloaded' % file_name)
# Fall back to a name-free message if the filename cannot be encoded.
657 except (UnicodeEncodeError), err:
658 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Announce that resuming the partial download was not possible."""
	self.to_screen(u'[download] Unable to resume')
# NOTE(review): elided listing — the else branch (newline after the
# progress line) is missing from this copy.
664 def report_finish(self):
665 """Report download finished."""
# With the progress bar disabled, print an explicit completion line.
666 if self.params.get('noprogress', False):
667 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
	"""Advance the per-run download counter (feeds %(autonumber)s)."""
	self._num_downloads = self._num_downloads + 1
# NOTE(review): elided listing — the try: line, the return of the
# formatted filename and the `return None` in the except branch are
# missing from this copy.
675 def prepare_filename(self, info_dict):
676 """Generate the output filename."""
677 template_dict = dict(info_dict)
# Extra template fields beyond what the IE provides.
679 template_dict['epoch'] = unicode(long(time.time()))
680 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
681 filename = self.params['outtmpl'] % template_dict
683 except (ValueError, KeyError), err:
684 self.trouble(u'ERROR: invalid system charset or erroneous output template')
# Orchestrates one download: forced printing, overwrite check,
# directory creation, sidecar files, the actual download, post-processing.
# NOTE(review): elided listing — several try:/return/close() lines
# between the numbered lines are missing from this copy.
687 def process_info(self, info_dict):
688 """Process a single dictionary returned by an InfoExtractor."""
689 filename = self.prepare_filename(info_dict)
690 # Do nothing else if in simulate mode
691 if self.params.get('simulate', False):
# --get-title/--get-url/... style forced printing to stdout.
693 if self.params.get('forcetitle', False):
694 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
695 if self.params.get('forceurl', False):
696 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
697 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
698 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
699 if self.params.get('forcedescription', False) and 'description' in info_dict:
700 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
701 if self.params.get('forcefilename', False) and filename is not None:
702 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
708 if self.params.get('nooverwrites', False) and os.path.exists(filename):
709 self.to_stderr(u'WARNING: file exists and will be skipped')
713 self.pmkdir(filename)
714 except (OSError, IOError), err:
715 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
# Optional .description sidecar file (UTF-8 encoded).
718 if self.params.get('writedescription', False):
720 descfn = filename + '.description'
721 self.report_writedescription(descfn)
722 descfile = open(descfn, 'wb')
724 descfile.write(info_dict['description'].encode('utf-8'))
727 except (OSError, IOError):
728 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
# Optional .info.json sidecar; `json` may be the trivialjson fallback.
731 if self.params.get('writeinfojson', False):
732 infofn = filename + '.info.json'
733 self.report_writeinfojson(infofn)
736 except (NameError,AttributeError):
737 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
740 infof = open(infofn, 'wb')
742 json.dump(info_dict, infof)
745 except (OSError, IOError):
746 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
# The download itself; URL is byte-encoded for urllib2.
750 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
751 except (OSError, IOError), err:
752 raise UnavailableVideoError
753 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
754 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
756 except (ContentTooShortError, ), err:
757 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
762 self.post_process(filename, info_dict)
763 except (PostProcessingError), err:
764 self.trouble(u'ERROR: postprocessing: %s' % str(err))
# NOTE(review): elided listing — the `for url in url_list:` /
# `for ie in self._ies:` loop headers, `continue`, and the
# ie.extract(url) call are missing from this copy.
767 def download(self, url_list):
768 """Download a given list of URLs."""
# A fixed (field-free) template cannot hold more than one file.
769 if len(url_list) > 1 and self.fixed_template():
770 raise SameFileError(self.params['outtmpl'])
773 suitable_found = False
775 # Go to next InfoExtractor if not suitable
776 if not ie.suitable(url):
779 # Suitable InfoExtractor found
780 suitable_found = True
782 # Extract information from URL and process it
785 # Suitable InfoExtractor had been found; go to next URL
788 if not suitable_found:
789 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
# 0 on success, 1 if trouble() was hit with ignoreerrors set.
791 return self._download_retcode
# NOTE(review): elided listing — the copy of ie_info into `info`, the
# loop over self._pps and the pp.run() call are missing from this copy.
793 def post_process(self, filename, ie_info):
794 """Run the postprocessing chain on the given file."""
796 info['filepath'] = filename
# Delegate RTMP URLs to the external rtmpdump binary, resuming with
# '-e' until it stops making progress.
# NOTE(review): elided listing — the try:/return lines, the break on
# stalled size, and the success return are missing from this copy.
802 def _download_with_rtmpdump(self, filename, url, player_url):
803 self.report_destination(filename)
804 tmpfilename = self.temp_name(filename)
806 # Check for rtmpdump first
# 'rtmpdump -h' as an availability probe; output discarded.
808 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
809 except (OSError, IOError):
810 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
813 # Download using rtmpdump. rtmpdump returns exit code 2 when
814 # the connection was interrumpted and resuming appears to be
815 # possible. This is part of rtmpdump's normal usage, AFAIK.
# [cond] indexing picks the optional argument lists (py2 idiom).
816 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
817 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
818 while retval == 2 or retval == 1:
819 prevsize = os.path.getsize(tmpfilename)
820 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
821 time.sleep(5.0) # This seems to be needed
822 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
823 cursize = os.path.getsize(tmpfilename)
# No growth and exit code 1 means rtmpdump is done/stuck — stop retrying.
824 if prevsize == cursize and retval == 1:
827 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
828 self.try_rename(tmpfilename, filename)
831 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
834 def _do_download(self, filename, url, player_url):
835 # Check file already present
836 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
837 self.report_file_already_downloaded(filename)
840 # Attempt to download using rtmpdump
841 if url.startswith('rtmp'):
842 return self._download_with_rtmpdump(filename, url, player_url)
844 tmpfilename = self.temp_name(filename)
848 # Do not include the Accept-Encoding header
849 headers = {'Youtubedl-no-compression': 'True'}
850 basic_request = urllib2.Request(url, None, headers)
851 request = urllib2.Request(url, None, headers)
853 # Establish possible resume length
854 if os.path.isfile(tmpfilename):
855 resume_len = os.path.getsize(tmpfilename)
859 # Request parameters in case of being able to resume
860 if self.params.get('continuedl', False) and resume_len != 0:
861 self.report_resuming_byte(resume_len)
862 request.add_header('Range','bytes=%d-' % resume_len)
866 retries = self.params.get('retries', 0)
867 while count <= retries:
868 # Establish connection
870 data = urllib2.urlopen(request)
872 except (urllib2.HTTPError, ), err:
873 if (err.code < 500 or err.code >= 600) and err.code != 416:
874 # Unexpected HTTP error
876 elif err.code == 416:
877 # Unable to resume (requested range not satisfiable)
879 # Open the connection again without the range header
880 data = urllib2.urlopen(basic_request)
881 content_length = data.info()['Content-Length']
882 except (urllib2.HTTPError, ), err:
883 if err.code < 500 or err.code >= 600:
886 # Examine the reported length
887 if (content_length is not None and
888 (resume_len - 100 < long(content_length) < resume_len + 100)):
889 # The file had already been fully downloaded.
890 # Explanation to the above condition: in issue #175 it was revealed that
891 # YouTube sometimes adds or removes a few bytes from the end of the file,
892 # changing the file size slightly and causing problems for some users. So
893 # I decided to implement a suggested change and consider the file
894 # completely downloaded if the file size differs less than 100 bytes from
895 # the one in the hard drive.
896 self.report_file_already_downloaded(filename)
897 self.try_rename(tmpfilename, filename)
900 # The length does not match, we start the download over
901 self.report_unable_to_resume()
907 self.report_retry(count, retries)
910 self.trouble(u'ERROR: giving up after %s retries' % retries)
913 data_len = data.info().get('Content-length', None)
914 if data_len is not None:
915 data_len = long(data_len) + resume_len
916 data_len_str = self.format_bytes(data_len)
917 byte_counter = 0 + resume_len
923 data_block = data.read(block_size)
925 if len(data_block) == 0:
927 byte_counter += len(data_block)
929 # Open file just in time
932 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
933 filename = self.undo_temp_name(tmpfilename)
934 self.report_destination(filename)
935 except (OSError, IOError), err:
936 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
939 stream.write(data_block)
940 except (IOError, OSError), err:
941 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
943 block_size = self.best_block_size(after - before, len(data_block))
946 percent_str = self.calc_percent(byte_counter, data_len)
947 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
948 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
949 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
952 self.slow_down(start, byte_counter - resume_len)
956 if data_len is not None and byte_counter != data_len:
957 raise ContentTooShortError(byte_counter, long(data_len))
958 self.try_rename(tmpfilename, filename)
960 # Update file modification time
961 if self.params.get('updatetime', True):
962 self.try_utime(filename, data.info().get('last-modified', None))
966 class InfoExtractor(object):
967 """Information Extractor class.
969 Information extractors are the classes that, given a URL, extract
970 information from the video (or videos) the URL refers to. This
971 information includes the real video URL, the video title and simplified
972 title, author and others. The information is stored in a dictionary
973 which is then passed to the FileDownloader. The FileDownloader
974 processes this information possibly downloading the video to the file
975 system, among other possible outcomes. The dictionaries must include
976 the following fields:
978 id: Video identifier.
979 url: Final video URL.
980 uploader: Nickname of the video uploader.
981 title: Literal title.
982 stitle: Simplified title.
983 ext: Video filename extension.
984 format: Video format.
985 player_url: SWF Player URL (may be None).
987 The following fields are optional. Their primary purpose is to allow
988 youtube-dl to serve as the backend for a video search function, such
989 as the one in youtube2mp3. They are only used when their respective
990 forced printing functions are called:
992 thumbnail: Full URL to a video thumbnail image.
993 description: One-line video description.
995 Subclasses of this one should re-define the _real_initialize() and
996 _real_extract() methods, as well as the suitable() static method.
997 Probably, they should also be instantiated and added to the main
# NOTE(review): elided listing — the `self._ready = False` line is
# missing from this copy.
1004 def __init__(self, downloader=None):
1005 """Constructor. Receives an optional downloader."""
1007 self.set_downloader(downloader)
# NOTE(review): elided listing — the `def suitable(url):` line above
# this docstring, the _ready guard in initialize() and the
# `self._ready = True` line are missing from this copy.
1011 """Receives a URL and returns True if suitable for this IE."""
1014 def initialize(self):
1015 """Initializes an instance (authentication, etc)."""
1017 self._real_initialize()
# NOTE(review): elided listing — the `self.initialize()` call before
# the extraction is missing from this copy.
1020 def extract(self, url):
1021 """Extracts URL information and returns it in list of dicts."""
1023 return self._real_extract(url)
def set_downloader(self, downloader):
	"""Remember *downloader* as the FileDownloader this extractor
	reports progress and results to (may be None)."""
	self._downloader = downloader
# Template-method hooks overridden by concrete extractors.
# NOTE(review): elided listing — the `pass` bodies are missing from
# this copy.
1029 def _real_initialize(self):
1030 """Real initialization process. Redefine in subclasses."""
1033 def _real_extract(self, url):
1034 """Real extraction process. Redefine in subclasses."""
1037 class YoutubeIE(InfoExtractor):
1038 """Information extractor for youtube.com."""
# Conditional group (?(1)...) requires the trailing junk only when the
# scheme/host group matched, so bare 11-char IDs are also accepted.
1040 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1041 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1042 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1043 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1044 _NETRC_MACHINE = 'youtube'
1045 # Listed in order of quality
1046 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
# NOTE(review): elided listing — most _video_extensions entries, the
# closing brace and the `def suitable(url):` line are missing here.
1047 _video_extensions = {
1053 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1060 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
	"""Announce the attempt to switch the site language."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] Setting language')
def report_login(self):
	"""Announce the login attempt."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] Logging in')
def report_age_confirmation(self):
	"""Announce the age-confirmation attempt."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
	"""Announce that the watch page for *video_id* is being fetched."""
	msg = u'[youtube] %s: Downloading video webpage' % video_id
	self._downloader.to_screen(msg)
def report_video_info_webpage_download(self, video_id):
	"""Announce that the video-info page for *video_id* is being fetched."""
	msg = u'[youtube] %s: Downloading video info webpage' % video_id
	self._downloader.to_screen(msg)
def report_information_extraction(self, video_id):
	"""Announce that metadata extraction for *video_id* has started."""
	msg = u'[youtube] %s: Extracting video information' % video_id
	self._downloader.to_screen(msg)
def report_unavailable_format(self, video_id, format):
	"""Announce that *format* is not offered for *video_id*."""
	msg = u'[youtube] %s: Format %s not available' % (video_id, format)
	self._downloader.to_screen(msg)
def report_rtmp_download(self):
	"""Announce that the video will be fetched over RTMP."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] RTMP download detected')
# Pre-download setup: resolve credentials, force English pages, log in,
# confirm age. All network failures degrade to warnings except age
# confirmation, which goes through trouble().
# NOTE(review): elided listing — returns, try: lines, the netrc
# credential unpacking and parts of the form dicts are missing from
# this copy.
1094 def _real_initialize(self):
1095 if self._downloader is None:
1100 downloader_params = self._downloader.params
1102 # Attempt to use provided username and password or .netrc data
1103 if downloader_params.get('username', None) is not None:
1104 username = downloader_params['username']
1105 password = downloader_params['password']
1106 elif downloader_params.get('usenetrc', False):
1108 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1109 if info is not None:
1113 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1114 except (IOError, netrc.NetrcParseError), err:
1115 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Hit the language URL so subsequent pages come back in English.
1119 request = urllib2.Request(self._LANG_URL)
1122 urllib2.urlopen(request).read()
1123 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1124 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1127 # No authentication to be performed
1128 if username is None:
# POST the login form; failure is detected by the form re-appearing.
1133 'current_form': 'loginForm',
1135 'action_login': 'Log In',
1136 'username': username,
1137 'password': password,
1139 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1142 login_results = urllib2.urlopen(request).read()
1143 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1144 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1147 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Age confirmation form so age-restricted videos are reachable.
1153 'action_confirm': 'Confirm',
1155 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1157 self.report_age_confirmation()
1158 age_results = urllib2.urlopen(request).read()
1159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1160 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main YouTube extraction: download the watch page and get_video_info,
# pick the format(s) to fetch, and hand each one to process_info().
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `return`s, `else:` branches) are elided from view.
1163 def _real_extract(self, url):
1164 # Extract video id from URL
1165 mobj = re.match(self._VALID_URL, url)
1167 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1169 video_id = mobj.group(2)
1172 self.report_video_webpage_download(video_id)
1173 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1175 video_webpage = urllib2.urlopen(request).read()
1176 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1177 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The SWF URL appears JS-escaped ("http:\/\/...") in the page, so the
# backslash-escapes are stripped after matching.
1180 # Attempt to extract SWF player URL
1181 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1182 if mobj is not None:
1183 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' variants of get_video_info until one returns a 'token'.
1188 self.report_video_info_webpage_download(video_id)
1189 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1190 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1191 % (video_id, el_type))
1192 request = urllib2.Request(video_info_url)
1194 video_info_webpage = urllib2.urlopen(request).read()
1195 video_info = parse_qs(video_info_webpage)
1196 if 'token' in video_info:
1198 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1199 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# No token after all attempts: surface YouTube's own 'reason' if present.
1201 if 'token' not in video_info:
1202 if 'reason' in video_info:
1203 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1205 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1208 # Start extracting information
1209 self.report_information_extraction(video_id)
1212 if 'author' not in video_info:
1213 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1215 video_uploader = urllib.unquote_plus(video_info['author'][0])
1218 if 'title' not in video_info:
1219 self._downloader.trouble(u'ERROR: unable to extract video title')
1221 video_title = urllib.unquote_plus(video_info['title'][0])
1222 video_title = video_title.decode('utf-8')
1223 video_title = sanitize_title(video_title)
# simple_title: collapse anything outside simple_title_chars to '_'.
1226 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1227 simple_title = simple_title.strip(ur'_')
1230 if 'thumbnail_url' not in video_info:
1231 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1232 video_thumbnail = ''
1233 else: # don't panic if we can't find it
1234 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalized to YYYYMMDD by
# trying several textual date formats.
1238 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1239 if mobj is not None:
1240 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1241 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1242 for expression in format_expressions:
1244 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# Description: only fetched when the user asked for it; falls back from the
# <meta> tag to an lxml XPath over the page body.
1252 video_description = u'No description available.'
1253 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1254 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1255 if mobj is not None:
1256 video_description = mobj.group(1).decode('utf-8')
1258 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1259 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1260 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1261 # TODO use another parser
1264 video_token = urllib.unquote_plus(video_info['token'][0])
# Format selection: RTMP 'conn' wins outright; otherwise build an
# itag -> URL map from url_encoded_fmt_stream_map and apply -f / --max-quality.
1266 # Decide which formats to download
1267 req_format = self._downloader.params.get('format', None)
1269 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1270 self.report_rtmp_download()
1271 video_url_list = [(None, video_info['conn'][0])]
1272 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1273 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1274 url_data = [parse_qs(uds) for uds in url_data_strs]
1275 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1276 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1278 format_limit = self._downloader.params.get('format_limit', None)
1279 if format_limit is not None and format_limit in self._available_formats:
1280 format_list = self._available_formats[self._available_formats.index(format_limit):]
1282 format_list = self._available_formats
1283 existing_formats = [x for x in format_list if x in url_map]
1284 if len(existing_formats) == 0:
1285 self._downloader.trouble(u'ERROR: no known formats available for video')
1287 if req_format is None:
1288 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1289 elif req_format == '-1':
1290 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1293 if req_format not in url_map:
1294 self._downloader.trouble(u'ERROR: requested format not available')
1296 video_url_list = [(req_format, url_map[req_format])] # Specific format
1298 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# One process_info() call per selected format; each counts as a download.
1301 for format_param, video_real_url in video_url_list:
1302 # At this point we have a new video
1303 self._downloader.increment_downloads()
1306 video_extension = self._video_extensions.get(format_param, 'flv')
1309 # Process video information
1310 self._downloader.process_info({
1311 'id': video_id.decode('utf-8'),
1312 'url': video_real_url.decode('utf-8'),
1313 'uploader': video_uploader.decode('utf-8'),
1314 'upload_date': upload_date,
1315 'title': video_title,
1316 'stitle': simple_title,
1317 'ext': video_extension.decode('utf-8'),
# RTMP entries carry format_param=None, hence the and/or fallback to u'NA'.
1318 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1319 'thumbnail': video_thumbnail.decode('utf-8'),
1320 'description': video_description,
1321 'player_url': player_url,
1323 except UnavailableVideoError, err:
1324 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1327 class MetacafeIE(InfoExtractor):
1328 """Information Extractor for metacafe.com."""
# URL groups: (1) video id, (2) simplified title slug.
1330 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1331 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1332 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# A YoutubeIE instance is injected so 'yt-' prefixed ids can be delegated.
1335 def __init__(self, youtube_ie, downloader=None):
1336 InfoExtractor.__init__(self, downloader)
1337 self._youtube_ie = youtube_ie
1341 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1343 def report_disclaimer(self):
1344 """Report disclaimer retrieval."""
1345 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1347 def report_age_confirmation(self):
1348 """Report attempt to confirm age."""
1349 self._downloader.to_screen(u'[metacafe] Confirming age')
1351 def report_download_webpage(self, video_id):
1352 """Report webpage download."""
1353 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1355 def report_extraction(self, video_id):
1356 """Report information extraction."""
1357 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Session setup: fetch the family-filter disclaimer page, then POST the
# over-18 confirmation so later video pages are not filtered.
1359 def _real_initialize(self):
1360 # Retrieve disclaimer
1361 request = urllib2.Request(self._DISCLAIMER)
1363 self.report_disclaimer()
1364 disclaimer = urllib2.urlopen(request).read()
1365 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1366 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1372 'submit': "Continue - I'm over 18",
1374 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1376 self.report_age_confirmation()
1377 disclaimer = urllib2.urlopen(request).read()
1378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1379 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1382 def _real_extract(self, url):
1383 # Extract id and simplified title from URL
1384 mobj = re.match(self._VALID_URL, url)
1386 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1389 video_id = mobj.group(1)
# 'yt-XXXX' ids are YouTube embeds: hand them to the injected YoutubeIE.
1391 # Check if video comes from YouTube
1392 mobj2 = re.match(r'^yt-(.*)$', video_id)
1393 if mobj2 is not None:
1394 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1397 # At this point we have a new video
1398 self._downloader.increment_downloads()
1400 simple_title = mobj.group(2).decode('utf-8')
1402 # Retrieve video webpage to extract further information
1403 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1405 self.report_download_webpage(video_id)
1406 webpage = urllib2.urlopen(request).read()
1407 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1408 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
# Media URL: first try the plain &mediaURL= parameter (optionally signed
# with gdaKey); otherwise fall back to the flashvars 'mediaData' JSON blob.
1411 # Extract URL, uploader and title from webpage
1412 self.report_extraction(video_id)
1413 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1414 if mobj is not None:
1415 mediaURL = urllib.unquote(mobj.group(1))
1416 video_extension = mediaURL[-3:]
1418 # Extract gdaKey if available
1419 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1421 video_url = mediaURL
1423 gdaKey = mobj.group(1)
1424 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1426 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1428 self._downloader.trouble(u'ERROR: unable to extract media URL')
1430 vardict = parse_qs(mobj.group(1))
1431 if 'mediaData' not in vardict:
1432 self._downloader.trouble(u'ERROR: unable to extract media URL')
1434 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1436 self._downloader.trouble(u'ERROR: unable to extract media URL')
1438 mediaURL = mobj.group(1).replace('\\/', '/')
1439 video_extension = mediaURL[-3:]
1440 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1442 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1444 self._downloader.trouble(u'ERROR: unable to extract title')
1446 video_title = mobj.group(1).decode('utf-8')
1447 video_title = sanitize_title(video_title)
1449 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1451 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1453 video_uploader = mobj.group(1)
1456 # Process video information
1457 self._downloader.process_info({
1458 'id': video_id.decode('utf-8'),
1459 'url': video_url.decode('utf-8'),
1460 'uploader': video_uploader.decode('utf-8'),
1461 'upload_date': u'NA',
1462 'title': video_title,
1463 'stitle': simple_title,
1464 'ext': video_extension.decode('utf-8'),
1468 except UnavailableVideoError:
1469 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1472 class DailymotionIE(InfoExtractor):
1473 """Information Extractor for Dailymotion"""
# URL groups: (1) video id (before '_'), (2) simplified title slug.
1475 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1477 def __init__(self, downloader=None):
1478 InfoExtractor.__init__(self, downloader)
1482 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1484 def report_download_webpage(self, video_id):
1485 """Report webpage download."""
1486 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1488 def report_extraction(self, video_id):
1489 """Report information extraction."""
1490 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No session setup required for Dailymotion (body elided here).
1492 def _real_initialize(self):
1495 def _real_extract(self, url):
1496 # Extract id and simplified title from URL
1497 mobj = re.match(self._VALID_URL, url)
1499 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1502 # At this point we have a new video
1503 self._downloader.increment_downloads()
1504 video_id = mobj.group(1)
1506 simple_title = mobj.group(2).decode('utf-8')
1507 video_extension = 'flv'
1509 # Retrieve video webpage to extract further information
1510 request = urllib2.Request(url)
1512 self.report_download_webpage(video_id)
1513 webpage = urllib2.urlopen(request).read()
1514 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1515 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
# The media URL is pulled out of the page's addVariable("video", ...) call.
1518 # Extract URL, uploader and title from webpage
1519 self.report_extraction(video_id)
1520 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1522 self._downloader.trouble(u'ERROR: unable to extract media URL')
1524 mediaURL = urllib.unquote(mobj.group(1))
1526 # if needed add http://www.dailymotion.com/ if relative URL
1528 video_url = mediaURL
1530 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1531 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1533 self._downloader.trouble(u'ERROR: unable to extract title')
1535 video_title = mobj.group(1).decode('utf-8')
1536 video_title = sanitize_title(video_title)
1538 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1540 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1542 video_uploader = mobj.group(1)
1545 # Process video information
1546 self._downloader.process_info({
1547 'id': video_id.decode('utf-8'),
1548 'url': video_url.decode('utf-8'),
1549 'uploader': video_uploader.decode('utf-8'),
1550 'upload_date': u'NA',
1551 'title': video_title,
1552 'stitle': simple_title,
1553 'ext': video_extension.decode('utf-8'),
1557 except UnavailableVideoError:
1558 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1560 class GoogleIE(InfoExtractor):
1561 """Information extractor for video.google.com."""
# Group (1) is the (possibly negative) numeric docid.
1563 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1565 def __init__(self, downloader=None):
1566 InfoExtractor.__init__(self, downloader)
1570 return (re.match(GoogleIE._VALID_URL, url) is not None)
1572 def report_download_webpage(self, video_id):
1573 """Report webpage download."""
1574 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1576 def report_extraction(self, video_id):
1577 """Report information extraction."""
1578 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1580 def _real_initialize(self):
1583 def _real_extract(self, url):
1584 # Extract id from URL
1585 mobj = re.match(self._VALID_URL, url)
1587 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1590 # At this point we have a new video
1591 self._downloader.increment_downloads()
1592 video_id = mobj.group(1)
1594 video_extension = 'mp4'
1596 # Retrieve video webpage to extract further information
1597 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1599 self.report_download_webpage(video_id)
1600 webpage = urllib2.urlopen(request).read()
1601 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1602 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Prefer the mp4 download_url; when absent, fall back to the flv videoUrl
# which is hex-escaped (\x3d, \x26) and needs unescaping.
1605 # Extract URL, uploader, and title from webpage
1606 self.report_extraction(video_id)
1607 mobj = re.search(r"download_url:'([^']+)'", webpage)
1609 video_extension = 'flv'
1610 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1612 self._downloader.trouble(u'ERROR: unable to extract media URL')
1614 mediaURL = urllib.unquote(mobj.group(1))
1615 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1616 mediaURL = mediaURL.replace('\\x26', '\x26')
1618 video_url = mediaURL
1620 mobj = re.search(r'<title>(.*)</title>', webpage)
1622 self._downloader.trouble(u'ERROR: unable to extract title')
1624 video_title = mobj.group(1).decode('utf-8')
1625 video_title = sanitize_title(video_title)
1626 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1628 # Extract video description
1629 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1631 self._downloader.trouble(u'ERROR: unable to extract video description')
1633 video_description = mobj.group(1).decode('utf-8')
1634 if not video_description:
1635 video_description = 'No description available.'
# Thumbnail requires a second request to the search page; abs(int(...))
# because the docid can be a negative integer string — TODO confirm.
1637 # Extract video thumbnail
1638 if self._downloader.params.get('forcethumbnail', False):
1639 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1641 webpage = urllib2.urlopen(request).read()
1642 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1643 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1645 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1647 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1649 video_thumbnail = mobj.group(1)
1650 else: # we need something to pass to process_info
1651 video_thumbnail = ''
1655 # Process video information
1656 self._downloader.process_info({
1657 'id': video_id.decode('utf-8'),
1658 'url': video_url.decode('utf-8'),
1660 'upload_date': u'NA',
1661 'title': video_title,
1662 'stitle': simple_title,
1663 'ext': video_extension.decode('utf-8'),
1667 except UnavailableVideoError:
1668 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1671 class PhotobucketIE(InfoExtractor):
1672 """Information extractor for photobucket.com."""
# Group (1) is the .flv filename from the ?current= query parameter.
1674 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1676 def __init__(self, downloader=None):
1677 InfoExtractor.__init__(self, downloader)
1681 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1683 def report_download_webpage(self, video_id):
1684 """Report webpage download."""
1685 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1687 def report_extraction(self, video_id):
1688 """Report information extraction."""
1689 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1691 def _real_initialize(self):
1694 def _real_extract(self, url):
1695 # Extract id from URL
1696 mobj = re.match(self._VALID_URL, url)
1698 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1701 # At this point we have a new video
1702 self._downloader.increment_downloads()
1703 video_id = mobj.group(1)
1705 video_extension = 'flv'
1707 # Retrieve video webpage to extract further information
1708 request = urllib2.Request(url)
1710 self.report_download_webpage(video_id)
1711 webpage = urllib2.urlopen(request).read()
1712 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1713 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Media URL comes from the page's video_src <link> element.
1716 # Extract URL, uploader, and title from webpage
1717 self.report_extraction(video_id)
1718 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1720 self._downloader.trouble(u'ERROR: unable to extract media URL')
1722 mediaURL = urllib.unquote(mobj.group(1))
1724 video_url = mediaURL
# One <title> regex yields both the title (group 1) and uploader (group 2).
1726 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1728 self._downloader.trouble(u'ERROR: unable to extract title')
1730 video_title = mobj.group(1).decode('utf-8')
1731 video_title = sanitize_title(video_title)
1732 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1734 video_uploader = mobj.group(2).decode('utf-8')
1737 # Process video information
1738 self._downloader.process_info({
1739 'id': video_id.decode('utf-8'),
1740 'url': video_url.decode('utf-8'),
1741 'uploader': video_uploader,
1742 'upload_date': u'NA',
1743 'title': video_title,
1744 'stitle': simple_title,
1745 'ext': video_extension.decode('utf-8'),
1749 except UnavailableVideoError:
1750 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1753 class YahooIE(InfoExtractor):
1754 """Information extractor for video.yahoo.com."""
1756 # _VALID_URL matches all Yahoo! Video URLs
1757 # _VPAGE_URL matches only the extractable '/watch/' URLs
1758 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1759 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1761 def __init__(self, downloader=None):
1762 InfoExtractor.__init__(self, downloader)
1766 return (re.match(YahooIE._VALID_URL, url) is not None)
1768 def report_download_webpage(self, video_id):
1769 """Report webpage download."""
1770 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1772 def report_extraction(self, video_id):
1773 """Report information extraction."""
1774 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1776 def _real_initialize(self):
# new_video=False marks the recursive second pass after URL rewriting,
# preventing a double increment_downloads() — TODO confirm, the guard
# using it is elided from this view.
1779 def _real_extract(self, url, new_video=True):
1780 # Extract ID from URL
1781 mobj = re.match(self._VALID_URL, url)
1783 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1786 # At this point we have a new video
1787 self._downloader.increment_downloads()
1788 video_id = mobj.group(2)
1789 video_extension = 'flv'
# Non-/watch/ URLs: scrape the real id/vid pair and recurse on the
# canonical English /watch/ URL.
1791 # Rewrite valid but non-extractable URLs as
1792 # extractable English language /watch/ URLs
1793 if re.match(self._VPAGE_URL, url) is None:
1794 request = urllib2.Request(url)
1796 webpage = urllib2.urlopen(request).read()
1797 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1798 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1801 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1803 self._downloader.trouble(u'ERROR: Unable to extract id field')
1805 yahoo_id = mobj.group(1)
1807 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1809 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1811 yahoo_vid = mobj.group(1)
1813 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1814 return self._real_extract(url, new_video=False)
1816 # Retrieve video webpage to extract further information
1817 request = urllib2.Request(url)
1819 self.report_download_webpage(video_id)
1820 webpage = urllib2.urlopen(request).read()
1821 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1822 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1825 # Extract uploader and title from webpage
1826 self.report_extraction(video_id)
1827 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1829 self._downloader.trouble(u'ERROR: unable to extract video title')
1831 video_title = mobj.group(1).decode('utf-8')
1832 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1834 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1836 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is 'people'/'profile' from the alternation,
# not the uploader name in group(2) — looks like an off-by-one; verify.
1838 video_uploader = mobj.group(1).decode('utf-8')
1840 # Extract video thumbnail
1841 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1843 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1845 video_thumbnail = mobj.group(1).decode('utf-8')
1847 # Extract video description
1848 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1850 self._downloader.trouble(u'ERROR: unable to extract video description')
1852 video_description = mobj.group(1).decode('utf-8')
1853 if not video_description: video_description = 'No description available.'
# Width/height feed the playlist request below; the server 401s without them.
1855 # Extract video height and width
1856 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1858 self._downloader.trouble(u'ERROR: unable to extract video height')
1860 yv_video_height = mobj.group(1)
1862 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1864 self._downloader.trouble(u'ERROR: unable to extract video width')
1866 yv_video_width = mobj.group(1)
1868 # Retrieve video playlist to extract media URL
1869 # I'm not completely sure what all these options are, but we
1870 # seem to need most of them, otherwise the server sends a 401.
1871 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1872 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1873 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1874 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1875 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1877 self.report_download_webpage(video_id)
1878 webpage = urllib2.urlopen(request).read()
1879 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1880 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1883 # Extract media URL from playlist XML
1884 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1886 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1888 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1889 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1892 # Process video information
1893 self._downloader.process_info({
1894 'id': video_id.decode('utf-8'),
1896 'uploader': video_uploader,
1897 'upload_date': u'NA',
1898 'title': video_title,
1899 'stitle': simple_title,
1900 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later (un-decoded) entries win. Likely a copy-paste slip —
# deduplicate when touching this code.
1901 'thumbnail': video_thumbnail.decode('utf-8'),
1902 'description': video_description,
1903 'thumbnail': video_thumbnail,
1904 'description': video_description,
1907 except UnavailableVideoError:
1908 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1911 class VimeoIE(InfoExtractor):
1912 """Information extractor for vimeo.com."""
1914 # _VALID_URL matches Vimeo URLs
# NOTE(review): the dot in '(?:www|player).' is unescaped and matches any
# character; presumably '\.'  was intended — verify before relying on it.
1915 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1917 def __init__(self, downloader=None):
1918 InfoExtractor.__init__(self, downloader)
1922 return (re.match(VimeoIE._VALID_URL, url) is not None)
1924 def report_download_webpage(self, video_id):
1925 """Report webpage download."""
1926 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1928 def report_extraction(self, video_id):
1929 """Report information extraction."""
1930 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1932 def _real_initialize(self):
1935 def _real_extract(self, url, new_video=True):
1936 # Extract ID from URL
1937 mobj = re.match(self._VALID_URL, url)
1939 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1942 # At this point we have a new video
1943 self._downloader.increment_downloads()
1944 video_id = mobj.group(1)
# Metadata comes from the moogaloop XML endpoint, not the HTML page;
# std_headers supplies the browser-like User-Agent defined at file top.
1946 # Retrieve video webpage to extract further information
1947 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1949 self.report_download_webpage(video_id)
1950 webpage = urllib2.urlopen(request).read()
1951 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1952 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1955 # Now we begin extracting as much information as we can from what we
1956 # retrieved. First we extract the information common to all extractors,
1957 # and latter we extract those that are Vimeo specific.
1958 self.report_extraction(video_id)
1961 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1963 self._downloader.trouble(u'ERROR: unable to extract video title')
1965 video_title = mobj.group(1).decode('utf-8')
1966 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1969 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1971 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1973 video_uploader = mobj.group(1).decode('utf-8')
1975 # Extract video thumbnail
1976 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1978 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1980 video_thumbnail = mobj.group(1).decode('utf-8')
# Description extraction is disabled (commented out) and replaced by a
# placeholder value.
1982 # # Extract video description
1983 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1985 # self._downloader.trouble(u'ERROR: unable to extract video description')
1987 # video_description = mobj.group(1).decode('utf-8')
1988 # if not video_description: video_description = 'No description available.'
1989 video_description = 'Foo.'
# The play URL must carry the request signature and its expiry scraped
# from the same XML document.
1991 # Vimeo specific: extract request signature
1992 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1994 self._downloader.trouble(u'ERROR: unable to extract request signature')
1996 sig = mobj.group(1).decode('utf-8')
1998 # Vimeo specific: Extract request signature expiration
1999 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2001 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2003 sig_exp = mobj.group(1).decode('utf-8')
2005 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2008 # Process video information
2009 self._downloader.process_info({
2010 'id': video_id.decode('utf-8'),
2012 'uploader': video_uploader,
2013 'upload_date': u'NA',
2014 'title': video_title,
2015 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later entries win. Likely a copy-paste slip — deduplicate.
2017 'thumbnail': video_thumbnail.decode('utf-8'),
2018 'description': video_description,
2019 'thumbnail': video_thumbnail,
2020 'description': video_description,
2023 except UnavailableVideoError:
2024 self._downloader.trouble(u'ERROR: unable to download video')
2027 class GenericIE(InfoExtractor):
2028 """Generic last-resort information extractor."""
# Standard extractor constructor: just delegates to the InfoExtractor base.
# NOTE(review): original-file lines 2032-2036 are elided from this view.
2030 def __init__(self, downloader=None):
2031 InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
	"""Warn that the generic fallback is in use, then announce the page fetch."""
	screen = self._downloader.to_screen
	screen(u'WARNING: Falling back on generic information extractor.')
	screen(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
	"""Announce that generic metadata extraction for *video_id* has begun."""
	self._downloader.to_screen(
		u'[generic] %s: Extracting information' % video_id)
2046 def _real_initialize(self):
2049 def _real_extract(self, url):
2050 # At this point we have a new video
2051 self._downloader.increment_downloads()
2053 video_id = url.split('/')[-1]
2054 request = urllib2.Request(url)
2056 self.report_download_webpage(video_id)
2057 webpage = urllib2.urlopen(request).read()
2058 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2059 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2061 except ValueError, err:
2062 # since this is the last-resort InfoExtractor, if
2063 # this error is thrown, it'll be thrown here
2064 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2067 self.report_extraction(video_id)
2068 # Start with something easy: JW Player in SWFObject
2069 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2071 # Broaden the search a little bit
2072 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2074 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2077 # It's possible that one of the regexes
2078 # matched, but returned an empty group:
2079 if mobj.group(1) is None:
2080 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2083 video_url = urllib.unquote(mobj.group(1))
2084 video_id = os.path.basename(video_url)
2086 # here's a fun little line of code for you:
2087 video_extension = os.path.splitext(video_id)[1][1:]
2088 video_id = os.path.splitext(video_id)[0]
2090 # it's tempting to parse this further, but you would
2091 # have to take into account all the variations like
2092 # Video Title - Site Name
2093 # Site Name | Video Title
2094 # Video Title - Tagline | Site Name
2095 # and so on and so forth; it's just not practical
2096 mobj = re.search(r'<title>(.*)</title>', webpage)
2098 self._downloader.trouble(u'ERROR: unable to extract title')
2100 video_title = mobj.group(1).decode('utf-8')
2101 video_title = sanitize_title(video_title)
2102 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2104 # video uploader is domain name
2105 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2107 self._downloader.trouble(u'ERROR: unable to extract title')
2109 video_uploader = mobj.group(1).decode('utf-8')
2112 # Process video information
2113 self._downloader.process_info({
2114 'id': video_id.decode('utf-8'),
2115 'url': video_url.decode('utf-8'),
2116 'uploader': video_uploader,
2117 'upload_date': u'NA',
2118 'title': video_title,
2119 'stitle': simple_title,
2120 'ext': video_extension.decode('utf-8'),
2124 except UnavailableVideoError, err:
2125 self._downloader.trouble(u'\nERROR: unable to download video')
2128 class YoutubeSearchIE(InfoExtractor):
2129 """Information Extractor for YouTube search queries."""
# Query syntax: 'ytsearch:<terms>' (first hit), 'ytsearchN:<terms>' (N hits),
# or 'ytsearchall:<terms>' (capped at _max_youtube_results).
2130 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2131 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2132 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2133 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2135 _max_youtube_results = 1000
# Delegates the per-video work to an already-constructed YoutubeIE.
2137 def __init__(self, youtube_ie, downloader=None):
2138 InfoExtractor.__init__(self, downloader)
2139 self._youtube_ie = youtube_ie
# (def suitable(url) header elided in this listing; this is its body.)
2143 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2145 def report_download_page(self, query, pagenum):
2146 """Report attempt to download playlist page with given number."""
2147 query = query.decode(preferredencoding())
2148 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2150 def _real_initialize(self):
2151 self._youtube_ie.initialize()
# Parse the 'ytsearch[N|all]:' prefix and dispatch to _download_n_results.
2153 def _real_extract(self, query):
2154 mobj = re.match(self._VALID_QUERY, query)
2156 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2159 prefix, query = query.split(':')
2161 query = query.encode('utf-8')
# Empty prefix ('ytsearch:'): download just the first result.
2163 self._download_n_results(query, 1)
2165 elif prefix == 'all':
2166 self._download_n_results(query, self._max_youtube_results)
# (else-branch 'n = int(prefix)' and 'if n <= 0' lines elided in listing)
2172 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2174 elif n > self._max_youtube_results:
2175 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2176 n = self._max_youtube_results
2177 self._download_n_results(query, n)
2179 except ValueError: # parsing prefix as integer fails
2180 self._download_n_results(query, 1)
2183 def _download_n_results(self, query, n):
2184 """Downloads a specified number of results for a query"""
# Dedup across result pages; YouTube repeats ids between pages.
2187 already_seen = set()
# (video_ids/pagenum init and 'while True:' elided in this listing)
2191 self.report_download_page(query, pagenum)
2192 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2193 request = urllib2.Request(result_url)
2195 page = urllib2.urlopen(request).read()
2196 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2197 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2200 # Extract video identifiers
2201 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched 'href="/watch?v=..."' and carve out the id; [:-1] drops
# the trailing quote.
2202 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2203 if video_id not in already_seen:
2204 video_ids.append(video_id)
2205 already_seen.add(video_id)
2206 if len(video_ids) == n:
2207 # Specified n videos reached
# NOTE(review): 'id' shadows the builtin; left as-is to keep bytes identical.
2208 for id in video_ids:
2209 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: last results page -- extract what we collected and stop.
2212 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2213 for id in video_ids:
2214 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2217 pagenum = pagenum + 1
2219 class GoogleSearchIE(InfoExtractor):
2220 """Information Extractor for Google Video search queries."""
# Same shape as YoutubeSearchIE, with the 'gvsearch' prefix and Google Video
# URLs; delegates per-video extraction to a GoogleIE instance.
2221 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2222 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2223 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2224 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2226 _max_google_results = 1000
2228 def __init__(self, google_ie, downloader=None):
2229 InfoExtractor.__init__(self, downloader)
2230 self._google_ie = google_ie
# (def suitable(url) header elided in this listing; this is its body.)
2234 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2236 def report_download_page(self, query, pagenum):
2237 """Report attempt to download playlist page with given number."""
2238 query = query.decode(preferredencoding())
2239 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2241 def _real_initialize(self):
2242 self._google_ie.initialize()
# Parse the 'gvsearch[N|all]:' prefix and dispatch to _download_n_results.
2244 def _real_extract(self, query):
2245 mobj = re.match(self._VALID_QUERY, query)
2247 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2250 prefix, query = query.split(':')
2252 query = query.encode('utf-8')
2254 self._download_n_results(query, 1)
2256 elif prefix == 'all':
2257 self._download_n_results(query, self._max_google_results)
# (else-branch 'n = int(prefix)' and 'if n <= 0' lines elided in listing)
2263 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2265 elif n > self._max_google_results:
2266 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2267 n = self._max_google_results
2268 self._download_n_results(query, n)
2270 except ValueError: # parsing prefix as integer fails
2271 self._download_n_results(query, 1)
2274 def _download_n_results(self, query, n):
2275 """Downloads a specified number of results for a query"""
2278 already_seen = set()
# (video_ids/pagenum init and 'while True:' elided in this listing)
2282 self.report_download_page(query, pagenum)
2283 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2284 request = urllib2.Request(result_url)
2286 page = urllib2.urlopen(request).read()
2287 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2288 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2291 # Extract video identifiers
2292 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Here the id comes straight from the capturing group, unlike the
# span-slicing done in YoutubeSearchIE.
2293 video_id = mobj.group(1)
2294 if video_id not in already_seen:
2295 video_ids.append(video_id)
2296 already_seen.add(video_id)
2297 if len(video_ids) == n:
2298 # Specified n videos reached
2299 for id in video_ids:
2300 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker: last results page -- flush what we have and stop.
2303 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2304 for id in video_ids:
2305 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2308 pagenum = pagenum + 1
2310 class YahooSearchIE(InfoExtractor):
2311 """Information Extractor for Yahoo! Video search queries."""
# Third copy of the search-IE pattern ('yvsearch' prefix, Yahoo! Video URLs).
# NOTE(review): YoutubeSearchIE / GoogleSearchIE / YahooSearchIE are near
# clones; a shared base class would remove the triplication, but that is a
# larger refactor than this listing supports.
2312 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2313 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2314 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2315 _MORE_PAGES_INDICATOR = r'\s*Next'
2317 _max_yahoo_results = 1000
2319 def __init__(self, yahoo_ie, downloader=None):
2320 InfoExtractor.__init__(self, downloader)
2321 self._yahoo_ie = yahoo_ie
# (def suitable(url) header elided in this listing; this is its body.)
2325 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2327 def report_download_page(self, query, pagenum):
2328 """Report attempt to download playlist page with given number."""
2329 query = query.decode(preferredencoding())
2330 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2332 def _real_initialize(self):
2333 self._yahoo_ie.initialize()
# Parse the 'yvsearch[N|all]:' prefix and dispatch to _download_n_results.
2335 def _real_extract(self, query):
2336 mobj = re.match(self._VALID_QUERY, query)
2338 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2341 prefix, query = query.split(':')
2343 query = query.encode('utf-8')
2345 self._download_n_results(query, 1)
2347 elif prefix == 'all':
2348 self._download_n_results(query, self._max_yahoo_results)
# (else-branch 'n = int(prefix)' and 'if n <= 0' lines elided in listing)
2354 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2356 elif n > self._max_yahoo_results:
2357 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2358 n = self._max_yahoo_results
2359 self._download_n_results(query, n)
2361 except ValueError: # parsing prefix as integer fails
2362 self._download_n_results(query, 1)
2365 def _download_n_results(self, query, n):
2366 """Downloads a specified number of results for a query"""
2369 already_seen = set()
# (video_ids/pagenum init and 'while True:' elided in this listing)
2373 self.report_download_page(query, pagenum)
2374 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2375 request = urllib2.Request(result_url)
2377 page = urllib2.urlopen(request).read()
2378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2379 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2382 # Extract video identifiers
2383 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2384 video_id = mobj.group(1)
2385 if video_id not in already_seen:
2386 video_ids.append(video_id)
2387 already_seen.add(video_id)
2388 if len(video_ids) == n:
2389 # Specified n videos reached
2390 for id in video_ids:
2391 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker: last results page -- flush what we have and stop.
2394 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2395 for id in video_ids:
2396 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2399 pagenum = pagenum + 1
2401 class YoutubePlaylistIE(InfoExtractor):
2402 """Information Extractor for YouTube playlists."""
# Handles view_play_list/my_playlists/artist/p/user-channel URL variants;
# group(1) is the playlist-type prefix ('p' or 'a'), group(2) the playlist id,
# group(3) an optional single-video id embedded in the URL.
2404 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2405 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2406 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2407 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2410 def __init__(self, youtube_ie, downloader=None):
2411 InfoExtractor.__init__(self, downloader)
2412 self._youtube_ie = youtube_ie
# (def suitable(url) header elided in this listing; this is its body.)
2416 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2418 def report_download_page(self, playlist_id, pagenum):
2419 """Report attempt to download playlist page with given number."""
2420 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2422 def _real_initialize(self):
2423 self._youtube_ie.initialize()
2425 def _real_extract(self, url):
2426 # Extract playlist id
2427 mobj = re.match(self._VALID_URL, url)
2429 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL points at a single video inside the playlist: extract just that one.
2433 if mobj.group(3) is not None:
2434 self._youtube_ie.extract(mobj.group(3))
2437 # Download playlist pages
2438 # prefix is 'p' as default for playlists but there are other types that need extra care
2439 playlist_prefix = mobj.group(1)
2440 if playlist_prefix == 'a':
2441 playlist_access = 'artist'
# (else: line elided in listing)
2443 playlist_prefix = 'p'
2444 playlist_access = 'view_play_list'
2445 playlist_id = mobj.group(2)
# (video_ids/pagenum init and 'while True:' elided in this listing)
2450 self.report_download_page(playlist_id, pagenum)
2451 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2453 page = urllib2.urlopen(request).read()
2454 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2455 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2458 # Extract video identifiers
# ids_in_page preserves playlist order while deduplicating within a page.
2460 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2461 if mobj.group(1) not in ids_in_page:
2462 ids_in_page.append(mobj.group(1))
2463 video_ids.extend(ids_in_page)
2465 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2467 pagenum = pagenum + 1
# Honor --playlist-start / --playlist-end (1-based options -> 0-based slice).
2469 playliststart = self._downloader.params.get('playliststart', 1) - 1
2470 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend of -1 this slice drops the final
# video; YoutubeUserIE below special-cases -1. Looks like a real off-by-one --
# confirm against option semantics before changing.
2471 video_ids = video_ids[playliststart:playlistend]
2473 for id in video_ids:
2474 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2477 class YoutubeUserIE(InfoExtractor):
2478 """Information Extractor for YouTube users."""
# Accepts 'youtube.com/user/NAME' URLs or the 'ytuser:NAME' shorthand and
# enumerates the user's uploads via the GData API, page by page.
2480 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2481 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2482 _GDATA_PAGE_SIZE = 50
2483 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2484 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2487 def __init__(self, youtube_ie, downloader=None):
2488 InfoExtractor.__init__(self, downloader)
2489 self._youtube_ie = youtube_ie
# (def suitable(url) header elided in this listing; this is its body.)
2493 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2495 def report_download_page(self, username, start_index):
2496 """Report attempt to download user page."""
2497 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2498 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2500 def _real_initialize(self):
2501 self._youtube_ie.initialize()
2503 def _real_extract(self, url):
# Extract username from the URL/shorthand.
2505 mobj = re.match(self._VALID_URL, url)
2507 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2510 username = mobj.group(1)
2512 # Download video ids using YouTube Data API. Result size per
2513 # query is limited (currently to 50 videos) so we need to query
2514 # page by page until there are no video ids - it means we got
# (video_ids/pagenum init and 'while True:' elided; GData start-index is
# 1-based, hence the +1 below)
2521 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2522 self.report_download_page(username, start_index)
2524 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2527 page = urllib2.urlopen(request).read()
2528 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2529 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2532 # Extract video identifiers
2535 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2536 if mobj.group(1) not in ids_in_page:
2537 ids_in_page.append(mobj.group(1))
2539 video_ids.extend(ids_in_page)
2541 # A little optimization - if current page is not
2542 # "full", ie. does not contain PAGE_SIZE video ids then
2543 # we can assume that this page is the last one - there
2544 # are no more ids on further pages - no need to query
2547 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# (break / pagenum increment elided in this listing)
2552 all_ids_count = len(video_ids)
2553 playliststart = self._downloader.params.get('playliststart', 1) - 1
2554 playlistend = self._downloader.params.get('playlistend', -1)
# Unlike YoutubePlaylistIE, the default playlistend of -1 is special-cased
# here so the last video is not sliced off.
2556 if playlistend == -1:
2557 video_ids = video_ids[playliststart:]
# (else: line elided in listing)
2559 video_ids = video_ids[playliststart:playlistend]
2561 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2562 (username, all_ids_count, len(video_ids)))
2564 for video_id in video_ids:
2565 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2568 class DepositFilesIE(InfoExtractor):
2569 """Information extractor for depositfiles.com"""
# The (?#locale) inline comment documents that the optional '../' path
# segment is a two-letter locale prefix.
2571 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2573 def __init__(self, downloader=None):
2574 InfoExtractor.__init__(self, downloader)
# (def suitable(url) header elided in this listing; this is its body.)
2578 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2580 def report_download_webpage(self, file_id):
2581 """Report webpage download."""
2582 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2584 def report_extraction(self, file_id):
2585 """Report information extraction."""
2586 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# No initialization needed (body elided in listing).
2588 def _real_initialize(self):
2591 def _real_extract(self, url):
2592 # At this point we have a new file
2593 self._downloader.increment_downloads()
2595 file_id = url.split('/')[-1]
2596 # Rebuild url in english locale
2597 url = 'http://depositfiles.com/en/files/' + file_id
2599 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
2600 free_download_indication = { 'gateway_result' : '1' }
2601 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2603 self.report_download_webpage(file_id)
2604 webpage = urllib2.urlopen(request).read()
2605 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2606 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2609 # Search for the real file URL
2610 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2611 if (mobj is None) or (mobj.group(1) is None):
2612 # Try to figure out reason of the error.
2613 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2614 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' should be a raw string (r'\s+') by convention; it works
# in Python 2 because \s is not a recognized string escape -- confirm before
# normalizing.
2615 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2616 self._downloader.trouble(u'ERROR: %s' % restriction_message)
# (else: line elided in listing)
2618 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2621 file_url = mobj.group(1)
2622 file_extension = os.path.splitext(file_url)[1][1:]
2624 # Search for file title
2625 mobj = re.search(r'<b title="(.*?)">', webpage)
2627 self._downloader.trouble(u'ERROR: unable to extract title')
2629 file_title = mobj.group(1).decode('utf-8')
2632 # Process file information
2633 self._downloader.process_info({
2634 'id': file_id.decode('utf-8'),
2635 'url': file_url.decode('utf-8'),
2637 'upload_date': u'NA',
2638 'title': file_title,
2639 'stitle': file_title,
2640 'ext': file_extension.decode('utf-8'),
2644 except UnavailableVideoError, err:
2645 self._downloader.trouble(u'ERROR: unable to download file')
2647 class FacebookIE(InfoExtractor):
2648 """Information Extractor for Facebook"""
# Requires login (command-line credentials or ~/.netrc machine 'facebook');
# scrapes video metadata and per-quality URLs out of JS on the video page.
2650 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2651 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2652 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this ordering.
2653 _available_formats = ['highqual', 'lowqual']
# (dict contents elided in this listing)
2654 _video_extensions = {
2659 def __init__(self, downloader=None):
2660 InfoExtractor.__init__(self, downloader)
# (def suitable(url) header elided in this listing; this is its body.)
2664 return (re.match(FacebookIE._VALID_URL, url) is not None)
2666 def _reporter(self, message):
2667 """Add header and report message."""
2668 self._downloader.to_screen(u'[facebook] %s' % message)
2670 def report_login(self):
2671 """Report attempt to log in."""
2672 self._reporter(u'Logging in')
2674 def report_video_webpage_download(self, video_id):
2675 """Report attempt to download video webpage."""
2676 self._reporter(u'%s: Downloading video webpage' % video_id)
2678 def report_information_extraction(self, video_id):
2679 """Report attempt to extract video information."""
2680 self._reporter(u'%s: Extracting video information' % video_id)
2682 def _parse_page(self, video_webpage):
2683 """Extract video information from page"""
# Map of metadata field -> scraping regex; missing fields are simply left
# out of the resulting dict (callers must check with 'in').
2685 data = {'title': r'class="video_title datawrap">(.*?)</',
2686 'description': r'<div class="datawrap">(.*?)</div>',
2687 'owner': r'\("video_owner_name", "(.*?)"\)',
2688 'upload_date': r'data-date="(.*?)"',
2689 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2692 for piece in data.keys():
2693 mobj = re.search(data[piece], video_webpage)
2694 if mobj is not None:
2695 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# (video_urls init elided in this listing)
2699 for fmt in self._available_formats:
2700 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2701 if mobj is not None:
2702 # URL is in a Javascript segment inside an escaped Unicode format within
2703 # the generally utf-8 page
2704 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2705 video_info['video_urls'] = video_urls
# Log in to Facebook before any extraction; aborts politely when no
# credentials are available.
2709 def _real_initialize(self):
2710 if self._downloader is None:
# (return and useremail/password init elided in this listing)
2715 downloader_params = self._downloader.params
2717 # Attempt to use provided username and password or .netrc data
2718 if downloader_params.get('username', None) is not None:
2719 useremail = downloader_params['username']
2720 password = downloader_params['password']
2721 elif downloader_params.get('usenetrc', False):
# (try: elided in listing)
2723 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2724 if info is not None:
# (credential unpacking / else elided in listing)
2728 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2729 except (IOError, netrc.NetrcParseError), err:
2730 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2733 if useremail is None:
# (login form construction elided in this listing)
2742 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2745 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication failed.
# NOTE(review): 'exceded' in the message below is a typo ('exceeded'); it is a
# runtime string, so it is only flagged here, not changed.
2746 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2747 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2749 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2750 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2753 def _real_extract(self, url):
2754 mobj = re.match(self._VALID_URL, url)
2756 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2758 video_id = mobj.group('ID')
2761 self.report_video_webpage_download(video_id)
2762 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2764 page = urllib2.urlopen(request)
2765 video_webpage = page.read()
2766 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2767 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2770 # Start extracting information
2771 self.report_information_extraction(video_id)
2773 # Extract information
2774 video_info = self._parse_page(video_webpage)
# uploader (mandatory)
2777 if 'owner' not in video_info:
2778 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2780 video_uploader = video_info['owner']
# title (mandatory)
2783 if 'title' not in video_info:
2784 self._downloader.trouble(u'ERROR: unable to extract video title')
2786 video_title = video_info['title']
2787 video_title = video_title.decode('utf-8')
2788 video_title = sanitize_title(video_title)
2791 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2792 simple_title = simple_title.strip(ur'_')
# thumbnail (optional -- warn and fall back to empty string)
2795 if 'thumbnail' not in video_info:
2796 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2797 video_thumbnail = ''
# (else: line elided in listing)
2799 video_thumbnail = video_info['thumbnail']
# upload date (optional; parsed as an RFC-2822 date into YYYYMMDD)
2803 if 'upload_date' in video_info:
2804 upload_time = video_info['upload_date']
2805 timetuple = email.utils.parsedate_tz(upload_time)
2806 if timetuple is not None:
# (try: elided in listing)
2808 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2813 video_description = video_info.get('description', 'No description available.')
2815 url_map = video_info['video_urls']
2816 if len(url_map.keys()) > 0:
2817 # Decide which formats to download
2818 req_format = self._downloader.params.get('format', None)
2819 format_limit = self._downloader.params.get('format_limit', None)
# With a format limit, only consider formats at or below that quality.
2821 if format_limit is not None and format_limit in self._available_formats:
2822 format_list = self._available_formats[self._available_formats.index(format_limit):]
# (else: line elided in listing)
2824 format_list = self._available_formats
2825 existing_formats = [x for x in format_list if x in url_map]
2826 if len(existing_formats) == 0:
2827 self._downloader.trouble(u'ERROR: no known formats available for video')
2829 if req_format is None:
2830 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2831 elif req_format == '-1':
2832 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# (else: line elided in listing)
2835 if req_format not in url_map:
2836 self._downloader.trouble(u'ERROR: requested format not available')
2838 video_url_list = [(req_format, url_map[req_format])] # Specific format
2840 for format_param, video_real_url in video_url_list:
2842 # At this point we have a new video
2843 self._downloader.increment_downloads()
# Extension is derived per-format; mp4 is used when the format is unknown.
2846 video_extension = self._video_extensions.get(format_param, 'mp4')
2849 # Process video information
2850 self._downloader.process_info({
2851 'id': video_id.decode('utf-8'),
2852 'url': video_real_url.decode('utf-8'),
2853 'uploader': video_uploader.decode('utf-8'),
2854 'upload_date': upload_date,
2855 'title': video_title,
2856 'stitle': simple_title,
2857 'ext': video_extension.decode('utf-8'),
2858 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2859 'thumbnail': video_thumbnail.decode('utf-8'),
2860 'description': video_description.decode('utf-8'),
2863 except UnavailableVideoError, err:
2864 self._downloader.trouble(u'\nERROR: unable to download video')
2866 class BlipTVIE(InfoExtractor):
2867 """Information extractor for blip.tv"""
# Uses blip.tv's JSON API (skin=json) instead of scraping HTML.
2869 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2870 _URL_EXT = r'^.*\.([a-z0-9]+)$'
# (def suitable(url) header elided in this listing; this is its body.)
2874 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2876 def report_extraction(self, file_id):
2877 """Report information extraction."""
2878 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
# Collapse runs of non-simple characters into '_' and strip edge underscores.
2880 def _simplify_title(self, title):
2881 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2882 res = res.strip(ur'_')
# (return res elided in this listing)
2885 def _real_extract(self, url):
2886 mobj = re.match(self._VALID_URL, url)
2888 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# (cchar selection -- '&' if the URL already has a query string, else '?' --
# elided in this listing)
2895 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2896 request = urllib2.Request(json_url)
2897 self.report_extraction(mobj.group(1))
2899 json_code = urllib2.urlopen(request).read()
2900 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2901 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# (try: elided in listing; json is either stdlib json or the trivialjson
# fallback imported at the top of the file)
2904 json_data = json.loads(json_code)
2905 if 'Post' in json_data:
2906 data = json_data['Post']
# (else: data = json_data elided in listing)
# blip.tv timestamps look like '11-28-09 12:00AM'; normalized to YYYYMMDD.
2910 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2911 video_url = data['media']['url']
2912 umobj = re.match(self._URL_EXT, video_url)
2914 raise ValueError('Can not determine filename extension')
2915 ext = umobj.group(1)
2917 self._downloader.increment_downloads()
# (info = { elided in listing)
2920 'id': data['item_id'],
2922 'uploader': data['display_name'],
2923 'upload_date': upload_date,
2924 'title': data['title'],
2925 'stitle': self._simplify_title(data['title']),
2927 'format': data['media']['mimeType'],
2928 'thumbnail': data['thumbnailUrl'],
2929 'description': data['description'],
2930 'player_url': data['embedUrl']
2932 except (ValueError,KeyError), err:
2933 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2937 self._downloader.process_info(info)
2938 except UnavailableVideoError, err:
2939 self._downloader.trouble(u'\nERROR: unable to download video')
2942 class PostProcessor(object):
2943 """Post Processor class.
2945 PostProcessor objects can be added to downloaders with their
2946 add_post_processor() method. When the downloader has finished a
2947 successful download, it will take its internal chain of PostProcessors
2948 and start calling the run() method on each one of them, first with
2949 an initial argument and then with the returned value of the previous
2952 The chain will be stopped if one of them ever returns None or the end
2953 of the chain is reached.
2955 PostProcessor objects follow a "mutual registration" process similar
2956 to InfoExtractor objects.
# Constructor: optionally bind a downloader at creation time.
2961 def __init__(self, downloader=None):
2962 self._downloader = downloader
2964 def set_downloader(self, downloader):
2965 """Sets the downloader for this PP."""
2966 self._downloader = downloader
2968 def run(self, information):
2969 """Run the PostProcessor.
2971 The "information" argument is a dictionary like the ones
2972 composed by InfoExtractors. The only difference is that this
2973 one has an extra field called "filepath" that points to the
2976 When this method returns None, the postprocessing chain is
2977 stopped. However, this method may return an information
2978 dictionary that will be passed to the next postprocessing
2979 object in the chain. It can be the one it received after
2980 changing some fields.
2982 In addition, this method may raise a PostProcessingError
2983 exception that will be taken into account by the downloader
# Base-class default: pass the info dict through unchanged.
2986 return information # by default, do nothing
2988 class FFmpegExtractAudioPP(PostProcessor):
# Post-processor that extracts the audio track of a downloaded video into a
# standalone file using the external ffmpeg/ffprobe binaries.
2990 def __init__(self, downloader=None, preferredcodec=None):
2991 PostProcessor.__init__(self, downloader)
2992 if preferredcodec is None:
2993 preferredcodec = 'best'
2994 self._preferredcodec = preferredcodec
# Probe the file's audio codec with ffprobe; returns the codec name, or None
# when ffprobe fails or no audio stream is found.
# NOTE(review): no 'self' parameter -- presumably decorated @staticmethod on
# the elided preceding line; confirm.
2997 def get_audio_codec(path):
# (try: elided in listing)
2999 cmd = ['ffprobe', '-show_streams', '--', path]
3000 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3001 output = handle.communicate()[0]
3002 if handle.wait() != 0:
# (return None elided in listing)
3004 except (IOError, OSError):
# (return None and audio_codec init elided in listing)
# ffprobe prints key=value lines per stream; remember the last codec_name
# seen and report it once an audio stream is confirmed.
3007 for line in output.split('\n'):
3008 if line.startswith('codec_name='):
3009 audio_codec = line.split('=')[1].strip()
3010 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# (return audio_codec elided in listing)
# Run ffmpeg to transcode/copy audio; returns True on exit status 0.
# NOTE(review): likewise presumably a @staticmethod -- decorator line elided.
3015 def run_ffmpeg(path, out_path, codec, more_opts):
3017 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3018 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
# (return (ret == 0) elided in listing)
3020 except (IOError, OSError):
# (return False elided in listing)
3023 def run(self, information):
3024 path = information['filepath']
3026 filecodec = self.get_audio_codec(path)
3027 if filecodec is None:
3028 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# (return None elided in listing)
# Choose target codec/extension: copy losslessly when the source already
# matches the request (or 'best'), otherwise transcode at 128k.
3032 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3033 if filecodec == 'aac' or filecodec == 'mp3':
3034 # Lossless if possible
# (acodec = 'copy' elided in listing)
3036 extension = filecodec
3037 if filecodec == 'aac':
# Raw AAC needs the ADTS container to be playable standalone.
3038 more_opts = ['-f', 'adts']
# (else-branch header lines elided in listing)
3041 acodec = 'libmp3lame'
3043 more_opts = ['-ab', '128k']
# (else: elided in listing)
3045 # We convert the audio (lossy)
3046 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3047 extension = self._preferredcodec
3048 more_opts = ['-ab', '128k']
3049 if self._preferredcodec == 'aac':
3050 more_opts += ['-f', 'adts']
3052 (prefix, ext) = os.path.splitext(path)
3053 new_path = prefix + '.' + extension
3054 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3055 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
# (if not status: elided in listing)
3058 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# (return None / 'try: os.remove(path)' elided in listing)
3063 except (IOError, OSError):
3064 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the new audio file for the rest of the PP chain.
3067 information['filepath'] = new_path
# (return information elided in listing)
3071 def updateSelf(downloader, filename):
3072 ''' Update the program file with the latest version from the repository '''
3073 # Note: downloader only used for options
# Bail out early when the script file itself is not writable.
3074 if not os.access(filename, os.W_OK):
3075 sys.exit('ERROR: no write permissions on %s' % filename)
3077 downloader.to_screen('Updating to latest version...')
# (try: elided in this listing; UPDATE_URL is defined at the top of the file)
3081 urlh = urllib.urlopen(UPDATE_URL)
3082 newcontent = urlh.read()
# (finally: urlh.close() elided in listing)
3085 except (IOError, OSError), err:
3086 sys.exit('ERROR: unable to download latest version')
# Overwrite the running script in binary mode with the downloaded content.
3089 outf = open(filename, 'wb')
3091 outf.write(newcontent)
# (finally: outf.close() elided in listing)
3094 except (IOError, OSError), err:
3095 sys.exit('ERROR: unable to overwrite current version')
3097 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# optparse helper: render an option's short+long spellings plus metavar on
# one line, e.g. "-o, --output TEMPLATE". Reaches into optparse internals
# (_short_opts/_long_opts). NOTE(review): the `opts = []` initialisation
# (original lines 3106-3108) is elided from this view.
3104 def _format_option_string(option):
3105 ''' ('-o', '--option') -> -o, --format METAVAR'''
3109 if option._short_opts: opts.append(option._short_opts[0])
3110 if option._long_opts: opts.append(option._long_opts[0])
# Insert the separator only when both spellings are present.
3111 if len(opts) > 1: opts.insert(1, ', ')
3113 if option.takes_value(): opts.append(' %s' % option.metavar)
3115 return "".join(opts)
# Best-effort terminal-width detection: prefer the COLUMNS environment
# variable, fall back to running `stty size`. The intermediate return/try
# lines (original 3119-3122) are elided from this view.
3117 def _find_term_columns():
3118 columns = os.environ.get('COLUMNS', None)
3123 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3124 out,err = sp.communicate()
# `stty size` prints "rows cols"; field [1] is the column count.
3125 return int(out.split()[1])
# Build the OptionParser: widen the help formatter to the console width
# (so help text is not wrapped on wide terminals) and plug in the custom
# option-string renderer above.
3131 max_help_position = 80
3133 # No need to wrap help messages if we're on a wide console
3134 columns = _find_term_columns()
# NOTE(review): the default max_width assignment is elided from this view.
3135 if columns: max_width = columns
3137 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3138 fmt.format_option_strings = _format_option_string
# OptionParser keyword arguments. The dict's opening/closing lines (and
# presumably a formatter entry) are elided from this view.
3141 'version' : __version__,
3143 'usage' : '%prog [options] url...',
# 'resolve' lets later add_option calls override built-ins such as -h/-v.
3144 'conflict_handler' : 'resolve',
3147 parser = optparse.OptionParser(**kw)
# Option groups keep --help output organised by topic; they are populated
# below and registered with the parser at the end of this function.
3150 general = optparse.OptionGroup(parser, 'General Options')
3151 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3152 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3153 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3154 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3155 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# General options. conflict_handler='resolve' (set on the parser) allows
# re-registering -h/--help and -v/--version here.
3157 general.add_option('-h', '--help',
3158 action='help', help='print this help text and exit')
3159 general.add_option('-v', '--version',
3160 action='version', help='print program version and exit')
3161 general.add_option('-U', '--update',
3162 action='store_true', dest='update_self', help='update this program to latest version')
3163 general.add_option('-i', '--ignore-errors',
3164 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
# Rate limit and retries arrive as strings; both are validated and
# converted later in the main program.
3165 general.add_option('-r', '--rate-limit',
3166 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3167 general.add_option('-R', '--retries',
3168 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
# Playlist bounds: -1 for playlistend means "until the last entry".
3169 general.add_option('--playlist-start',
3170 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3171 general.add_option('--playlist-end',
3172 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3173 general.add_option('--dump-user-agent',
3174 action='store_true', dest='dump_user_agent',
3175 help='display the current browser identification', default=False)
# Site-login credentials; -n pulls them from ~/.netrc instead.
3177 authentication.add_option('-u', '--username',
3178 dest='username', metavar='USERNAME', help='account username')
3179 authentication.add_option('-p', '--password',
3180 dest='password', metavar='PASSWORD', help='account password')
3181 authentication.add_option('-n', '--netrc',
3182 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# Video format selection; the string '-1' is a sentinel meaning "all
# available formats" (it also changes the default output template below).
3185 video_format.add_option('-f', '--format',
3186 action='store', dest='format', metavar='FORMAT', help='video format code')
3187 video_format.add_option('--all-formats',
3188 action='store_const', dest='format', help='download all available video formats', const='-1')
3189 video_format.add_option('--max-quality',
3190 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# Verbosity / simulation flags. Each --get-* flag implies both quiet and
# simulate; that wiring happens when the FileDownloader options are built.
3193 verbosity.add_option('-q', '--quiet',
3194 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3195 verbosity.add_option('-s', '--simulate',
3196 action='store_true', dest='simulate', help='do not download video', default=False)
3197 verbosity.add_option('-g', '--get-url',
3198 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3199 verbosity.add_option('-e', '--get-title',
3200 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3201 verbosity.add_option('--get-thumbnail',
3202 action='store_true', dest='getthumbnail',
3203 help='simulate, quiet but print thumbnail URL', default=False)
3204 verbosity.add_option('--get-description',
3205 action='store_true', dest='getdescription',
3206 help='simulate, quiet but print video description', default=False)
3207 verbosity.add_option('--get-filename',
3208 action='store_true', dest='getfilename',
3209 help='simulate, quiet but print output filename', default=False)
3210 verbosity.add_option('--no-progress',
3211 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3212 verbosity.add_option('--console-title',
3213 action='store_true', dest='consoletitle',
3214 help='display progress in console titlebar', default=False)
# Filesystem options: naming (-t/-l/-A/-o are mutually constrained --
# validated later in the main program), batch input, overwrite/resume
# behaviour, cookies, .part files, mtime, and metadata side-files.
3217 filesystem.add_option('-t', '--title',
3218 action='store_true', dest='usetitle', help='use title in file name', default=False)
3219 filesystem.add_option('-l', '--literal',
3220 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3221 filesystem.add_option('-A', '--auto-number',
3222 action='store_true', dest='autonumber',
3223 help='number downloaded files starting from 00000', default=False)
3224 filesystem.add_option('-o', '--output',
3225 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3226 filesystem.add_option('-a', '--batch-file',
3227 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3228 filesystem.add_option('-w', '--no-overwrites',
3229 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3230 filesystem.add_option('-c', '--continue',
3231 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3232 filesystem.add_option('--cookies',
3233 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3234 filesystem.add_option('--no-part',
3235 action='store_true', dest='nopart', help='do not use .part files', default=False)
# Note: store_false -- passing --no-mtime turns the (default-on) mtime
# update off.
3236 filesystem.add_option('--no-mtime',
3237 action='store_false', dest='updatetime',
3238 help='do not use the Last-modified header to set the file modification time', default=True)
3239 filesystem.add_option('--write-description',
3240 action='store_true', dest='writedescription',
3241 help='write video description to a .description file', default=False)
3242 filesystem.add_option('--write-info-json',
3243 action='store_true', dest='writeinfojson',
3244 help='write video metadata to a .info.json file', default=False)
# Post-processing: audio extraction via the FFmpegExtractAudioPP hook.
# The audioformat value is validated ('best'/'aac'/'mp3') in main.
3247 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3248 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3249 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3250 help='"best", "aac" or "mp3"; best by default')
# Registration order here controls the order of groups in --help output.
3253 parser.add_option_group(general)
3254 parser.add_option_group(filesystem)
3255 parser.add_option_group(verbosity)
3256 parser.add_option_group(video_format)
3257 parser.add_option_group(authentication)
3258 parser.add_option_group(postproc)
3260 opts, args = parser.parse_args()
# The parser itself is returned too so callers can use parser.error().
3262 return parser, opts, args
# --- main program body (the enclosing def line is elided from this view) ---
3265 parser, opts, args = parseOpts()
3267 # Open appropriate CookieJar
# No --cookies file: use an in-memory jar that is discarded on exit.
3268 if opts.cookiefile is None:
3269 jar = cookielib.CookieJar()
# Otherwise use a persistent Mozilla-format jar, pre-loaded if the file
# already exists and is readable (the else:/try:/.load() lines are elided
# from this view).
3272 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3273 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3275 except (IOError, OSError), err:
3276 sys.exit(u'ERROR: unable to open cookie file')
3279 if opts.dump_user_agent:
# Python 2 print statement; std_headers is the module-level default header dict.
3280 print std_headers['User-Agent']
3283 # General configuration
3284 cookie_processor = urllib2.HTTPCookieProcessor(jar)
# Install a global opener so every urllib2 request in the program shares
# proxy handling, the cookie jar, and the custom YoutubeDLHandler.
3285 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3286 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3288 # Batch file verification
# Collect URLs from the batch file ('-' means stdin; the stdin/try lines
# are elided from this view). Lines starting with '#', '/', or ';' are
# treated as comments and dropped.
3290 if opts.batchfile is not None:
3292 if opts.batchfile == '-':
3295 batchfd = open(opts.batchfile, 'r')
3296 batchurls = batchfd.readlines()
3297 batchurls = [x.strip() for x in batchurls]
3298 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3300 sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs come first, then the positional command-line arguments.
3301 all_urls = batchurls + args
3303 # Conflicting, missing and erroneous options
# Cross-option consistency checks; parser.error() prints usage and exits.
3304 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3305 parser.error(u'using .netrc conflicts with giving username/password')
3306 if opts.password is not None and opts.username is None:
3307 parser.error(u'account username missing')
3308 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3309 parser.error(u'using output template conflicts with using title, literal title or auto number')
3310 if opts.usetitle and opts.useliteral:
3311 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively instead of erroring.
3312 if opts.username is not None and opts.password is None:
3313 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalise the human-readable rate limit (e.g. '50k') to a numeric value.
3314 if opts.ratelimit is not None:
3315 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3316 if numeric_limit is None:
3317 parser.error(u'invalid rate limit specified')
3318 opts.ratelimit = numeric_limit
# Python 2 long(); the surrounding try: line is elided from this view.
3319 if opts.retries is not None:
3321 opts.retries = long(opts.retries)
3322 except (TypeError, ValueError), err:
3323 parser.error(u'invalid retry count specified')
# Playlist bounds: start must be >= 1; end is -1 (open-ended) or a value
# not smaller than start. The try: lines are elided from this view.
3325 opts.playliststart = int(opts.playliststart)
3326 if opts.playliststart <= 0:
3327 raise ValueError(u'Playlist start must be positive')
3328 except (TypeError, ValueError), err:
3329 parser.error(u'invalid playlist start number specified')
3331 opts.playlistend = int(opts.playlistend)
3332 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3333 raise ValueError(u'Playlist end must be greater than playlist start')
3334 except (TypeError, ValueError), err:
3335 parser.error(u'invalid playlist end number specified')
# --audio-format only accepts the codecs FFmpegExtractAudioPP supports.
3336 if opts.extractaudio:
3337 if opts.audioformat not in ['best', 'aac', 'mp3']:
3338 parser.error(u'invalid audio format specified')
3340 # Information extractors
# Instantiate one extractor per supported site. The YouTube playlist/user/
# search extractors (and Metacafe) delegate individual videos to the shared
# youtube_ie instance; likewise the Google/Yahoo search extractors wrap
# their site extractors.
3341 youtube_ie = YoutubeIE()
3342 metacafe_ie = MetacafeIE(youtube_ie)
3343 dailymotion_ie = DailymotionIE()
3344 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3345 youtube_user_ie = YoutubeUserIE(youtube_ie)
3346 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3347 google_ie = GoogleIE()
3348 google_search_ie = GoogleSearchIE(google_ie)
3349 photobucket_ie = PhotobucketIE()
3350 yahoo_ie = YahooIE()
3351 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3352 deposit_files_ie = DepositFilesIE()
3353 facebook_ie = FacebookIE()
3354 bliptv_ie = BlipTVIE()
3355 vimeo_ie = VimeoIE()
# Last-resort extractor for URLs no specific extractor claims.
3356 generic_ie = GenericIE()
# Build the FileDownloader with all resolved options. NOTE(review): the
# dict's closing brace line is elided from this view.
3359 fd = FileDownloader({
3360 'usenetrc': opts.usenetrc,
3361 'username': opts.username,
3362 'password': opts.password,
# Any --get-* flag implies quiet mode...
3363 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3364 'forceurl': opts.geturl,
3365 'forcetitle': opts.gettitle,
3366 'forcethumbnail': opts.getthumbnail,
3367 'forcedescription': opts.getdescription,
3368 'forcefilename': opts.getfilename,
# ...and simulation (no actual download).
3369 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3370 'format': opts.format,
3371 'format_limit': opts.format_limit,
# Output template: an explicit -o wins; otherwise the chained `or`s pick
# the first template matching the flag combination (--all-formats adds
# %(format)s, title/literal/autonumber add their fields), ending at the
# bare id default.
3372 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3373 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3374 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3375 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3376 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3377 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3378 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3379 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3380 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3381 or u'%(id)s.%(ext)s'),
3382 'ignoreerrors': opts.ignoreerrors,
3383 'ratelimit': opts.ratelimit,
3384 'nooverwrites': opts.nooverwrites,
3385 'retries': opts.retries,
3386 'continuedl': opts.continue_dl,
3387 'noprogress': opts.noprogress,
3388 'playliststart': opts.playliststart,
3389 'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') means logs must go to stderr.
3390 'logtostderr': opts.outtmpl == '-',
3391 'consoletitle': opts.consoletitle,
3392 'nopart': opts.nopart,
3393 'updatetime': opts.updatetime,
3394 'writedescription': opts.writedescription,
3395 'writeinfojson': opts.writeinfojson,
# Register extractors; order matters because the first extractor whose URL
# pattern matches wins (e.g. the search/playlist/user extractors must be
# tried before the plain YouTube one).
3397 fd.add_info_extractor(youtube_search_ie)
3398 fd.add_info_extractor(youtube_pl_ie)
3399 fd.add_info_extractor(youtube_user_ie)
3400 fd.add_info_extractor(metacafe_ie)
3401 fd.add_info_extractor(dailymotion_ie)
3402 fd.add_info_extractor(youtube_ie)
3403 fd.add_info_extractor(google_ie)
3404 fd.add_info_extractor(google_search_ie)
3405 fd.add_info_extractor(photobucket_ie)
3406 fd.add_info_extractor(yahoo_ie)
3407 fd.add_info_extractor(yahoo_search_ie)
3408 fd.add_info_extractor(deposit_files_ie)
3409 fd.add_info_extractor(facebook_ie)
3410 fd.add_info_extractor(bliptv_ie)
3411 fd.add_info_extractor(vimeo_ie)
3413 # This must come last since it's the
3414 # fallback if none of the others work
3415 fd.add_info_extractor(generic_ie)
# Attach the optional audio-extraction post-processor.
3418 if opts.extractaudio:
3419 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update replaces the running script (sys.argv[0]) in place.
3422 if opts.update_self:
3423 updateSelf(fd, sys.argv[0])
# No URLs is only an error when we weren't asked merely to self-update
# (the branch taken when update_self is set is elided from this view).
3426 if len(all_urls) < 1:
3427 if not opts.update_self:
3428 parser.error(u'you must provide at least one URL')
# Download everything; retcode presumably becomes the process exit status
# (the surrounding lines are elided) -- TODO confirm.
3431 retcode = fd.download(all_urls)
3433 # Dump cookie jar if requested
# Persist cookies back to --cookies FILE; the try:/jar.save() lines are
# elided from this view.
3434 if opts.cookiefile is not None:
3437 except (IOError, OSError), err:
3438 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: translate the expected top-level exceptions into
# clean exit messages (the try:/main-call lines and the DownloadError
# handler body are elided from this view).
3443 if __name__ == '__main__':
3446 except DownloadError:
3448 except SameFileError:
3449 sys.exit(u'ERROR: fixed output name but more than one file to download')
3450 except KeyboardInterrupt:
3451 sys.exit(u'\nERROR: Interrupted by user')
3453 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: