youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # Author: Vasyl' Vavrychuk
   7 # Author: Witold Baryluk
   8 # Author: Paweł Paprota
   9 # Author: Gergely Imreh
  10 # Author: Philipp Hagemeister <phihag@phihag.de>
  11 # License: Public domain code
  12 from __future__ import with_statement
  13 import contextlib
  14 import cookielib
  15 import ctypes
  16 import datetime
  17 import email.utils
  18 import gzip
  19 import htmlentitydefs
  20 import httplib
  21 import locale
  22 import math
  23 import netrc
  24 import os
  25 import os.path
  26 import re
  27 import socket
  28 import string
  29 import subprocess
  30 import sys
  31 import time
  32 import urllib
  33 import urllib2
  34 import warnings
  35 import zlib
  36
  37 try:
  38         import cStringIO as StringIO
  39 except ImportError:
  40         import StringIO
  41
  42 # parse_qs was moved from the cgi module to the urlparse module recently.
  43 try:
  44         from urlparse import parse_qs
  45 except ImportError:
  46         from cgi import parse_qs
  47
  48 try:
  49         import lxml.etree
  50 except ImportError: # Python < 2.6
  51         pass # Handled below
  52
# Default HTTP headers. YoutubeDLHandler.http_request() adds every entry of
# this dictionary to each request it processes.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string made of the ASCII letters followed by the ASCII digits.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  62
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                """Fallback used when the standard json module cannot be imported; offers loads() only."""
                @staticmethod
                def loads(s):
                        """Parse the UTF-8 encoded JSON text s and return the decoded Python value.

                        The parser is a set of nested helpers. Every parse* helper takes the
                        index at which its value starts and returns a tuple
                        (index just past the value, decoded value). Errors detected by the
                        helpers are reported as ValueError.
                        """
                        s = s.decode('UTF-8')
                        # Raise a ValueError that shows the position and the remaining input.
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        # Return the index of the first non-whitespace character at or after i.
                        # With expectMore set, running off the end of the input is an error.
                        def skipSpace(i, expectMore=True):
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        # re.sub() callback: translate the text following a backslash
                        # (captured as group 1) into the character it stands for.
                        def decodeEscape(match):
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        # Single \uXXXX escape
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # Two consecutive \uXXXX escapes forming a surrogate pair:
                                        # combine the high and low halves into one code point.
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        # Parse the string whose opening quote is at index i.
                        def parseString(i):
                                i += 1
                                e = i
                                # Look for the closing quote: a quote preceded by an odd number
                                # of backslashes is escaped, so the search continues past it.
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # The surrogate-pair alternative is listed first so that it is
                                # tried before the single \uXXXX alternative.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        # Parse the object whose opening curly brace is at index i into a dict.
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        # Parse the array whose opening bracket is at index i into a list.
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        # Parse one of the literals true, false or null starting at index i.
                        def parseDiscrete(i):
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        # Parse a number starting at index i; the result is a float when the
                        # text contains a '.', 'e' or 'E', and an int otherwise.
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; any character not
                        # listed here is handed to parseNumber.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        # Parse the value starting at or after index i, then skip the
                        # whitespace that follows it.
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        # Anything left over after the top-level value is an error.
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
 175
 176 def preferredencoding():
 177         """Get preferred encoding.
 178
 179         Returns the best encoding scheme for the system, based on
 180         locale.getpreferredencoding() and some further tweaks.
 181         """
 182         def yield_preferredencoding():
 183                 try:
 184                         pref = locale.getpreferredencoding()
 185                         u'TEST'.encode(pref)
 186                 except:
 187                         pref = 'UTF-8'
 188                 while True:
 189                         yield pref
 190         return yield_preferredencoding().next()
 191
 192 def htmlentity_transform(matchobj):
 193         """Transforms an HTML entity to a Unicode character.
 194
 195         This function receives a match object and is intended to be used with
 196         the re.sub() function.
 197         """
 198         entity = matchobj.group(1)
 199
 200         # Known non-numeric HTML entity
 201         if entity in htmlentitydefs.name2codepoint:
 202                 return unichr(htmlentitydefs.name2codepoint[entity])
 203
 204         # Unicode character
 205         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 206         if mobj is not None:
 207                 numstr = mobj.group(1)
 208                 if numstr.startswith(u'x'):
 209                         base = 16
 210                         numstr = u'0%s' % numstr
 211                 else:
 212                         base = 10
 213                 return unichr(long(numstr, base))
 214
 215         # Unknown entity in name, return its literal representation
 216         return (u'&%s;' % entity)
 217
 218 def sanitize_title(utitle):
 219         """Sanitizes a video title so it could be used as part of a filename."""
 220         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 221         return utitle.replace(unicode(os.sep), u'%')
 222
 223 def sanitize_open(filename, open_mode):
 224         """Try to open the given filename, and slightly tweak it if this fails.
 225
 226         Attempts to open the given filename. If this fails, it tries to change
 227         the filename slightly, step by step, until it's either able to open it
 228         or it fails and raises a final exception, like the standard open()
 229         function.
 230
 231         It returns the tuple (stream, definitive_file_name).
 232         """
 233         try:
 234                 if filename == u'-':
 235                         if sys.platform == 'win32':
 236                                 import msvcrt
 237                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 238                         return (sys.stdout, filename)
 239                 stream = open(filename, open_mode)
 240                 return (stream, filename)
 241         except (IOError, OSError), err:
 242                 # In case of error, try to remove win32 forbidden chars
 243                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 244
 245                 # An exception here should be caught in the caller
 246                 stream = open(filename, open_mode)
 247                 return (stream, filename)
 248
 249 def timeconvert(timestr):
 250     """Convert RFC 2822 defined time string into system timestamp"""
 251     timestamp = None
 252     timetuple = email.utils.parsedate_tz(timestr)
 253     if timetuple is not None:
 254         timestamp = email.utils.mktime_tz(timetuple)
 255     return timestamp
 256
 257 class DownloadError(Exception):
 258         """Download Error exception.
 259
 260         This exception may be thrown by FileDownloader objects if they are not
 261         configured to continue on errors. They will contain the appropriate
 262         error message.
 263         """
 264         pass
 265
 266 class SameFileError(Exception):
 267         """Same File exception.
 268
 269         This exception will be thrown by FileDownloader objects if they detect
 270         multiple files would have to be downloaded to the same file on disk.
 271         """
 272         pass
 273
 274 class PostProcessingError(Exception):
 275         """Post Processing exception.
 276
 277         This exception may be raised by PostProcessor's .run() method to
 278         indicate an error in the postprocessing task.
 279         """
 280         pass
 281
 282 class UnavailableVideoError(Exception):
 283         """Unavailable Format exception.
 284
 285         This exception will be thrown when a video is requested
 286         in a format that is not available for that video.
 287         """
 288         pass
 289
 290 class ContentTooShortError(Exception):
 291         """Content Too Short exception.
 292
 293         This exception may be raised by FileDownloader objects when a file they
 294         download is too small for what the server announced first, indicating
 295         the connection was probably interrupted.
 296         """
 297         # Both in bytes
 298         downloaded = None
 299         expected = None
 300
 301         def __init__(self, downloaded, expected):
 302                 self.downloaded = downloaded
 303                 self.expected = expected
 304
 305 class YoutubeDLHandler(urllib2.HTTPHandler):
 306         """Handler for HTTP requests and responses.
 307
 308         This class, when installed with an OpenerDirector, automatically adds
 309         the standard headers to every HTTP request and handles gzipped and
 310         deflated responses from web servers. If compression is to be avoided in
 311         a particular request, the original request in the program code only has
 312         to include the HTTP header "Youtubedl-No-Compression", which will be
 313         removed before making the real request.
 314
 315         Part of this code was copied from:
 316
 317           http://techknack.net/python-urllib2-handlers/
 318
 319         Andrew Rowls, the author of that code, agreed to release it to the
 320         public domain.
 321         """
 322
 323         @staticmethod
 324         def deflate(data):
 325                 try:
 326                         return zlib.decompress(data, -zlib.MAX_WBITS)
 327                 except zlib.error:
 328                         return zlib.decompress(data)
 329
 330         @staticmethod
 331         def addinfourl_wrapper(stream, headers, url, code):
 332                 if hasattr(urllib2.addinfourl, 'getcode'):
 333                         return urllib2.addinfourl(stream, headers, url, code)
 334                 ret = urllib2.addinfourl(stream, headers, url)
 335                 ret.code = code
 336                 return ret
 337
 338         def http_request(self, req):
 339                 for h in std_headers:
 340                         if h in req.headers:
 341                                 del req.headers[h]
 342                         req.add_header(h, std_headers[h])
 343                 if 'Youtubedl-no-compression' in req.headers:
 344                         if 'Accept-encoding' in req.headers:
 345                                 del req.headers['Accept-encoding']
 346                         del req.headers['Youtubedl-no-compression']
 347                 return req
 348
 349         def http_response(self, req, resp):
 350                 old_resp = resp
 351                 # gzip
 352                 if resp.headers.get('Content-encoding', '') == 'gzip':
 353                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 354                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 355                         resp.msg = old_resp.msg
 356                 # deflate
 357                 if resp.headers.get('Content-encoding', '') == 'deflate':
 358                         gz = StringIO.StringIO(self.deflate(resp.read()))
 359                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 360                         resp.msg = old_resp.msg
 361                 return resp
 362
 363 class FileDownloader(object):
 364         """File Downloader class.
 365
 366         File downloader objects are the ones responsible of downloading the
 367         actual video file and writing it to disk if the user has requested
 368         it, among some other tasks. In most cases there should be one per
 369         program. As, given a video URL, the downloader doesn't know how to
 370         extract all the needed information, task that InfoExtractors do, it
 371         has to pass the URL to one of them.
 372
 373         For this, file downloader objects have a method that allows
 374         InfoExtractors to be registered in a given order. When it is passed
 375         a URL, the file downloader handles it to the first InfoExtractor it
 376         finds that reports being able to handle it. The InfoExtractor extracts
 377         all the information about the video or videos the URL refers to, and
 378         asks the FileDownloader to process the video information, possibly
 379         downloading the video.
 380
 381         File downloaders accept a lot of parameters. In order not to saturate
 382         the object constructor with arguments, it receives a dictionary of
 383         options instead. These options are available through the params
 384         attribute for the InfoExtractors to use. The FileDownloader also
 385         registers itself as the downloader in charge for the InfoExtractors
 386         that are added to it, so this is a "mutual registration".
 387
 388         Available options:
 389
 390         username:         Username for authentication purposes.
 391         password:         Password for authentication purposes.
 392         usenetrc:         Use netrc for authentication instead.
 393         quiet:            Do not print messages to stdout.
 394         forceurl:         Force printing final URL.
 395         forcetitle:       Force printing title.
 396         forcethumbnail:   Force printing thumbnail URL.
 397         forcedescription: Force printing description.
 398         forcefilename:    Force printing final filename.
 399         simulate:         Do not download the video files.
 400         format:           Video format code.
 401         format_limit:     Highest quality format to try.
 402         outtmpl:          Template for output names.
 403         ignoreerrors:     Do not stop on download errors.
 404         ratelimit:        Download speed limit, in bytes/sec.
 405         nooverwrites:     Prevent overwriting files.
 406         retries:          Number of times to retry for HTTP error 5xx
 407         continuedl:       Try to continue downloads if possible.
 408         noprogress:       Do not print the progress bar.
 409         playliststart:    Playlist item to start at.
 410         playlistend:      Playlist item to end at.
 411         logtostderr:      Log messages to stderr instead of stdout.
 412         consoletitle:     Display progress in console window's titlebar.
 413         nopart:           Do not use temporary .part files.
 414         updatetime:       Use the Last-modified header to set output file timestamps.
 415         writedescription: Write the video description to a .description file
 416         writeinfojson:    Write the video description to a .info.json file
 417         """
 418
 419         params = None
 420         _ies = []
 421         _pps = []
 422         _download_retcode = None
 423         _num_downloads = None
 424         _screen_file = None
 425
 426         def __init__(self, params):
 427                 """Create a FileDownloader object with the given options."""
 428                 self._ies = []
 429                 self._pps = []
 430                 self._download_retcode = 0
 431                 self._num_downloads = 0
 432                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 433                 self.params = params
 434
 435         @staticmethod
 436         def pmkdir(filename):
 437                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 438                 components = filename.split(os.sep)
 439                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 440                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 441                 for dir in aggregate:
 442                         if not os.path.exists(dir):
 443                                 os.mkdir(dir)
 444
 445         @staticmethod
 446         def format_bytes(bytes):
 447                 if bytes is None:
 448                         return 'N/A'
 449                 if type(bytes) is str:
 450                         bytes = float(bytes)
 451                 if bytes == 0.0:
 452                         exponent = 0
 453                 else:
 454                         exponent = long(math.log(bytes, 1024.0))
 455                 suffix = 'bkMGTPEZY'[exponent]
 456                 converted = float(bytes) / float(1024**exponent)
 457                 return '%.2f%s' % (converted, suffix)
 458
 459         @staticmethod
 460         def calc_percent(byte_counter, data_len):
 461                 if data_len is None:
 462                         return '---.-%'
 463                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 464
 465         @staticmethod
 466         def calc_eta(start, now, total, current):
 467                 if total is None:
 468                         return '--:--'
 469                 dif = now - start
 470                 if current == 0 or dif < 0.001: # One millisecond
 471                         return '--:--'
 472                 rate = float(current) / dif
 473                 eta = long((float(total) - float(current)) / rate)
 474                 (eta_mins, eta_secs) = divmod(eta, 60)
 475                 if eta_mins > 99:
 476                         return '--:--'
 477                 return '%02d:%02d' % (eta_mins, eta_secs)
 478
 479         @staticmethod
 480         def calc_speed(start, now, bytes):
 481                 dif = now - start
 482                 if bytes == 0 or dif < 0.001: # One millisecond
 483                         return '%10s' % '---b/s'
 484                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 485
 486         @staticmethod
 487         def best_block_size(elapsed_time, bytes):
 488                 new_min = max(bytes / 2.0, 1.0)
 489                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 490                 if elapsed_time < 0.001:
 491                         return long(new_max)
 492                 rate = bytes / elapsed_time
 493                 if rate > new_max:
 494                         return long(new_max)
 495                 if rate < new_min:
 496                         return long(new_min)
 497                 return long(rate)
 498
 499         @staticmethod
 500         def parse_bytes(bytestr):
 501                 """Parse a string indicating a byte quantity into a long integer."""
 502                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 503                 if matchobj is None:
 504                         return None
 505                 number = float(matchobj.group(1))
 506                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 507                 return long(round(number * multiplier))
 508
 509         def add_info_extractor(self, ie):
 510                 """Add an InfoExtractor object to the end of the list."""
 511                 self._ies.append(ie)
 512                 ie.set_downloader(self)
 513
 514         def add_post_processor(self, pp):
 515                 """Add a PostProcessor object to the end of the chain."""
 516                 self._pps.append(pp)
 517                 pp.set_downloader(self)
 518
 519         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 520                 """Print message to stdout if not in quiet mode."""
 521                 try:
 522                         if not self.params.get('quiet', False):
 523                                 terminator = [u'\n', u''][skip_eol]
 524                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 525                         self._screen_file.flush()
 526                 except (UnicodeEncodeError), err:
 527                         if not ignore_encoding_errors:
 528                                 raise
 529
 530         def to_stderr(self, message):
 531                 """Print message to stderr."""
 532                 print >>sys.stderr, message.encode(preferredencoding())
 533
 534         def to_cons_title(self, message):
 535                 """Set console/terminal window title to message."""
 536                 if not self.params.get('consoletitle', False):
 537                         return
 538                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 539                         # c_wchar_p() might not be necessary if `message` is
 540                         # already of type unicode()
 541                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 542                 elif 'TERM' in os.environ:
 543                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 544
 545         def fixed_template(self):
 546                 """Checks if the output template is fixed."""
 547                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 548
 549         def trouble(self, message=None):
 550                 """Determine action to take when a download problem appears.
 551
 552                 Depending on if the downloader has been configured to ignore
 553                 download errors or not, this method may throw an exception or
 554                 not when errors are found, after printing the message.
 555                 """
 556                 if message is not None:
 557                         self.to_stderr(message)
 558                 if not self.params.get('ignoreerrors', False):
 559                         raise DownloadError(message)
 560                 self._download_retcode = 1
 561
 562         def slow_down(self, start_time, byte_counter):
 563                 """Sleep if the download speed is over the rate limit."""
 564                 rate_limit = self.params.get('ratelimit', None)
 565                 if rate_limit is None or byte_counter == 0:
 566                         return
 567                 now = time.time()
 568                 elapsed = now - start_time
 569                 if elapsed <= 0.0:
 570                         return
 571                 speed = float(byte_counter) / elapsed
 572                 if speed > rate_limit:
 573                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 574
 575         def temp_name(self, filename):
 576                 """Returns a temporary filename for the given filename."""
 577                 if self.params.get('nopart', False) or filename == u'-' or \
 578                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 579                         return filename
 580                 return filename + u'.part'
 581
 582         def undo_temp_name(self, filename):
 583                 if filename.endswith(u'.part'):
 584                         return filename[:-len(u'.part')]
 585                 return filename
 586
 587         def try_rename(self, old_filename, new_filename):
 588                 try:
 589                         if old_filename == new_filename:
 590                                 return
 591                         os.rename(old_filename, new_filename)
 592                 except (IOError, OSError), err:
 593                         self.trouble(u'ERROR: unable to rename file')
 594
 595         def try_utime(self, filename, last_modified_hdr):
 596                 """Try to set the last-modified time of the given file."""
 597                 if last_modified_hdr is None:
 598                         return
 599                 if not os.path.isfile(filename):
 600                         return
 601                 timestr = last_modified_hdr
 602                 if timestr is None:
 603                         return
 604                 filetime = timeconvert(timestr)
 605                 if filetime is None:
 606                         return
 607                 try:
 608                         os.utime(filename,(time.time(), filetime))
 609                 except:
 610                         pass
 611
 612         def report_writedescription(self, descfn):
 613                 """ Report that the description file is being written """
 614                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
 615
 616         def report_writeinfojson(self, infofn):
 617                 """ Report that the metadata file has been written """
 618                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
 619
 620         def report_destination(self, filename):
 621                 """Report destination filename."""
 622                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 623
 624         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 625                 """Report download progress."""
 626                 if self.params.get('noprogress', False):
 627                         return
 628                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 629                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 630                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 631                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 632
 633         def report_resuming_byte(self, resume_len):
 634                 """Report attempt to resume at given byte."""
 635                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 636
 637         def report_retry(self, count, retries):
 638                 """Report retry in case of HTTP error 5xx"""
 639                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 640
 641         def report_file_already_downloaded(self, file_name):
 642                 """Report file has already been fully downloaded."""
 643                 try:
 644                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 645                 except (UnicodeEncodeError), err:
 646                         self.to_screen(u'[download] The file has already been downloaded')
 647
 648         def report_unable_to_resume(self):
 649                 """Report it was impossible to resume download."""
 650                 self.to_screen(u'[download] Unable to resume')
 651
 652         def report_finish(self):
 653                 """Report download finished."""
 654                 if self.params.get('noprogress', False):
 655                         self.to_screen(u'[download] Download completed')
 656                 else:
 657                         self.to_screen(u'')
 658
 659         def increment_downloads(self):
 660                 """Increment the ordinal that assigns a number to each file."""
 661                 self._num_downloads += 1
 662
 663         def prepare_filename(self, info_dict):
 664                 """Generate the output filename."""
 665                 try:
 666                         template_dict = dict(info_dict)
 667                         template_dict['epoch'] = unicode(long(time.time()))
 668                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 669                         filename = self.params['outtmpl'] % template_dict
 670                         return filename
 671                 except (ValueError, KeyError), err:
 672                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 673                         return None
 674
 675         def process_info(self, info_dict):
 676                 """Process a single dictionary returned by an InfoExtractor."""
 677                 filename = self.prepare_filename(info_dict)
 678                 # Do nothing else if in simulate mode
 679                 if self.params.get('simulate', False):
 680                         # Forced printings
 681                         if self.params.get('forcetitle', False):
 682                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 683                         if self.params.get('forceurl', False):
 684                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 685                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 686                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 687                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 688                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 689                         if self.params.get('forcefilename', False) and filename is not None:
 690                                 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 691
 692                         return
 693
 694                 if filename is None:
 695                         return
 696                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 697                         self.to_stderr(u'WARNING: file exists and will be skipped')
 698                         return
 699
 700                 try:
 701                         self.pmkdir(filename)
 702                 except (OSError, IOError), err:
 703                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
 704                         return
 705
 706                 if self.params.get('writedescription', False):
 707                         try:
 708                                 descfn = filename + '.description'
 709                                 self.report_writedescription(descfn)
 710                                 with contextlib.closing(open(descfn, 'wb')) as descfile:
 711                                         descfile.write(info_dict['description'].encode('utf-8'))
 712                         except (OSError, IOError):
 713                                 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
 714                                 return
 715
 716                 if self.params.get('writeinfojson', False):
 717                         infofn = filename + '.info.json'
 718                         self.report_writeinfojson(infofn)
 719                         try:
 720                                 json.dump
 721                         except (NameError,AttributeError):
 722                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 723                                 return
 724                         try:
 725                                 with contextlib.closing(open(infofn, 'wb')) as infof:
 726                                         json.dump(info_dict, infof)
 727                         except (OSError, IOError):
 728                                 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
 729                                 return
 730
 731                 try:
 732                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 733                 except (OSError, IOError), err:
 734                         raise UnavailableVideoError
 735                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 736                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 737                         return
 738                 except (ContentTooShortError, ), err:
 739                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 740                         return
 741
 742                 if success:
 743                         try:
 744                                 self.post_process(filename, info_dict)
 745                         except (PostProcessingError), err:
 746                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
 747                                 return
 748
 749         def download(self, url_list):
 750                 """Download a given list of URLs."""
 751                 if len(url_list) > 1 and self.fixed_template():
 752                         raise SameFileError(self.params['outtmpl'])
 753
 754                 for url in url_list:
 755                         suitable_found = False
 756                         for ie in self._ies:
 757                                 # Go to next InfoExtractor if not suitable
 758                                 if not ie.suitable(url):
 759                                         continue
 760
 761                                 # Suitable InfoExtractor found
 762                                 suitable_found = True
 763
 764                                 # Extract information from URL and process it
 765                                 ie.extract(url)
 766
 767                                 # Suitable InfoExtractor had been found; go to next URL
 768                                 break
 769
 770                         if not suitable_found:
 771                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 772
 773                 return self._download_retcode
 774
 775         def post_process(self, filename, ie_info):
 776                 """Run the postprocessing chain on the given file."""
 777                 info = dict(ie_info)
 778                 info['filepath'] = filename
 779                 for pp in self._pps:
 780                         info = pp.run(info)
 781                         if info is None:
 782                                 break
 783
 784         def _download_with_rtmpdump(self, filename, url, player_url):
 785                 self.report_destination(filename)
 786                 tmpfilename = self.temp_name(filename)
 787
 788                 # Check for rtmpdump first
 789                 try:
 790                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 791                 except (OSError, IOError):
 792                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 793                         return False
 794
 795                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 796                 # the connection was interrumpted and resuming appears to be
 797                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 798                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 799                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 800                 while retval == 2 or retval == 1:
 801                         prevsize = os.path.getsize(tmpfilename)
 802                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 803                         time.sleep(5.0) # This seems to be needed
 804                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 805                         cursize = os.path.getsize(tmpfilename)
 806                         if prevsize == cursize and retval == 1:
 807                                 break
 808                 if retval == 0:
 809                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 810                         self.try_rename(tmpfilename, filename)
 811                         return True
 812                 else:
 813                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 814                         return False
 815
 816         def _do_download(self, filename, url, player_url):
 817                 # Check file already present
 818                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 819                         self.report_file_already_downloaded(filename)
 820                         return True
 821
 822                 # Attempt to download using rtmpdump
 823                 if url.startswith('rtmp'):
 824                         return self._download_with_rtmpdump(filename, url, player_url)
 825
 826                 tmpfilename = self.temp_name(filename)
 827                 stream = None
 828                 open_mode = 'wb'
 829
 830                 # Do not include the Accept-Encoding header
 831                 headers = {'Youtubedl-no-compression': 'True'}
 832                 basic_request = urllib2.Request(url, None, headers)
 833                 request = urllib2.Request(url, None, headers)
 834
 835                 # Establish possible resume length
 836                 if os.path.isfile(tmpfilename):
 837                         resume_len = os.path.getsize(tmpfilename)
 838                 else:
 839                         resume_len = 0
 840
 841                 # Request parameters in case of being able to resume
 842                 if self.params.get('continuedl', False) and resume_len != 0:
 843                         self.report_resuming_byte(resume_len)
 844                         request.add_header('Range','bytes=%d-' % resume_len)
 845                         open_mode = 'ab'
 846
 847                 count = 0
 848                 retries = self.params.get('retries', 0)
 849                 while count <= retries:
 850                         # Establish connection
 851                         try:
 852                                 data = urllib2.urlopen(request)
 853                                 break
 854                         except (urllib2.HTTPError, ), err:
 855                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 856                                         # Unexpected HTTP error
 857                                         raise
 858                                 elif err.code == 416:
 859                                         # Unable to resume (requested range not satisfiable)
 860                                         try:
 861                                                 # Open the connection again without the range header
 862                                                 data = urllib2.urlopen(basic_request)
 863                                                 content_length = data.info()['Content-Length']
 864                                         except (urllib2.HTTPError, ), err:
 865                                                 if err.code < 500 or err.code >= 600:
 866                                                         raise
 867                                         else:
 868                                                 # Examine the reported length
 869                                                 if (content_length is not None and
 870                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
 871                                                         # The file had already been fully downloaded.
 872                                                         # Explanation to the above condition: in issue #175 it was revealed that
 873                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 874                                                         # changing the file size slightly and causing problems for some users. So
 875                                                         # I decided to implement a suggested change and consider the file
 876                                                         # completely downloaded if the file size differs less than 100 bytes from
 877                                                         # the one in the hard drive.
 878                                                         self.report_file_already_downloaded(filename)
 879                                                         self.try_rename(tmpfilename, filename)
 880                                                         return True
 881                                                 else:
 882                                                         # The length does not match, we start the download over
 883                                                         self.report_unable_to_resume()
 884                                                         open_mode = 'wb'
 885                                                         break
 886                         # Retry
 887                         count += 1
 888                         if count <= retries:
 889                                 self.report_retry(count, retries)
 890
 891                 if count > retries:
 892                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 893                         return False
 894
 895                 data_len = data.info().get('Content-length', None)
 896                 if data_len is not None:
 897                         data_len = long(data_len) + resume_len
 898                 data_len_str = self.format_bytes(data_len)
 899                 byte_counter = 0 + resume_len
 900                 block_size = 1024
 901                 start = time.time()
 902                 while True:
 903                         # Download and write
 904                         before = time.time()
 905                         data_block = data.read(block_size)
 906                         after = time.time()
 907                         if len(data_block) == 0:
 908                                 break
 909                         byte_counter += len(data_block)
 910
 911                         # Open file just in time
 912                         if stream is None:
 913                                 try:
 914                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 915                                         filename = self.undo_temp_name(tmpfilename)
 916                                         self.report_destination(filename)
 917                                 except (OSError, IOError), err:
 918                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 919                                         return False
 920                         try:
 921                                 stream.write(data_block)
 922                         except (IOError, OSError), err:
 923                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 924                                 return False
 925                         block_size = self.best_block_size(after - before, len(data_block))
 926
 927                         # Progress message
 928                         percent_str = self.calc_percent(byte_counter, data_len)
 929                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 930                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 931                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 932
 933                         # Apply rate limit
 934                         self.slow_down(start, byte_counter - resume_len)
 935
 936                 stream.close()
 937                 self.report_finish()
 938                 if data_len is not None and byte_counter != data_len:
 939                         raise ContentTooShortError(byte_counter, long(data_len))
 940                 self.try_rename(tmpfilename, filename)
 941
 942                 # Update file modification time
 943                 if self.params.get('updatetime', True):
 944                         self.try_utime(filename, data.info().get('last-modified', None))
 945
 946                 return True
 947
 948 class InfoExtractor(object):
 949         """Information Extractor class.
 950
 951         Information extractors are the classes that, given a URL, extract
 952         information from the video (or videos) the URL refers to. This
 953         information includes the real video URL, the video title and simplified
 954         title, author and others. The information is stored in a dictionary
 955         which is then passed to the FileDownloader. The FileDownloader
 956         processes this information possibly downloading the video to the file
 957         system, among other possible outcomes. The dictionaries must include
 958         the following fields:
 959
 960         id:             Video identifier.
 961         url:            Final video URL.
 962         uploader:       Nickname of the video uploader.
 963         title:          Literal title.
 964         stitle:         Simplified title.
 965         ext:            Video filename extension.
 966         format:         Video format.
 967         player_url:     SWF Player URL (may be None).
 968
 969         The following fields are optional. Their primary purpose is to allow
 970         youtube-dl to serve as the backend for a video search function, such
 971         as the one in youtube2mp3.  They are only used when their respective
 972         forced printing functions are called:
 973
 974         thumbnail:      Full URL to a video thumbnail image.
 975         description:    One-line video description.
 976
 977         Subclasses of this one should re-define the _real_initialize() and
 978         _real_extract() methods, as well as the suitable() static method.
 979         Probably, they should also be instantiated and added to the main
 980         downloader.
 981         """
 982
 983         _ready = False
 984         _downloader = None
 985
 986         def __init__(self, downloader=None):
 987                 """Constructor. Receives an optional downloader."""
 988                 self._ready = False
 989                 self.set_downloader(downloader)
 990
 991         @staticmethod
 992         def suitable(url):
 993                 """Receives a URL and returns True if suitable for this IE."""
 994                 return False
 995
 996         def initialize(self):
 997                 """Initializes an instance (authentication, etc)."""
 998                 if not self._ready:
 999                         self._real_initialize()
1000                         self._ready = True
1001
1002         def extract(self, url):
1003                 """Extracts URL information and returns it in list of dicts."""
1004                 self.initialize()
1005                 return self._real_extract(url)
1006
1007         def set_downloader(self, downloader):
1008                 """Sets the downloader for this IE."""
1009                 self._downloader = downloader
1010
1011         def _real_initialize(self):
1012                 """Real initialization process. Redefine in subclasses."""
1013                 pass
1014
1015         def _real_extract(self, url):
1016                 """Real extraction process. Redefine in subclasses."""
1017                 pass
1018
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # Pattern a URL must match to be handled by this IE (see suitable());
        # _real_extract() takes the video id from group 2.
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Fetched by _real_initialize() to set the site language.
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        # The login form is submitted to this URL by _real_initialize().
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        # The age confirmation form is submitted to this URL after logging in.
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in the .netrc file when 'usenetrc' is set.
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
        # Filename extension per format code; not every code listed in
        # _available_formats has an entry here.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '45': 'webm',
        }
1039
1040         @staticmethod
1041         def suitable(url):
1042                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1043
1044         def report_lang(self):
1045                 """Report attempt to set language."""
1046                 self._downloader.to_screen(u'[youtube] Setting language')
1047
1048         def report_login(self):
1049                 """Report attempt to log in."""
1050                 self._downloader.to_screen(u'[youtube] Logging in')
1051
1052         def report_age_confirmation(self):
1053                 """Report attempt to confirm age."""
1054                 self._downloader.to_screen(u'[youtube] Confirming age')
1055
1056         def report_video_webpage_download(self, video_id):
1057                 """Report attempt to download video webpage."""
1058                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1059
1060         def report_video_info_webpage_download(self, video_id):
1061                 """Report attempt to download video info webpage."""
1062                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1063
1064         def report_information_extraction(self, video_id):
1065                 """Report attempt to extract video information."""
1066                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1067
1068         def report_unavailable_format(self, video_id, format):
1069                 """Report extracted video URL."""
1070                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1071
1072         def report_rtmp_download(self):
1073                 """Indicate the download will use the RTMP protocol."""
1074                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1075
1076         def _real_initialize(self):
1077                 if self._downloader is None:
1078                         return
1079
1080                 username = None
1081                 password = None
1082                 downloader_params = self._downloader.params
1083
1084                 # Attempt to use provided username and password or .netrc data
1085                 if downloader_params.get('username', None) is not None:
1086                         username = downloader_params['username']
1087                         password = downloader_params['password']
1088                 elif downloader_params.get('usenetrc', False):
1089                         try:
1090                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1091                                 if info is not None:
1092                                         username = info[0]
1093                                         password = info[2]
1094                                 else:
1095                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1096                         except (IOError, netrc.NetrcParseError), err:
1097                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1098                                 return
1099
1100                 # Set language
1101                 request = urllib2.Request(self._LANG_URL)
1102                 try:
1103                         self.report_lang()
1104                         urllib2.urlopen(request).read()
1105                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1106                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1107                         return
1108
1109                 # No authentication to be performed
1110                 if username is None:
1111                         return
1112
1113                 # Log in
1114                 login_form = {
1115                                 'current_form': 'loginForm',
1116                                 'next':         '/',
1117                                 'action_login': 'Log In',
1118                                 'username':     username,
1119                                 'password':     password,
1120                                 }
1121                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1122                 try:
1123                         self.report_login()
1124                         login_results = urllib2.urlopen(request).read()
1125                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1126                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1127                                 return
1128                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1129                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1130                         return
1131
1132                 # Confirm age
1133                 age_form = {
1134                                 'next_url':             '/',
1135                                 'action_confirm':       'Confirm',
1136                                 }
1137                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1138                 try:
1139                         self.report_age_confirmation()
1140                         age_results = urllib2.urlopen(request).read()
1141                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1142                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1143                         return
1144
1145         def _real_extract(self, url):
1146                 # Extract video id from URL
1147                 mobj = re.match(self._VALID_URL, url)
1148                 if mobj is None:
1149                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1150                         return
1151                 video_id = mobj.group(2)
1152
1153                 # Get video webpage
1154                 self.report_video_webpage_download(video_id)
1155                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1156                 try:
1157                         video_webpage = urllib2.urlopen(request).read()
1158                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1159                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1160                         return
1161
1162                 # Attempt to extract SWF player URL
1163                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1164                 if mobj is not None:
1165                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1166                 else:
1167                         player_url = None
1168
1169                 # Get video info
1170                 self.report_video_info_webpage_download(video_id)
1171                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1172                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1173                                            % (video_id, el_type))
1174                         request = urllib2.Request(video_info_url)
1175                         try:
1176                                 video_info_webpage = urllib2.urlopen(request).read()
1177                                 video_info = parse_qs(video_info_webpage)
1178                                 if 'token' in video_info:
1179                                         break
1180                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1182                                 return
1183                 if 'token' not in video_info:
1184                         if 'reason' in video_info:
1185                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1186                         else:
1187                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1188                         return
1189
1190                 # Start extracting information
1191                 self.report_information_extraction(video_id)
1192
1193                 # uploader
1194                 if 'author' not in video_info:
1195                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1196                         return
1197                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1198
1199                 # title
1200                 if 'title' not in video_info:
1201                         self._downloader.trouble(u'ERROR: unable to extract video title')
1202                         return
1203                 video_title = urllib.unquote_plus(video_info['title'][0])
1204                 video_title = video_title.decode('utf-8')
1205                 video_title = sanitize_title(video_title)
1206
1207                 # simplified title
1208                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1209                 simple_title = simple_title.strip(ur'_')
1210
1211                 # thumbnail image
1212                 if 'thumbnail_url' not in video_info:
1213                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1214                         video_thumbnail = ''
1215                 else:   # don't panic if we can't find it
1216                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1217
1218                 # upload date
1219                 upload_date = u'NA'
1220                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1221                 if mobj is not None:
1222                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1223                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1224                         for expression in format_expressions:
1225                                 try:
1226                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1227                                 except:
1228                                         pass
1229
1230                 # description
1231                 try:
1232                         lxml.etree
1233                 except NameError:
1234                         video_description = u'No description available.'
1235                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1236                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1237                                 if mobj is not None:
1238                                         video_description = mobj.group(1).decode('utf-8')
1239                 else:
1240                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1241                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1242                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1243                         # TODO use another parser
1244
1245                 # token
1246                 video_token = urllib.unquote_plus(video_info['token'][0])
1247
1248                 # Decide which formats to download
1249                 req_format = self._downloader.params.get('format', None)
1250
1251                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1252                         self.report_rtmp_download()
1253                         video_url_list = [(None, video_info['conn'][0])]
1254                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1255                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1256                         url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1257                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1258                         url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1259
1260                         format_limit = self._downloader.params.get('format_limit', None)
1261                         if format_limit is not None and format_limit in self._available_formats:
1262                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1263                         else:
1264                                 format_list = self._available_formats
1265                         existing_formats = [x for x in format_list if x in url_map]
1266                         if len(existing_formats) == 0:
1267                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1268                                 return
1269                         if req_format is None:
1270                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1271                         elif req_format == '-1':
1272                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1273                         else:
1274                                 # Specific format
1275                                 if req_format not in url_map:
1276                                         self._downloader.trouble(u'ERROR: requested format not available')
1277                                         return
1278                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1279                 else:
1280                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1281                         return
1282
1283                 for format_param, video_real_url in video_url_list:
1284                         # At this point we have a new video
1285                         self._downloader.increment_downloads()
1286
1287                         # Extension
1288                         video_extension = self._video_extensions.get(format_param, 'flv')
1289
1290                         # Find the video URL in fmt_url_map or conn paramters
1291                         try:
1292                                 # Process video information
1293                                 self._downloader.process_info({
1294                                         'id':           video_id.decode('utf-8'),
1295                                         'url':          video_real_url.decode('utf-8'),
1296                                         'uploader':     video_uploader.decode('utf-8'),
1297                                         'upload_date':  upload_date,
1298                                         'title':        video_title,
1299                                         'stitle':       simple_title,
1300                                         'ext':          video_extension.decode('utf-8'),
1301                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1302                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1303                                         'description':  video_description,
1304                                         'player_url':   player_url,
1305                                 })
1306                         except UnavailableVideoError, err:
1307                                 self._downloader.trouble(u'\nERROR: unable to download video')
1308
1309
1310 class MetacafeIE(InfoExtractor):
1311         """Information Extractor for metacafe.com."""
1312
1313         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1314         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1315         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1316         _youtube_ie = None
1317
1318         def __init__(self, youtube_ie, downloader=None):
1319                 InfoExtractor.__init__(self, downloader)
1320                 self._youtube_ie = youtube_ie
1321
1322         @staticmethod
1323         def suitable(url):
1324                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1325
1326         def report_disclaimer(self):
1327                 """Report disclaimer retrieval."""
1328                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1329
1330         def report_age_confirmation(self):
1331                 """Report attempt to confirm age."""
1332                 self._downloader.to_screen(u'[metacafe] Confirming age')
1333
1334         def report_download_webpage(self, video_id):
1335                 """Report webpage download."""
1336                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1337
1338         def report_extraction(self, video_id):
1339                 """Report information extraction."""
1340                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1341
1342         def _real_initialize(self):
1343                 # Retrieve disclaimer
1344                 request = urllib2.Request(self._DISCLAIMER)
1345                 try:
1346                         self.report_disclaimer()
1347                         disclaimer = urllib2.urlopen(request).read()
1348                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1349                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1350                         return
1351
1352                 # Confirm age
1353                 disclaimer_form = {
1354                         'filters': '0',
1355                         'submit': "Continue - I'm over 18",
1356                         }
1357                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1358                 try:
1359                         self.report_age_confirmation()
1360                         disclaimer = urllib2.urlopen(request).read()
1361                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1362                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1363                         return
1364
1365         def _real_extract(self, url):
1366                 # Extract id and simplified title from URL
1367                 mobj = re.match(self._VALID_URL, url)
1368                 if mobj is None:
1369                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1370                         return
1371
1372                 video_id = mobj.group(1)
1373
1374                 # Check if video comes from YouTube
1375                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1376                 if mobj2 is not None:
1377                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1378                         return
1379
1380                 # At this point we have a new video
1381                 self._downloader.increment_downloads()
1382
1383                 simple_title = mobj.group(2).decode('utf-8')
1384
1385                 # Retrieve video webpage to extract further information
1386                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1387                 try:
1388                         self.report_download_webpage(video_id)
1389                         webpage = urllib2.urlopen(request).read()
1390                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1391                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1392                         return
1393
1394                 # Extract URL, uploader and title from webpage
1395                 self.report_extraction(video_id)
1396                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1397                 if mobj is not None:
1398                         mediaURL = urllib.unquote(mobj.group(1))
1399                         video_extension = mediaURL[-3:]
1400
1401                         # Extract gdaKey if available
1402                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1403                         if mobj is None:
1404                                 video_url = mediaURL
1405                         else:
1406                                 gdaKey = mobj.group(1)
1407                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1408                 else:
1409                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1410                         if mobj is None:
1411                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1412                                 return
1413                         vardict = parse_qs(mobj.group(1))
1414                         if 'mediaData' not in vardict:
1415                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1416                                 return
1417                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1418                         if mobj is None:
1419                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1420                                 return
1421                         mediaURL = mobj.group(1).replace('\\/', '/')
1422                         video_extension = mediaURL[-3:]
1423                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1424
1425                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1426                 if mobj is None:
1427                         self._downloader.trouble(u'ERROR: unable to extract title')
1428                         return
1429                 video_title = mobj.group(1).decode('utf-8')
1430                 video_title = sanitize_title(video_title)
1431
1432                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1433                 if mobj is None:
1434                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1435                         return
1436                 video_uploader = mobj.group(1)
1437
1438                 try:
1439                         # Process video information
1440                         self._downloader.process_info({
1441                                 'id':           video_id.decode('utf-8'),
1442                                 'url':          video_url.decode('utf-8'),
1443                                 'uploader':     video_uploader.decode('utf-8'),
1444                                 'upload_date':  u'NA',
1445                                 'title':        video_title,
1446                                 'stitle':       simple_title,
1447                                 'ext':          video_extension.decode('utf-8'),
1448                                 'format':       u'NA',
1449                                 'player_url':   None,
1450                         })
1451                 except UnavailableVideoError:
1452                         self._downloader.trouble(u'\nERROR: unable to download video')
1453
1454
1455 class DailymotionIE(InfoExtractor):
1456         """Information Extractor for Dailymotion"""
1457
1458         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1459
1460         def __init__(self, downloader=None):
1461                 InfoExtractor.__init__(self, downloader)
1462
1463         @staticmethod
1464         def suitable(url):
1465                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1466
1467         def report_download_webpage(self, video_id):
1468                 """Report webpage download."""
1469                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1470
1471         def report_extraction(self, video_id):
1472                 """Report information extraction."""
1473                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1474
1475         def _real_initialize(self):
1476                 return
1477
1478         def _real_extract(self, url):
1479                 # Extract id and simplified title from URL
1480                 mobj = re.match(self._VALID_URL, url)
1481                 if mobj is None:
1482                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1483                         return
1484
1485                 # At this point we have a new video
1486                 self._downloader.increment_downloads()
1487                 video_id = mobj.group(1)
1488
1489                 simple_title = mobj.group(2).decode('utf-8')
1490                 video_extension = 'flv'
1491
1492                 # Retrieve video webpage to extract further information
1493                 request = urllib2.Request(url)
1494                 try:
1495                         self.report_download_webpage(video_id)
1496                         webpage = urllib2.urlopen(request).read()
1497                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1498                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1499                         return
1500
1501                 # Extract URL, uploader and title from webpage
1502                 self.report_extraction(video_id)
1503                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1504                 if mobj is None:
1505                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1506                         return
1507                 mediaURL = urllib.unquote(mobj.group(1))
1508
1509                 # if needed add http://www.dailymotion.com/ if relative URL
1510
1511                 video_url = mediaURL
1512
1513                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1514                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1515                 if mobj is None:
1516                         self._downloader.trouble(u'ERROR: unable to extract title')
1517                         return
1518                 video_title = mobj.group(1).decode('utf-8')
1519                 video_title = sanitize_title(video_title)
1520
1521                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1522                 if mobj is None:
1523                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1524                         return
1525                 video_uploader = mobj.group(1)
1526
1527                 try:
1528                         # Process video information
1529                         self._downloader.process_info({
1530                                 'id':           video_id.decode('utf-8'),
1531                                 'url':          video_url.decode('utf-8'),
1532                                 'uploader':     video_uploader.decode('utf-8'),
1533                                 'upload_date':  u'NA',
1534                                 'title':        video_title,
1535                                 'stitle':       simple_title,
1536                                 'ext':          video_extension.decode('utf-8'),
1537                                 'format':       u'NA',
1538                                 'player_url':   None,
1539                         })
1540                 except UnavailableVideoError:
1541                         self._downloader.trouble(u'\nERROR: unable to download video')
1542
1543 class GoogleIE(InfoExtractor):
1544         """Information extractor for video.google.com."""
1545
1546         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1547
1548         def __init__(self, downloader=None):
1549                 InfoExtractor.__init__(self, downloader)
1550
1551         @staticmethod
1552         def suitable(url):
1553                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1554
1555         def report_download_webpage(self, video_id):
1556                 """Report webpage download."""
1557                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1558
1559         def report_extraction(self, video_id):
1560                 """Report information extraction."""
1561                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1562
1563         def _real_initialize(self):
1564                 return
1565
1566         def _real_extract(self, url):
1567                 # Extract id from URL
1568                 mobj = re.match(self._VALID_URL, url)
1569                 if mobj is None:
1570                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1571                         return
1572
1573                 # At this point we have a new video
1574                 self._downloader.increment_downloads()
1575                 video_id = mobj.group(1)
1576
1577                 video_extension = 'mp4'
1578
1579                 # Retrieve video webpage to extract further information
1580                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1581                 try:
1582                         self.report_download_webpage(video_id)
1583                         webpage = urllib2.urlopen(request).read()
1584                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1585                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1586                         return
1587
1588                 # Extract URL, uploader, and title from webpage
1589                 self.report_extraction(video_id)
1590                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1591                 if mobj is None:
1592                         video_extension = 'flv'
1593                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1594                 if mobj is None:
1595                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1596                         return
1597                 mediaURL = urllib.unquote(mobj.group(1))
1598                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1599                 mediaURL = mediaURL.replace('\\x26', '\x26')
1600
1601                 video_url = mediaURL
1602
1603                 mobj = re.search(r'<title>(.*)</title>', webpage)
1604                 if mobj is None:
1605                         self._downloader.trouble(u'ERROR: unable to extract title')
1606                         return
1607                 video_title = mobj.group(1).decode('utf-8')
1608                 video_title = sanitize_title(video_title)
1609                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1610
1611                 # Extract video description
1612                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1613                 if mobj is None:
1614                         self._downloader.trouble(u'ERROR: unable to extract video description')
1615                         return
1616                 video_description = mobj.group(1).decode('utf-8')
1617                 if not video_description:
1618                         video_description = 'No description available.'
1619
1620                 # Extract video thumbnail
1621                 if self._downloader.params.get('forcethumbnail', False):
1622                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1623                         try:
1624                                 webpage = urllib2.urlopen(request).read()
1625                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1626                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1627                                 return
1628                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1629                         if mobj is None:
1630                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1631                                 return
1632                         video_thumbnail = mobj.group(1)
1633                 else:   # we need something to pass to process_info
1634                         video_thumbnail = ''
1635
1636
1637                 try:
1638                         # Process video information
1639                         self._downloader.process_info({
1640                                 'id':           video_id.decode('utf-8'),
1641                                 'url':          video_url.decode('utf-8'),
1642                                 'uploader':     u'NA',
1643                                 'upload_date':  u'NA',
1644                                 'title':        video_title,
1645                                 'stitle':       simple_title,
1646                                 'ext':          video_extension.decode('utf-8'),
1647                                 'format':       u'NA',
1648                                 'player_url':   None,
1649                         })
1650                 except UnavailableVideoError:
1651                         self._downloader.trouble(u'\nERROR: unable to download video')
1652
1653
1654 class PhotobucketIE(InfoExtractor):
1655         """Information extractor for photobucket.com."""
1656
1657         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1658
1659         def __init__(self, downloader=None):
1660                 InfoExtractor.__init__(self, downloader)
1661
1662         @staticmethod
1663         def suitable(url):
1664                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1665
1666         def report_download_webpage(self, video_id):
1667                 """Report webpage download."""
1668                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1669
1670         def report_extraction(self, video_id):
1671                 """Report information extraction."""
1672                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1673
1674         def _real_initialize(self):
1675                 return
1676
1677         def _real_extract(self, url):
1678                 # Extract id from URL
1679                 mobj = re.match(self._VALID_URL, url)
1680                 if mobj is None:
1681                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1682                         return
1683
1684                 # At this point we have a new video
1685                 self._downloader.increment_downloads()
1686                 video_id = mobj.group(1)
1687
1688                 video_extension = 'flv'
1689
1690                 # Retrieve video webpage to extract further information
1691                 request = urllib2.Request(url)
1692                 try:
1693                         self.report_download_webpage(video_id)
1694                         webpage = urllib2.urlopen(request).read()
1695                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1696                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1697                         return
1698
1699                 # Extract URL, uploader, and title from webpage
1700                 self.report_extraction(video_id)
1701                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1702                 if mobj is None:
1703                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1704                         return
1705                 mediaURL = urllib.unquote(mobj.group(1))
1706
1707                 video_url = mediaURL
1708
1709                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1710                 if mobj is None:
1711                         self._downloader.trouble(u'ERROR: unable to extract title')
1712                         return
1713                 video_title = mobj.group(1).decode('utf-8')
1714                 video_title = sanitize_title(video_title)
1715                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1716
1717                 video_uploader = mobj.group(2).decode('utf-8')
1718
1719                 try:
1720                         # Process video information
1721                         self._downloader.process_info({
1722                                 'id':           video_id.decode('utf-8'),
1723                                 'url':          video_url.decode('utf-8'),
1724                                 'uploader':     video_uploader,
1725                                 'upload_date':  u'NA',
1726                                 'title':        video_title,
1727                                 'stitle':       simple_title,
1728                                 'ext':          video_extension.decode('utf-8'),
1729                                 'format':       u'NA',
1730                                 'player_url':   None,
1731                         })
1732                 except UnavailableVideoError:
1733                         self._downloader.trouble(u'\nERROR: unable to download video')
1734
1735
1736 class YahooIE(InfoExtractor):
1737         """Information extractor for video.yahoo.com."""
1738
1739         # _VALID_URL matches all Yahoo! Video URLs
1740         # _VPAGE_URL matches only the extractable '/watch/' URLs
1741         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1742         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1743
1744         def __init__(self, downloader=None):
1745                 InfoExtractor.__init__(self, downloader)
1746
1747         @staticmethod
1748         def suitable(url):
1749                 return (re.match(YahooIE._VALID_URL, url) is not None)
1750
1751         def report_download_webpage(self, video_id):
1752                 """Report webpage download."""
1753                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1754
1755         def report_extraction(self, video_id):
1756                 """Report information extraction."""
1757                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1758
1759         def _real_initialize(self):
1760                 return
1761
1762         def _real_extract(self, url, new_video=True):
1763                 # Extract ID from URL
1764                 mobj = re.match(self._VALID_URL, url)
1765                 if mobj is None:
1766                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1767                         return
1768
1769                 # At this point we have a new video
1770                 self._downloader.increment_downloads()
1771                 video_id = mobj.group(2)
1772                 video_extension = 'flv'
1773
1774                 # Rewrite valid but non-extractable URLs as
1775                 # extractable English language /watch/ URLs
1776                 if re.match(self._VPAGE_URL, url) is None:
1777                         request = urllib2.Request(url)
1778                         try:
1779                                 webpage = urllib2.urlopen(request).read()
1780                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1781                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1782                                 return
1783
1784                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1785                         if mobj is None:
1786                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1787                                 return
1788                         yahoo_id = mobj.group(1)
1789
1790                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1791                         if mobj is None:
1792                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1793                                 return
1794                         yahoo_vid = mobj.group(1)
1795
1796                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1797                         return self._real_extract(url, new_video=False)
1798
1799                 # Retrieve video webpage to extract further information
1800                 request = urllib2.Request(url)
1801                 try:
1802                         self.report_download_webpage(video_id)
1803                         webpage = urllib2.urlopen(request).read()
1804                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1806                         return
1807
1808                 # Extract uploader and title from webpage
1809                 self.report_extraction(video_id)
1810                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1811                 if mobj is None:
1812                         self._downloader.trouble(u'ERROR: unable to extract video title')
1813                         return
1814                 video_title = mobj.group(1).decode('utf-8')
1815                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1816
1817                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1818                 if mobj is None:
1819                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1820                         return
1821                 video_uploader = mobj.group(1).decode('utf-8')
1822
1823                 # Extract video thumbnail
1824                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1825                 if mobj is None:
1826                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1827                         return
1828                 video_thumbnail = mobj.group(1).decode('utf-8')
1829
1830                 # Extract video description
1831                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1832                 if mobj is None:
1833                         self._downloader.trouble(u'ERROR: unable to extract video description')
1834                         return
1835                 video_description = mobj.group(1).decode('utf-8')
1836                 if not video_description: video_description = 'No description available.'
1837
1838                 # Extract video height and width
1839                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1840                 if mobj is None:
1841                         self._downloader.trouble(u'ERROR: unable to extract video height')
1842                         return
1843                 yv_video_height = mobj.group(1)
1844
1845                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1846                 if mobj is None:
1847                         self._downloader.trouble(u'ERROR: unable to extract video width')
1848                         return
1849                 yv_video_width = mobj.group(1)
1850
1851                 # Retrieve video playlist to extract media URL
1852                 # I'm not completely sure what all these options are, but we
1853                 # seem to need most of them, otherwise the server sends a 401.
1854                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1855                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1856                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1857                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1858                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1859                 try:
1860                         self.report_download_webpage(video_id)
1861                         webpage = urllib2.urlopen(request).read()
1862                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1863                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1864                         return
1865
1866                 # Extract media URL from playlist XML
1867                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1868                 if mobj is None:
1869                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1870                         return
1871                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1872                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1873
1874                 try:
1875                         # Process video information
1876                         self._downloader.process_info({
1877                                 'id':           video_id.decode('utf-8'),
1878                                 'url':          video_url,
1879                                 'uploader':     video_uploader,
1880                                 'upload_date':  u'NA',
1881                                 'title':        video_title,
1882                                 'stitle':       simple_title,
1883                                 'ext':          video_extension.decode('utf-8'),
1884                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1885                                 'description':  video_description,
1886                                 'thumbnail':    video_thumbnail,
1887                                 'description':  video_description,
1888                                 'player_url':   None,
1889                         })
1890                 except UnavailableVideoError:
1891                         self._downloader.trouble(u'\nERROR: unable to download video')
1892
1893
1894 class GenericIE(InfoExtractor):
1895         """Generic last-resort information extractor."""
1896
1897         def __init__(self, downloader=None):
1898                 InfoExtractor.__init__(self, downloader)
1899
1900         @staticmethod
1901         def suitable(url):
1902                 return True
1903
1904         def report_download_webpage(self, video_id):
1905                 """Report webpage download."""
1906                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1907                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1908
1909         def report_extraction(self, video_id):
1910                 """Report information extraction."""
1911                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1912
1913         def _real_initialize(self):
1914                 return
1915
1916         def _real_extract(self, url):
1917                 # At this point we have a new video
1918                 self._downloader.increment_downloads()
1919
1920                 video_id = url.split('/')[-1]
1921                 request = urllib2.Request(url)
1922                 try:
1923                         self.report_download_webpage(video_id)
1924                         webpage = urllib2.urlopen(request).read()
1925                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1926                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1927                         return
1928                 except ValueError, err:
1929                         # since this is the last-resort InfoExtractor, if
1930                         # this error is thrown, it'll be thrown here
1931                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1932                         return
1933
1934                 self.report_extraction(video_id)
1935                 # Start with something easy: JW Player in SWFObject
1936                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1937                 if mobj is None:
1938                         # Broaden the search a little bit
1939                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1940                 if mobj is None:
1941                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1942                         return
1943
1944                 # It's possible that one of the regexes
1945                 # matched, but returned an empty group:
1946                 if mobj.group(1) is None:
1947                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1948                         return
1949
1950                 video_url = urllib.unquote(mobj.group(1))
1951                 video_id  = os.path.basename(video_url)
1952
1953                 # here's a fun little line of code for you:
1954                 video_extension = os.path.splitext(video_id)[1][1:]
1955                 video_id        = os.path.splitext(video_id)[0]
1956
1957                 # it's tempting to parse this further, but you would
1958                 # have to take into account all the variations like
1959                 #   Video Title - Site Name
1960                 #   Site Name | Video Title
1961                 #   Video Title - Tagline | Site Name
1962                 # and so on and so forth; it's just not practical
1963                 mobj = re.search(r'<title>(.*)</title>', webpage)
1964                 if mobj is None:
1965                         self._downloader.trouble(u'ERROR: unable to extract title')
1966                         return
1967                 video_title = mobj.group(1).decode('utf-8')
1968                 video_title = sanitize_title(video_title)
1969                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1970
1971                 # video uploader is domain name
1972                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1973                 if mobj is None:
1974                         self._downloader.trouble(u'ERROR: unable to extract title')
1975                         return
1976                 video_uploader = mobj.group(1).decode('utf-8')
1977
1978                 try:
1979                         # Process video information
1980                         self._downloader.process_info({
1981                                 'id':           video_id.decode('utf-8'),
1982                                 'url':          video_url.decode('utf-8'),
1983                                 'uploader':     video_uploader,
1984                                 'upload_date':  u'NA',
1985                                 'title':        video_title,
1986                                 'stitle':       simple_title,
1987                                 'ext':          video_extension.decode('utf-8'),
1988                                 'format':       u'NA',
1989                                 'player_url':   None,
1990                         })
1991                 except UnavailableVideoError, err:
1992                         self._downloader.trouble(u'\nERROR: unable to download video')
1993
1994
1995 class YoutubeSearchIE(InfoExtractor):
1996         """Information Extractor for YouTube search queries."""
1997         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1998         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1999         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2000         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2001         _youtube_ie = None
2002         _max_youtube_results = 1000
2003
2004         def __init__(self, youtube_ie, downloader=None):
2005                 InfoExtractor.__init__(self, downloader)
2006                 self._youtube_ie = youtube_ie
2007
2008         @staticmethod
2009         def suitable(url):
2010                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2011
2012         def report_download_page(self, query, pagenum):
2013                 """Report attempt to download playlist page with given number."""
2014                 query = query.decode(preferredencoding())
2015                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2016
2017         def _real_initialize(self):
2018                 self._youtube_ie.initialize()
2019
2020         def _real_extract(self, query):
2021                 mobj = re.match(self._VALID_QUERY, query)
2022                 if mobj is None:
2023                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2024                         return
2025
2026                 prefix, query = query.split(':')
2027                 prefix = prefix[8:]
2028                 query  = query.encode('utf-8')
2029                 if prefix == '':
2030                         self._download_n_results(query, 1)
2031                         return
2032                 elif prefix == 'all':
2033                         self._download_n_results(query, self._max_youtube_results)
2034                         return
2035                 else:
2036                         try:
2037                                 n = long(prefix)
2038                                 if n <= 0:
2039                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2040                                         return
2041                                 elif n > self._max_youtube_results:
2042                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2043                                         n = self._max_youtube_results
2044                                 self._download_n_results(query, n)
2045                                 return
2046                         except ValueError: # parsing prefix as integer fails
2047                                 self._download_n_results(query, 1)
2048                                 return
2049
2050         def _download_n_results(self, query, n):
2051                 """Downloads a specified number of results for a query"""
2052
2053                 video_ids = []
2054                 already_seen = set()
2055                 pagenum = 1
2056
2057                 while True:
2058                         self.report_download_page(query, pagenum)
2059                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2060                         request = urllib2.Request(result_url)
2061                         try:
2062                                 page = urllib2.urlopen(request).read()
2063                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2064                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2065                                 return
2066
2067                         # Extract video identifiers
2068                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2069                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2070                                 if video_id not in already_seen:
2071                                         video_ids.append(video_id)
2072                                         already_seen.add(video_id)
2073                                         if len(video_ids) == n:
2074                                                 # Specified n videos reached
2075                                                 for id in video_ids:
2076                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2077                                                 return
2078
2079                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2080                                 for id in video_ids:
2081                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2082                                 return
2083
2084                         pagenum = pagenum + 1
2085
2086 class GoogleSearchIE(InfoExtractor):
2087         """Information Extractor for Google Video search queries."""
2088         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2089         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2090         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2091         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2092         _google_ie = None
2093         _max_google_results = 1000
2094
2095         def __init__(self, google_ie, downloader=None):
2096                 InfoExtractor.__init__(self, downloader)
2097                 self._google_ie = google_ie
2098
2099         @staticmethod
2100         def suitable(url):
2101                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2102
2103         def report_download_page(self, query, pagenum):
2104                 """Report attempt to download playlist page with given number."""
2105                 query = query.decode(preferredencoding())
2106                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2107
2108         def _real_initialize(self):
2109                 self._google_ie.initialize()
2110
2111         def _real_extract(self, query):
2112                 mobj = re.match(self._VALID_QUERY, query)
2113                 if mobj is None:
2114                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2115                         return
2116
2117                 prefix, query = query.split(':')
2118                 prefix = prefix[8:]
2119                 query  = query.encode('utf-8')
2120                 if prefix == '':
2121                         self._download_n_results(query, 1)
2122                         return
2123                 elif prefix == 'all':
2124                         self._download_n_results(query, self._max_google_results)
2125                         return
2126                 else:
2127                         try:
2128                                 n = long(prefix)
2129                                 if n <= 0:
2130                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2131                                         return
2132                                 elif n > self._max_google_results:
2133                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2134                                         n = self._max_google_results
2135                                 self._download_n_results(query, n)
2136                                 return
2137                         except ValueError: # parsing prefix as integer fails
2138                                 self._download_n_results(query, 1)
2139                                 return
2140
2141         def _download_n_results(self, query, n):
2142                 """Downloads a specified number of results for a query"""
2143
2144                 video_ids = []
2145                 already_seen = set()
2146                 pagenum = 1
2147
2148                 while True:
2149                         self.report_download_page(query, pagenum)
2150                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2151                         request = urllib2.Request(result_url)
2152                         try:
2153                                 page = urllib2.urlopen(request).read()
2154                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2155                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2156                                 return
2157
2158                         # Extract video identifiers
2159                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2160                                 video_id = mobj.group(1)
2161                                 if video_id not in already_seen:
2162                                         video_ids.append(video_id)
2163                                         already_seen.add(video_id)
2164                                         if len(video_ids) == n:
2165                                                 # Specified n videos reached
2166                                                 for id in video_ids:
2167                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2168                                                 return
2169
2170                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2171                                 for id in video_ids:
2172                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2173                                 return
2174
2175                         pagenum = pagenum + 1
2176
2177 class YahooSearchIE(InfoExtractor):
2178         """Information Extractor for Yahoo! Video search queries."""
2179         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2180         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2181         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2182         _MORE_PAGES_INDICATOR = r'\s*Next'
2183         _yahoo_ie = None
2184         _max_yahoo_results = 1000
2185
2186         def __init__(self, yahoo_ie, downloader=None):
2187                 InfoExtractor.__init__(self, downloader)
2188                 self._yahoo_ie = yahoo_ie
2189
2190         @staticmethod
2191         def suitable(url):
2192                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2193
2194         def report_download_page(self, query, pagenum):
2195                 """Report attempt to download playlist page with given number."""
2196                 query = query.decode(preferredencoding())
2197                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2198
2199         def _real_initialize(self):
2200                 self._yahoo_ie.initialize()
2201
2202         def _real_extract(self, query):
2203                 mobj = re.match(self._VALID_QUERY, query)
2204                 if mobj is None:
2205                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2206                         return
2207
2208                 prefix, query = query.split(':')
2209                 prefix = prefix[8:]
2210                 query  = query.encode('utf-8')
2211                 if prefix == '':
2212                         self._download_n_results(query, 1)
2213                         return
2214                 elif prefix == 'all':
2215                         self._download_n_results(query, self._max_yahoo_results)
2216                         return
2217                 else:
2218                         try:
2219                                 n = long(prefix)
2220                                 if n <= 0:
2221                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2222                                         return
2223                                 elif n > self._max_yahoo_results:
2224                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2225                                         n = self._max_yahoo_results
2226                                 self._download_n_results(query, n)
2227                                 return
2228                         except ValueError: # parsing prefix as integer fails
2229                                 self._download_n_results(query, 1)
2230                                 return
2231
2232         def _download_n_results(self, query, n):
2233                 """Downloads a specified number of results for a query"""
2234
2235                 video_ids = []
2236                 already_seen = set()
2237                 pagenum = 1
2238
2239                 while True:
2240                         self.report_download_page(query, pagenum)
2241                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2242                         request = urllib2.Request(result_url)
2243                         try:
2244                                 page = urllib2.urlopen(request).read()
2245                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2246                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2247                                 return
2248
2249                         # Extract video identifiers
2250                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2251                                 video_id = mobj.group(1)
2252                                 if video_id not in already_seen:
2253                                         video_ids.append(video_id)
2254                                         already_seen.add(video_id)
2255                                         if len(video_ids) == n:
2256                                                 # Specified n videos reached
2257                                                 for id in video_ids:
2258                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2259                                                 return
2260
2261                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2262                                 for id in video_ids:
2263                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2264                                 return
2265
2266                         pagenum = pagenum + 1
2267
2268 class YoutubePlaylistIE(InfoExtractor):
2269         """Information Extractor for YouTube playlists."""
2270
2271         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2272         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2273         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2274         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2275         _youtube_ie = None
2276
2277         def __init__(self, youtube_ie, downloader=None):
2278                 InfoExtractor.__init__(self, downloader)
2279                 self._youtube_ie = youtube_ie
2280
2281         @staticmethod
2282         def suitable(url):
2283                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2284
2285         def report_download_page(self, playlist_id, pagenum):
2286                 """Report attempt to download playlist page with given number."""
2287                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2288
2289         def _real_initialize(self):
2290                 self._youtube_ie.initialize()
2291
2292         def _real_extract(self, url):
2293                 # Extract playlist id
2294                 mobj = re.match(self._VALID_URL, url)
2295                 if mobj is None:
2296                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2297                         return
2298
2299                 # Single video case
2300                 if mobj.group(3) is not None:
2301                         self._youtube_ie.extract(mobj.group(3))
2302                         return
2303
2304                 # Download playlist pages
2305                 # prefix is 'p' as default for playlists but there are other types that need extra care
2306                 playlist_prefix = mobj.group(1)
2307                 if playlist_prefix == 'a':
2308                         playlist_access = 'artist'
2309                 else:
2310                         playlist_prefix = 'p'
2311                         playlist_access = 'view_play_list'
2312                 playlist_id = mobj.group(2)
2313                 video_ids = []
2314                 pagenum = 1
2315
2316                 while True:
2317                         self.report_download_page(playlist_id, pagenum)
2318                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2319                         try:
2320                                 page = urllib2.urlopen(request).read()
2321                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2322                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2323                                 return
2324
2325                         # Extract video identifiers
2326                         ids_in_page = []
2327                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2328                                 if mobj.group(1) not in ids_in_page:
2329                                         ids_in_page.append(mobj.group(1))
2330                         video_ids.extend(ids_in_page)
2331
2332                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2333                                 break
2334                         pagenum = pagenum + 1
2335
2336                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2337                 playlistend = self._downloader.params.get('playlistend', -1)
2338                 video_ids = video_ids[playliststart:playlistend]
2339
2340                 for id in video_ids:
2341                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2342                 return
2343
2344 class YoutubeUserIE(InfoExtractor):
2345         """Information Extractor for YouTube users."""
2346
2347         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2348         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2349         _GDATA_PAGE_SIZE = 50
2350         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2351         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2352         _youtube_ie = None
2353
        def __init__(self, youtube_ie, downloader=None):
                """Run the base initializer with downloader and keep youtube_ie."""
                InfoExtractor.__init__(self, downloader)
                # _real_initialize() later calls initialize() on this object
                self._youtube_ie = youtube_ie
2357
2358         @staticmethod
2359         def suitable(url):
2360                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2361
2362         def report_download_page(self, username, start_index):
2363                 """Report attempt to download user page."""
2364                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2365                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2366
2367         def _real_initialize(self):
2368                 self._youtube_ie.initialize()
2369
2370         def _real_extract(self, url):
2371                 # Extract username
2372                 mobj = re.match(self._VALID_URL, url)
2373                 if mobj is None:
2374                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2375                         return
2376
2377                 username = mobj.group(1)
2378
2379                 # Download video ids using YouTube Data API. Result size per
2380                 # query is limited (currently to 50 videos) so we need to query
2381                 # page by page until there are no video ids - it means we got
2382                 # all of them.
2383
2384                 video_ids = []
2385                 pagenum = 0
2386
2387                 while True:
2388                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2389                         self.report_download_page(username, start_index)
2390
2391                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2392
2393                         try:
2394                                 page = urllib2.urlopen(request).read()
2395                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2396                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2397                                 return
2398
2399                         # Extract video identifiers
2400                         ids_in_page = []
2401
2402                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2403                                 if mobj.group(1) not in ids_in_page:
2404                                         ids_in_page.append(mobj.group(1))
2405
2406                         video_ids.extend(ids_in_page)
2407
2408                         # A little optimization - if current page is not
2409                         # "full", ie. does not contain PAGE_SIZE video ids then
2410                         # we can assume that this page is the last one - there
2411                         # are no more ids on further pages - no need to query
2412                         # again.
2413
2414                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2415                                 break
2416
2417                         pagenum += 1
2418
2419                 all_ids_count = len(video_ids)
2420                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2421                 playlistend = self._downloader.params.get('playlistend', -1)
2422
2423                 if playlistend == -1:
2424                         video_ids = video_ids[playliststart:]
2425                 else:
2426                         video_ids = video_ids[playliststart:playlistend]
2427
2428                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2429                                            (username, all_ids_count, len(video_ids)))
2430
2431                 for video_id in video_ids:
2432                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2433
2434
2435 class DepositFilesIE(InfoExtractor):
2436         """Information extractor for depositfiles.com"""
2437
2438         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2439
2440         def __init__(self, downloader=None):
2441                 InfoExtractor.__init__(self, downloader)
2442
2443         @staticmethod
2444         def suitable(url):
2445                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2446
2447         def report_download_webpage(self, file_id):
2448                 """Report webpage download."""
2449                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2450
2451         def report_extraction(self, file_id):
2452                 """Report information extraction."""
2453                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2454
2455         def _real_initialize(self):
2456                 return
2457
2458         def _real_extract(self, url):
2459                 # At this point we have a new file
2460                 self._downloader.increment_downloads()
2461
2462                 file_id = url.split('/')[-1]
2463                 # Rebuild url in english locale
2464                 url = 'http://depositfiles.com/en/files/' + file_id
2465
2466                 # Retrieve file webpage with 'Free download' button pressed
2467                 free_download_indication = { 'gateway_result' : '1' }
2468                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2469                 try:
2470                         self.report_download_webpage(file_id)
2471                         webpage = urllib2.urlopen(request).read()
2472                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2473                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2474                         return
2475
2476                 # Search for the real file URL
2477                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2478                 if (mobj is None) or (mobj.group(1) is None):
2479                         # Try to figure out reason of the error.
2480                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2481                         if (mobj is not None) and (mobj.group(1) is not None):
2482                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2483                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2484                         else:
2485                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2486                         return
2487
2488                 file_url = mobj.group(1)
2489                 file_extension = os.path.splitext(file_url)[1][1:]
2490
2491                 # Search for file title
2492                 mobj = re.search(r'<b title="(.*?)">', webpage)
2493                 if mobj is None:
2494                         self._downloader.trouble(u'ERROR: unable to extract title')
2495                         return
2496                 file_title = mobj.group(1).decode('utf-8')
2497
2498                 try:
2499                         # Process file information
2500                         self._downloader.process_info({
2501                                 'id':           file_id.decode('utf-8'),
2502                                 'url':          file_url.decode('utf-8'),
2503                                 'uploader':     u'NA',
2504                                 'upload_date':  u'NA',
2505                                 'title':        file_title,
2506                                 'stitle':       file_title,
2507                                 'ext':          file_extension.decode('utf-8'),
2508                                 'format':       u'NA',
2509                                 'player_url':   None,
2510                         })
2511                 except UnavailableVideoError, err:
2512                         self._downloader.trouble(u'ERROR: unable to download file')
2513
2514 class FacebookIE(InfoExtractor):
2515         """Information Extractor for Facebook"""
2516
2517         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2518         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2519         _NETRC_MACHINE = 'facebook'
2520         _available_formats = ['highqual', 'lowqual']
2521         _video_extensions = {
2522                 'highqual': 'mp4',
2523                 'lowqual': 'mp4',
2524         }
2525
2526         def __init__(self, downloader=None):
2527                 InfoExtractor.__init__(self, downloader)
2528
2529         @staticmethod
2530         def suitable(url):
2531                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2532
2533         def _reporter(self, message):
2534                 """Add header and report message."""
2535                 self._downloader.to_screen(u'[facebook] %s' % message)
2536
2537         def report_login(self):
2538                 """Report attempt to log in."""
2539                 self._reporter(u'Logging in')
2540
2541         def report_video_webpage_download(self, video_id):
2542                 """Report attempt to download video webpage."""
2543                 self._reporter(u'%s: Downloading video webpage' % video_id)
2544
2545         def report_information_extraction(self, video_id):
2546                 """Report attempt to extract video information."""
2547                 self._reporter(u'%s: Extracting video information' % video_id)
2548
2549         def _parse_page(self, video_webpage):
2550                 """Extract video information from page"""
2551                 # General data
2552                 data = {'title': r'class="video_title datawrap">(.*?)</',
2553                         'description': r'<div class="datawrap">(.*?)</div>',
2554                         'owner': r'\("video_owner_name", "(.*?)"\)',
2555                         'upload_date': r'data-date="(.*?)"',
2556                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2557                         }
2558                 video_info = {}
2559                 for piece in data.keys():
2560                         mobj = re.search(data[piece], video_webpage)
2561                         if mobj is not None:
2562                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2563
2564                 # Video urls
2565                 video_urls = {}
2566                 for fmt in self._available_formats:
2567                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2568                         if mobj is not None:
2569                                 # URL is in a Javascript segment inside an escaped Unicode format within
2570                                 # the generally utf-8 page
2571                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2572                 video_info['video_urls'] = video_urls
2573
2574                 return video_info
2575
2576         def _real_initialize(self):
2577                 if self._downloader is None:
2578                         return
2579
2580                 useremail = None
2581                 password = None
2582                 downloader_params = self._downloader.params
2583
2584                 # Attempt to use provided username and password or .netrc data
2585                 if downloader_params.get('username', None) is not None:
2586                         useremail = downloader_params['username']
2587                         password = downloader_params['password']
2588                 elif downloader_params.get('usenetrc', False):
2589                         try:
2590                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2591                                 if info is not None:
2592                                         useremail = info[0]
2593                                         password = info[2]
2594                                 else:
2595                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2596                         except (IOError, netrc.NetrcParseError), err:
2597                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2598                                 return
2599
2600                 if useremail is None:
2601                         return
2602
2603                 # Log in
2604                 login_form = {
2605                         'email': useremail,
2606                         'pass': password,
2607                         'login': 'Log+In'
2608                         }
2609                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2610                 try:
2611                         self.report_login()
2612                         login_results = urllib2.urlopen(request).read()
2613                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2614                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2615                                 return
2616                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2617                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2618                         return
2619
2620         def _real_extract(self, url):
2621                 mobj = re.match(self._VALID_URL, url)
2622                 if mobj is None:
2623                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2624                         return
2625                 video_id = mobj.group('ID')
2626
2627                 # Get video webpage
2628                 self.report_video_webpage_download(video_id)
2629                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2630                 try:
2631                         page = urllib2.urlopen(request)
2632                         video_webpage = page.read()
2633                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2634                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2635                         return
2636
2637                 # Start extracting information
2638                 self.report_information_extraction(video_id)
2639
2640                 # Extract information
2641                 video_info = self._parse_page(video_webpage)
2642
2643                 # uploader
2644                 if 'owner' not in video_info:
2645                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2646                         return
2647                 video_uploader = video_info['owner']
2648
2649                 # title
2650                 if 'title' not in video_info:
2651                         self._downloader.trouble(u'ERROR: unable to extract video title')
2652                         return
2653                 video_title = video_info['title']
2654                 video_title = video_title.decode('utf-8')
2655                 video_title = sanitize_title(video_title)
2656
2657                 # simplified title
2658                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2659                 simple_title = simple_title.strip(ur'_')
2660
2661                 # thumbnail image
2662                 if 'thumbnail' not in video_info:
2663                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2664                         video_thumbnail = ''
2665                 else:
2666                         video_thumbnail = video_info['thumbnail']
2667
2668                 # upload date
2669                 upload_date = u'NA'
2670                 if 'upload_date' in video_info:
2671                         upload_time = video_info['upload_date']
2672                         timetuple = email.utils.parsedate_tz(upload_time)
2673                         if timetuple is not None:
2674                                 try:
2675                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2676                                 except:
2677                                         pass
2678
2679                 # description
2680                 video_description = video_info.get('description', 'No description available.')
2681
2682                 url_map = video_info['video_urls']
2683                 if len(url_map.keys()) > 0:
2684                         # Decide which formats to download
2685                         req_format = self._downloader.params.get('format', None)
2686                         format_limit = self._downloader.params.get('format_limit', None)
2687
2688                         if format_limit is not None and format_limit in self._available_formats:
2689                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2690                         else:
2691                                 format_list = self._available_formats
2692                         existing_formats = [x for x in format_list if x in url_map]
2693                         if len(existing_formats) == 0:
2694                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2695                                 return
2696                         if req_format is None:
2697                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2698                         elif req_format == '-1':
2699                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2700                         else:
2701                                 # Specific format
2702                                 if req_format not in url_map:
2703                                         self._downloader.trouble(u'ERROR: requested format not available')
2704                                         return
2705                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2706
2707                 for format_param, video_real_url in video_url_list:
2708
2709                         # At this point we have a new video
2710                         self._downloader.increment_downloads()
2711
2712                         # Extension
2713                         video_extension = self._video_extensions.get(format_param, 'mp4')
2714
2715                         # Find the video URL in fmt_url_map or conn paramters
2716                         try:
2717                                 # Process video information
2718                                 self._downloader.process_info({
2719                                         'id':           video_id.decode('utf-8'),
2720                                         'url':          video_real_url.decode('utf-8'),
2721                                         'uploader':     video_uploader.decode('utf-8'),
2722                                         'upload_date':  upload_date,
2723                                         'title':        video_title,
2724                                         'stitle':       simple_title,
2725                                         'ext':          video_extension.decode('utf-8'),
2726                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2727                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2728                                         'description':  video_description.decode('utf-8'),
2729                                         'player_url':   None,
2730                                 })
2731                         except UnavailableVideoError, err:
2732                                 self._downloader.trouble(u'\nERROR: unable to download video')
2733
2734 class BlipTVIE(InfoExtractor):
2735         """Information extractor for blip.tv"""
2736
2737         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2738         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2739
2740         @staticmethod
2741         def suitable(url):
2742                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2743
2744         def report_extraction(self, file_id):
2745                 """Report information extraction."""
2746                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2747
2748         def _simplify_title(self, title):
2749                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2750                 res = res.strip(ur'_')
2751                 return res
2752
2753         def _real_extract(self, url):
2754                 mobj = re.match(self._VALID_URL, url)
2755                 if mobj is None:
2756                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2757                         return
2758
2759                 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2760                 request = urllib2.Request(json_url)
2761                 self.report_extraction(mobj.group(1))
2762                 try:
2763                         json_code = urllib2.urlopen(request).read()
2764                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2765                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2766                         return
2767                 try:
2768                         json_data = json.loads(json_code)
2769                         data = json_data['Post'] if 'Post' in json_data else json_data
2770
2771                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2772                         video_url = data['media']['url']
2773                         umobj = re.match(self._URL_EXT, video_url)
2774                         if umobj is None:
2775                                 raise ValueError('Can not determine filename extension')
2776                         ext = umobj.group(1)
2777
2778                         self._downloader.increment_downloads()
2779
2780                         info = {
2781                                 'id': data['item_id'],
2782                                 'url': video_url,
2783                                 'uploader': data['display_name'],
2784                                 'upload_date': upload_date,
2785                                 'title': data['title'],
2786                                 'stitle': self._simplify_title(data['title']),
2787                                 'ext': ext,
2788                                 'format': data['media']['mimeType'],
2789                                 'thumbnail': data['thumbnailUrl'],
2790                                 'description': data['description'],
2791                                 'player_url': data['embedUrl']
2792                         }
2793                 except (ValueError,KeyError), err:
2794                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2795                         return
2796
2797                 try:
2798                         self._downloader.process_info(info)
2799                 except UnavailableVideoError, err:
2800                         self._downloader.trouble(u'\nERROR: unable to download video')
2801
2802
class PostProcessor(object):
        """Base class for post processors.

        A downloader accepts PostProcessor objects through its
        add_post_processor() method. After a successful download it walks
        its chain of PostProcessors calling run() on each: the first one
        receives an initial argument and every following one receives the
        value returned by its predecessor.

        The walk ends when a run() call returns None or when the chain has
        no more elements.

        As with InfoExtractor objects, PostProcessors and downloaders go
        through a "mutual registration" process.
        """

        # Downloader this post processor is attached to, if any.
        _downloader = None

        def __init__(self, downloader=None):
                """Create the post processor, optionally attached to a downloader."""
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this PP to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Execute the post processing step.

                "information" is a dictionary like those built by the
                InfoExtractors, with one additional "filepath" field that
                holds the location of the downloaded file.

                Returning None stops the postprocessing chain. Returning a
                dictionary (the received one, possibly with some fields
                changed) passes it on to the next object in the chain.

                A PostProcessingError may also be raised; the calling
                downloader takes it into account.
                """
                # The base implementation passes the data through untouched
                return information
2848
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that replaces a downloaded file with its audio track.

        The audio codec is detected with ffprobe and the track is written
        with ffmpeg: copied as-is when possible, re-encoded otherwise.
        """

        # ffmpeg encoder used for each codec that can be explicitly requested.
        _ENCODERS = {'mp3': 'libmp3lame', 'aac': 'aac'}

        def __init__(self, downloader=None, preferredcodec=None):
                """Remember the preferred codec; None is treated as 'best'."""
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec

        @staticmethod
        def get_audio_codec(path):
                """Return the codec name of the first audio stream of path.

                Returns None when ffprobe cannot be run, exits with a non-zero
                status or reports no audio stream.
                """
                try:
                        cmd = ['ffprobe', '-show_streams', '--', path]
                        # The context manager closes the devnull handle even
                        # when launching ffprobe fails.
                        with open(os.path.devnull, 'w') as devnull:
                                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                                output = handle.communicate()[0]
                        if handle.wait() != 0:
                                return None
                except (IOError, OSError):
                        return None
                audio_codec = None
                # Remember the latest codec_name seen and return it as soon
                # as a following line marks the stream as audio.
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=', 1)[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                """Write the audio of path to out_path with the given codec.

                more_opts is a list of extra ffmpeg arguments placed before the
                output path. Returns True when ffmpeg exits with status 0 and
                False otherwise, including when ffmpeg cannot be run.
                """
                try:
                        cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                        # All ffmpeg output is discarded; the handle is closed
                        # once the call returns.
                        with open(os.path.devnull, 'w') as devnull:
                                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
                        return (ret == 0)
                except (IOError, OSError):
                        return False

        def run(self, information):
                """Extract the audio of information['filepath'].

                On success the original file is removed and the dictionary is
                returned with 'filepath' pointing to the audio file. Returns
                None (after a warning) when the codec cannot be detected or
                is not supported, when ffmpeg fails or when the original
                file cannot be removed.
                """
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                more_opts = []
                if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
                        if filecodec == 'aac' or filecodec == 'mp3':
                                # Lossless if possible
                                acodec = 'copy'
                                extension = filecodec
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                        else:
                                # MP3 otherwise.
                                acodec = 'libmp3lame'
                                extension = 'mp3'
                                more_opts = ['-ab', '128k']
                else:
                        # We convert the audio (lossy)
                        acodec = self._ENCODERS.get(self._preferredcodec)
                        if acodec is None:
                                # Warn instead of failing with a raw KeyError
                                self._downloader.to_stderr(u'WARNING: unsupported audio codec: %s' % self._preferredcodec)
                                return None
                        extension = self._preferredcodec
                        more_opts = ['-ab', '128k']
                        if self._preferredcodec == 'aac':
                                more_opts += ['-f', 'adts']

                (prefix, ext) = os.path.splitext(path)
                new_path = prefix + '.' + extension
                # NOTE(review): when path already ends in '.' + extension,
                # new_path equals path, so ffmpeg is asked to overwrite its own
                # input and the result is then removed below - confirm
                # whether this can happen in practice.
                self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
                status = self.run_ffmpeg(path, new_path, acodec, more_opts)

                if not status:
                        self._downloader.to_stderr(u'WARNING: error running ffmpeg')
                        return None

                try:
                        os.remove(path)
                except (IOError, OSError):
                        self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                        return None

                information['filepath'] = new_path
                return information
2930
2931 ### MAIN PROGRAM ###
2932 if __name__ == '__main__':
2933         try:
2934                 # Modules needed only when running the main program
2935                 import getpass
2936                 import optparse
2937
2938                 # Function to update the program file with the latest version from the repository.
2939                 def update_self(downloader, filename):
2940                         # Note: downloader only used for options
2941                         if not os.access(filename, os.W_OK):
2942                                 sys.exit('ERROR: no write permissions on %s' % filename)
2943
2944                         downloader.to_screen('Updating to latest stable version...')
2945                         try:
2946                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2947                                 latest_version = urllib.urlopen(latest_url).read().strip()
2948                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2949                                 newcontent = urllib.urlopen(prog_url).read()
2950                         except (IOError, OSError), err:
2951                                 sys.exit('ERROR: unable to download latest version')
2952                         try:
2953                                 stream = open(filename, 'w')
2954                                 stream.write(newcontent)
2955                                 stream.close()
2956                         except (IOError, OSError), err:
2957                                 sys.exit('ERROR: unable to overwrite current version')
2958                         downloader.to_screen('Updated to version %s' % latest_version)
2959
# Build the command line parser. 'resolve' lets the explicit -h/--help and
# -v/--version definitions below replace the ones optparse adds by itself.
parser = optparse.OptionParser(
        usage='Usage: %prog [options] url...',
        version='2011.07.09-phihag',
        conflict_handler='resolve',
)

# General options. Numeric values (retries, playlist bounds) carry no
# type= here, so values given on the command line arrive as strings.
parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
parser.add_option('-R', '--retries',
                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
parser.add_option('--playlist-start',
                dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
parser.add_option('--playlist-end',
                dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
parser.add_option('--dump-user-agent',
                action='store_true', dest='dump_user_agent',
                help='display the current browser identification', default=False)

# Account credentials
authentication = optparse.OptionGroup(parser, 'Authentication Options')
authentication.add_option('-u', '--username',
                dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
                dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
parser.add_option_group(authentication)

# Format selection; --all-formats stores the special format code '-1'
video_format = optparse.OptionGroup(parser, 'Video Format Options')
video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
                action='store_const', dest='format', help='download all available video formats', const='-1')
video_format.add_option('--max-quality',
                action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
parser.add_option_group(video_format)

# Output verbosity and the "print something instead of downloading" modes
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
                action='store_true', dest='getthumbnail',
                help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
                action='store_true', dest='getdescription',
                help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
                action='store_true', dest='getfilename',
                help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--no-progress',
                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
                action='store_true', dest='consoletitle',
                help='display progress in console titlebar', default=False)
parser.add_option_group(verbosity)

# File naming and on-disk behaviour
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
                action='store_true', dest='autonumber',
                help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
# The cookie file is loaded at startup when it exists and saved again after
# the downloads, so the help text mentions both directions.
filesystem.add_option('--cookies',
                dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
filesystem.add_option('--no-part',
                action='store_true', dest='nopart', help='do not use .part files', default=False)
# Inverted switch: updatetime is on unless --no-mtime is given
filesystem.add_option('--no-mtime',
                action='store_false', dest='updatetime',
                help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
                action='store_true', dest='writedescription',
                help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
                action='store_true', dest='writeinfojson',
                help='write video metadata to a .info.json file', default=False)
parser.add_option_group(filesystem)

# Audio extraction performed after the download
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
                help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
                help='"best", "aac" or "mp3"; best by default')
parser.add_option_group(postproc)
3067
3068                 (opts, args) = parser.parse_args()
3069
3070                 # Open appropriate CookieJar
3071                 if opts.cookiefile is None:
3072                         jar = cookielib.CookieJar()
3073                 else:
3074                         try:
3075                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3076                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3077                                         jar.load()
3078                         except (IOError, OSError), err:
3079                                 sys.exit(u'ERROR: unable to open cookie file')
3080
3081                 # Dump user agent
3082                 if opts.dump_user_agent:
3083                         print std_headers['User-Agent']
3084                         sys.exit(0)
3085
3086                 # General configuration
3087                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3088                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3089                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3090
# Collect the URLs listed in the batch file, if one was given ('-' means
# standard input). Blank lines and lines starting with '#', '/' or ';'
# are skipped.
batchurls = []
if opts.batchfile is not None:
        try:
                if opts.batchfile == '-':
                        batchfd = sys.stdin
                else:
                        batchfd = open(opts.batchfile, 'r')
                try:
                        batchurls = [line.strip() for line in batchfd.readlines()]
                finally:
                        # Release the handle opened above; stdin is left open
                        if batchfd is not sys.stdin:
                                batchfd.close()
                batchurls = [url for url in batchurls if url and not re.search(r'^[#/;]', url)]
        except IOError:
                sys.exit(u'ERROR: batch file could not be read')
# Batch file URLs come first, then the ones given on the command line
all_urls = batchurls + args
3105
3106                 # Conflicting, missing and erroneous options
3107                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3108                         parser.error(u'using .netrc conflicts with giving username/password')
3109                 if opts.password is not None and opts.username is None:
3110                         parser.error(u'account username missing')
3111                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3112                         parser.error(u'using output template conflicts with using title, literal title or auto number')
3113                 if opts.usetitle and opts.useliteral:
3114                         parser.error(u'using title conflicts with using literal title')
3115                 if opts.username is not None and opts.password is None:
3116                         opts.password = getpass.getpass(u'Type account password and press return:')
3117                 if opts.ratelimit is not None:
3118                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3119                         if numeric_limit is None:
3120                                 parser.error(u'invalid rate limit specified')
3121                         opts.ratelimit = numeric_limit
3122                 if opts.retries is not None:
3123                         try:
3124                                 opts.retries = long(opts.retries)
3125                         except (TypeError, ValueError), err:
3126                                 parser.error(u'invalid retry count specified')
3127                 try:
3128                         opts.playliststart = long(opts.playliststart)
3129                         if opts.playliststart <= 0:
3130                                 raise ValueError
3131                 except (TypeError, ValueError), err:
3132                         parser.error(u'invalid playlist start number specified')
3133                 try:
3134                         opts.playlistend = long(opts.playlistend)
3135                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3136                                 raise ValueError
3137                 except (TypeError, ValueError), err:
3138                         parser.error(u'invalid playlist end number specified')
3139                 if opts.extractaudio:
3140                         if opts.audioformat not in ['best', 'aac', 'mp3']:
3141                                 parser.error(u'invalid audio format specified')
3142
# Information extractors. Some of them wrap another extractor, so the
# wrapped ones are created first.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
bliptv_ie = BlipTVIE()
generic_ie = GenericIE()

# Any of the --get-* switches implies both quiet and simulate
info_only = (opts.geturl or opts.gettitle or opts.getthumbnail
                or opts.getdescription or opts.getfilename)

# Output filename template: a non-empty -o value wins, otherwise the
# template is derived from the format/title/numbering switches.
output_template = None
if opts.outtmpl is not None:
        output_template = opts.outtmpl.decode(preferredencoding())
if not output_template:
        if opts.format == '-1':
                # All formats: the format code is part of the file name
                if opts.usetitle:
                        output_template = u'%(stitle)s-%(id)s-%(format)s.%(ext)s'
                elif opts.useliteral:
                        output_template = u'%(title)s-%(id)s-%(format)s.%(ext)s'
                else:
                        output_template = u'%(id)s-%(format)s.%(ext)s'
        elif opts.usetitle and opts.autonumber:
                output_template = u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s'
        elif opts.useliteral and opts.autonumber:
                output_template = u'%(autonumber)s-%(title)s-%(id)s.%(ext)s'
        elif opts.usetitle:
                output_template = u'%(stitle)s-%(id)s.%(ext)s'
        elif opts.useliteral:
                output_template = u'%(title)s-%(id)s.%(ext)s'
        elif opts.autonumber:
                output_template = u'%(autonumber)s-%(id)s.%(ext)s'
        else:
                output_template = u'%(id)s.%(ext)s'

# File downloader
fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        'quiet': (opts.quiet or info_only),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'forcethumbnail': opts.getthumbnail,
        'forcedescription': opts.getdescription,
        'forcefilename': opts.getfilename,
        'simulate': (opts.simulate or info_only),
        'format': opts.format,
        'format_limit': opts.format_limit,
        'outtmpl': output_template,
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
        'retries': opts.retries,
        'continuedl': opts.continue_dl,
        'noprogress': opts.noprogress,
        'playliststart': opts.playliststart,
        'playlistend': opts.playlistend,
        'logtostderr': opts.outtmpl == '-',
        'consoletitle': opts.consoletitle,
        'nopart': opts.nopart,
        'updatetime': opts.updatetime,
        'writedescription': opts.writedescription,
        'writeinfojson': opts.writeinfojson,
        })

# Register the extractors in this exact order
for main_extractor in (
                youtube_search_ie, youtube_pl_ie, youtube_user_ie,
                metacafe_ie, dailymotion_ie, youtube_ie,
                google_ie, google_search_ie, photobucket_ie,
                yahoo_ie, yahoo_search_ie, deposit_files_ie,
                facebook_ie, bliptv_ie):
        fd.add_info_extractor(main_extractor)

# The generic extractor must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# PostProcessors
if opts.extractaudio:
        audio_extractor = FFmpegExtractAudioPP(preferredcodec=opts.audioformat)
        fd.add_post_processor(audio_extractor)
3221
3222                 # Update version
3223                 if opts.update_self:
3224                         update_self(fd, sys.argv[0])
3225
3226                 # Maybe do nothing
3227                 if len(all_urls) < 1:
3228                         if not opts.update_self:
3229                                 parser.error(u'you must provide at least one URL')
3230                         else:
3231                                 sys.exit()
3232                 retcode = fd.download(all_urls)
3233
3234                 # Dump cookie jar if requested
3235                 if opts.cookiefile is not None:
3236                         try:
3237                                 jar.save()
3238                         except (IOError, OSError), err:
3239                                 sys.exit(u'ERROR: unable to save cookie jar')
3240
3241                 sys.exit(retcode)
3242
3243         except DownloadError:
3244                 sys.exit(1)
3245         except SameFileError:
3246                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3247         except KeyboardInterrupt:
3248                 sys.exit(u'\nERROR: Interrupted by user')