youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # Author: Vasyl' Vavrychuk
   7 # License: Public domain code
   8 import cookielib
   9 import ctypes
  10 import datetime
  11 import gzip
  12 import htmlentitydefs
  13 import httplib
  14 import locale
  15 import math
  16 import netrc
  17 import os
  18 import os.path
  19 import re
  20 import socket
  21 import string
  22 import StringIO
  23 import subprocess
  24 import sys
  25 import time
  26 import urllib
  27 import urllib2
  28 import zlib
  29
  30 # parse_qs was moved from the cgi module to the urlparse module recently.
  31 try:
  32         from urlparse import parse_qs
  33 except ImportError:
  34         from cgi import parse_qs
  35
  36 std_headers = {
  37         'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
  38         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  39         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  40         'Accept-Encoding': 'gzip, deflate',
  41         'Accept-Language': 'en-us,en;q=0.5',
  42 }
  43
  44 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  45
  46 def preferredencoding():
  47         """Get preferred encoding.
  48
  49         Returns the best encoding scheme for the system, based on
  50         locale.getpreferredencoding() and some further tweaks.
  51         """
  52         def yield_preferredencoding():
  53                 try:
  54                         pref = locale.getpreferredencoding()
  55                         u'TEST'.encode(pref)
  56                 except:
  57                         pref = 'UTF-8'
  58                 while True:
  59                         yield pref
  60         return yield_preferredencoding().next()
  61
  62 def htmlentity_transform(matchobj):
  63         """Transforms an HTML entity to a Unicode character.
  64
  65         This function receives a match object and is intended to be used with
  66         the re.sub() function.
  67         """
  68         entity = matchobj.group(1)
  69
  70         # Known non-numeric HTML entity
  71         if entity in htmlentitydefs.name2codepoint:
  72                 return unichr(htmlentitydefs.name2codepoint[entity])
  73
  74         # Unicode character
  75         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  76         if mobj is not None:
  77                 numstr = mobj.group(1)
  78                 if numstr.startswith(u'x'):
  79                         base = 16
  80                         numstr = u'0%s' % numstr
  81                 else:
  82                         base = 10
  83                 return unichr(long(numstr, base))
  84
  85         # Unknown entity in name, return its literal representation
  86         return (u'&%s;' % entity)
  87
  88 def sanitize_title(utitle):
  89         """Sanitizes a video title so it could be used as part of a filename."""
  90         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  91         return utitle.replace(unicode(os.sep), u'%')
  92
  93 def sanitize_open(filename, open_mode):
  94         """Try to open the given filename, and slightly tweak it if this fails.
  95
  96         Attempts to open the given filename. If this fails, it tries to change
  97         the filename slightly, step by step, until it's either able to open it
  98         or it fails and raises a final exception, like the standard open()
  99         function.
 100
 101         It returns the tuple (stream, definitive_file_name).
 102         """
 103         try:
 104                 if filename == u'-':
 105                         if sys.platform == 'win32':
 106                                 import msvcrt
 107                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 108                         return (sys.stdout, filename)
 109                 stream = open(filename, open_mode)
 110                 return (stream, filename)
 111         except (IOError, OSError), err:
 112                 # In case of error, try to remove win32 forbidden chars
 113                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 114
 115                 # An exception here should be caught in the caller
 116                 stream = open(filename, open_mode)
 117                 return (stream, filename)
 118
 119 class DownloadError(Exception):
 120         """Download Error exception.
 121
 122         This exception may be thrown by FileDownloader objects if they are not
 123         configured to continue on errors. They will contain the appropriate
 124         error message.
 125         """
 126         pass
 127
 128 class SameFileError(Exception):
 129         """Same File exception.
 130
 131         This exception will be thrown by FileDownloader objects if they detect
 132         multiple files would have to be downloaded to the same file on disk.
 133         """
 134         pass
 135
 136 class PostProcessingError(Exception):
 137         """Post Processing exception.
 138
 139         This exception may be raised by PostProcessor's .run() method to
 140         indicate an error in the postprocessing task.
 141         """
 142         pass
 143
 144 class UnavailableVideoError(Exception):
 145         """Unavailable Format exception.
 146
 147         This exception will be thrown when a video is requested
 148         in a format that is not available for that video.
 149         """
 150         pass
 151
 152 class ContentTooShortError(Exception):
 153         """Content Too Short exception.
 154
 155         This exception may be raised by FileDownloader objects when a file they
 156         download is too small for what the server announced first, indicating
 157         the connection was probably interrupted.
 158         """
 159         # Both in bytes
 160         downloaded = None
 161         expected = None
 162
 163         def __init__(self, downloaded, expected):
 164                 self.downloaded = downloaded
 165                 self.expected = expected
 166
 167 class YoutubeDLHandler(urllib2.HTTPHandler):
 168         """Handler for HTTP requests and responses.
 169
 170         This class, when installed with an OpenerDirector, automatically adds
 171         the standard headers to every HTTP request and handles gzipped and
 172         deflated responses from web servers. If compression is to be avoided in
 173         a particular request, the original request in the program code only has
 174         to include the HTTP header "Youtubedl-No-Compression", which will be
 175         removed before making the real request.
 176
 177         Part of this code was copied from:
 178
 179           http://techknack.net/python-urllib2-handlers/
 180
 181         Andrew Rowls, the author of that code, agreed to release it to the
 182         public domain.
 183         """
 184
 185         @staticmethod
 186         def deflate(data):
 187                 try:
 188                         return zlib.decompress(data, -zlib.MAX_WBITS)
 189                 except zlib.error:
 190                         return zlib.decompress(data)
 191
 192         @staticmethod
 193         def addinfourl_wrapper(stream, headers, url, code):
 194                 if hasattr(urllib2.addinfourl, 'getcode'):
 195                         return urllib2.addinfourl(stream, headers, url, code)
 196                 ret = urllib2.addinfourl(stream, headers, url)
 197                 ret.code = code
 198                 return ret
 199
 200         def http_request(self, req):
 201                 for h in std_headers:
 202                         if h in req.headers:
 203                                 del req.headers[h]
 204                         req.add_header(h, std_headers[h])
 205                 if 'Youtubedl-no-compression' in req.headers:
 206                         if 'Accept-encoding' in req.headers:
 207                                 del req.headers['Accept-encoding']
 208                         del req.headers['Youtubedl-no-compression']
 209                 return req
 210
 211         def http_response(self, req, resp):
 212                 old_resp = resp
 213                 # gzip
 214                 if resp.headers.get('Content-encoding', '') == 'gzip':
 215                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 216                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 217                         resp.msg = old_resp.msg
 218                 # deflate
 219                 if resp.headers.get('Content-encoding', '') == 'deflate':
 220                         gz = StringIO.StringIO(self.deflate(resp.read()))
 221                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 222                         resp.msg = old_resp.msg
 223                 return resp
 224
 225 class FileDownloader(object):
 226         """File Downloader class.
 227
 228         File downloader objects are the ones responsible of downloading the
 229         actual video file and writing it to disk if the user has requested
 230         it, among some other tasks. In most cases there should be one per
 231         program. As, given a video URL, the downloader doesn't know how to
 232         extract all the needed information, task that InfoExtractors do, it
 233         has to pass the URL to one of them.
 234
 235         For this, file downloader objects have a method that allows
 236         InfoExtractors to be registered in a given order. When it is passed
 237         a URL, the file downloader handles it to the first InfoExtractor it
 238         finds that reports being able to handle it. The InfoExtractor extracts
 239         all the information about the video or videos the URL refers to, and
 240         asks the FileDownloader to process the video information, possibly
 241         downloading the video.
 242
 243         File downloaders accept a lot of parameters. In order not to saturate
 244         the object constructor with arguments, it receives a dictionary of
 245         options instead. These options are available through the params
 246         attribute for the InfoExtractors to use. The FileDownloader also
 247         registers itself as the downloader in charge for the InfoExtractors
 248         that are added to it, so this is a "mutual registration".
 249
 250         Available options:
 251
 252         username:         Username for authentication purposes.
 253         password:         Password for authentication purposes.
 254         usenetrc:         Use netrc for authentication instead.
 255         quiet:            Do not print messages to stdout.
 256         forceurl:         Force printing final URL.
 257         forcetitle:       Force printing title.
 258         forcethumbnail:   Force printing thumbnail URL.
 259         forcedescription: Force printing description.
 260         simulate:         Do not download the video files.
 261         format:           Video format code.
 262         format_limit:     Highest quality format to try.
 263         outtmpl:          Template for output names.
 264         ignoreerrors:     Do not stop on download errors.
 265         ratelimit:        Download speed limit, in bytes/sec.
 266         nooverwrites:     Prevent overwriting files.
 267         retries:          Number of times to retry for HTTP error 5xx
 268         continuedl:       Try to continue downloads if possible.
 269         noprogress:       Do not print the progress bar.
 270         playliststart:    Playlist item to start at.
 271         playlistend:      Playlist item to end at.
 272         logtostderr:      Log messages to stderr instead of stdout.
 273         consoletitle:     Display progress in console window's titlebar.
 274         nopart:           Do not use temporary .part files.
 275         """
 276
 277         params = None
 278         _ies = []
 279         _pps = []
 280         _download_retcode = None
 281         _num_downloads = None
 282         _screen_file = None
 283
 284         def __init__(self, params):
 285                 """Create a FileDownloader object with the given options."""
 286                 self._ies = []
 287                 self._pps = []
 288                 self._download_retcode = 0
 289                 self._num_downloads = 0
 290                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 291                 self.params = params
 292
 293         @staticmethod
 294         def pmkdir(filename):
 295                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 296                 components = filename.split(os.sep)
 297                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 298                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 299                 for dir in aggregate:
 300                         if not os.path.exists(dir):
 301                                 os.mkdir(dir)
 302
 303         @staticmethod
 304         def format_bytes(bytes):
 305                 if bytes is None:
 306                         return 'N/A'
 307                 if type(bytes) is str:
 308                         bytes = float(bytes)
 309                 if bytes == 0.0:
 310                         exponent = 0
 311                 else:
 312                         exponent = long(math.log(bytes, 1024.0))
 313                 suffix = 'bkMGTPEZY'[exponent]
 314                 converted = float(bytes) / float(1024**exponent)
 315                 return '%.2f%s' % (converted, suffix)
 316
 317         @staticmethod
 318         def calc_percent(byte_counter, data_len):
 319                 if data_len is None:
 320                         return '---.-%'
 321                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 322
 323         @staticmethod
 324         def calc_eta(start, now, total, current):
 325                 if total is None:
 326                         return '--:--'
 327                 dif = now - start
 328                 if current == 0 or dif < 0.001: # One millisecond
 329                         return '--:--'
 330                 rate = float(current) / dif
 331                 eta = long((float(total) - float(current)) / rate)
 332                 (eta_mins, eta_secs) = divmod(eta, 60)
 333                 if eta_mins > 99:
 334                         return '--:--'
 335                 return '%02d:%02d' % (eta_mins, eta_secs)
 336
 337         @staticmethod
 338         def calc_speed(start, now, bytes):
 339                 dif = now - start
 340                 if bytes == 0 or dif < 0.001: # One millisecond
 341                         return '%10s' % '---b/s'
 342                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 343
 344         @staticmethod
 345         def best_block_size(elapsed_time, bytes):
 346                 new_min = max(bytes / 2.0, 1.0)
 347                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 348                 if elapsed_time < 0.001:
 349                         return long(new_max)
 350                 rate = bytes / elapsed_time
 351                 if rate > new_max:
 352                         return long(new_max)
 353                 if rate < new_min:
 354                         return long(new_min)
 355                 return long(rate)
 356
 357         @staticmethod
 358         def parse_bytes(bytestr):
 359                 """Parse a string indicating a byte quantity into a long integer."""
 360                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 361                 if matchobj is None:
 362                         return None
 363                 number = float(matchobj.group(1))
 364                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 365                 return long(round(number * multiplier))
 366
 367         def add_info_extractor(self, ie):
 368                 """Add an InfoExtractor object to the end of the list."""
 369                 self._ies.append(ie)
 370                 ie.set_downloader(self)
 371
 372         def add_post_processor(self, pp):
 373                 """Add a PostProcessor object to the end of the chain."""
 374                 self._pps.append(pp)
 375                 pp.set_downloader(self)
 376
 377         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 378                 """Print message to stdout if not in quiet mode."""
 379                 try:
 380                         if not self.params.get('quiet', False):
 381                                 terminator = [u'\n', u''][skip_eol]
 382                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 383                         self._screen_file.flush()
 384                 except (UnicodeEncodeError), err:
 385                         if not ignore_encoding_errors:
 386                                 raise
 387
 388         def to_stderr(self, message):
 389                 """Print message to stderr."""
 390                 print >>sys.stderr, message.encode(preferredencoding())
 391
 392         def to_cons_title(self, message):
 393                 """Set console/terminal window title to message."""
 394                 if not self.params.get('consoletitle', False):
 395                         return
 396                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 397                         # c_wchar_p() might not be necessary if `message` is
 398                         # already of type unicode()
 399                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 400                 elif 'TERM' in os.environ:
 401                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 402
 403         def fixed_template(self):
 404                 """Checks if the output template is fixed."""
 405                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 406
 407         def trouble(self, message=None):
 408                 """Determine action to take when a download problem appears.
 409
 410                 Depending on if the downloader has been configured to ignore
 411                 download errors or not, this method may throw an exception or
 412                 not when errors are found, after printing the message.
 413                 """
 414                 if message is not None:
 415                         self.to_stderr(message)
 416                 if not self.params.get('ignoreerrors', False):
 417                         raise DownloadError(message)
 418                 self._download_retcode = 1
 419
 420         def slow_down(self, start_time, byte_counter):
 421                 """Sleep if the download speed is over the rate limit."""
 422                 rate_limit = self.params.get('ratelimit', None)
 423                 if rate_limit is None or byte_counter == 0:
 424                         return
 425                 now = time.time()
 426                 elapsed = now - start_time
 427                 if elapsed <= 0.0:
 428                         return
 429                 speed = float(byte_counter) / elapsed
 430                 if speed > rate_limit:
 431                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 432
 433         def temp_name(self, filename):
 434                 """Returns a temporary filename for the given filename."""
 435                 if self.params.get('nopart', False) or filename == u'-' or \
 436                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 437                         return filename
 438                 return filename + u'.part'
 439
 440         def undo_temp_name(self, filename):
 441                 if filename.endswith(u'.part'):
 442                         return filename[:-len(u'.part')]
 443                 return filename
 444
 445         def try_rename(self, old_filename, new_filename):
 446                 try:
 447                         if old_filename == new_filename:
 448                                 return
 449                         os.rename(old_filename, new_filename)
 450                 except (IOError, OSError), err:
 451                         self.trouble(u'ERROR: unable to rename file')
 452
 453         def report_destination(self, filename):
 454                 """Report destination filename."""
 455                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 456
 457         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 458                 """Report download progress."""
 459                 if self.params.get('noprogress', False):
 460                         return
 461                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 462                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 463                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 464                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 465
 466         def report_resuming_byte(self, resume_len):
 467                 """Report attempt to resume at given byte."""
 468                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 469
 470         def report_retry(self, count, retries):
 471                 """Report retry in case of HTTP error 5xx"""
 472                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 473
 474         def report_file_already_downloaded(self, file_name):
 475                 """Report file has already been fully downloaded."""
 476                 try:
 477                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 478                 except (UnicodeEncodeError), err:
 479                         self.to_screen(u'[download] The file has already been downloaded')
 480
 481         def report_unable_to_resume(self):
 482                 """Report it was impossible to resume download."""
 483                 self.to_screen(u'[download] Unable to resume')
 484
 485         def report_finish(self):
 486                 """Report download finished."""
 487                 if self.params.get('noprogress', False):
 488                         self.to_screen(u'[download] Download completed')
 489                 else:
 490                         self.to_screen(u'')
 491
 492         def increment_downloads(self):
 493                 """Increment the ordinal that assigns a number to each file."""
 494                 self._num_downloads += 1
 495
 496         def process_info(self, info_dict):
 497                 """Process a single dictionary returned by an InfoExtractor."""
 498                 # Do nothing else if in simulate mode
 499                 if self.params.get('simulate', False):
 500                         # Forced printings
 501                         if self.params.get('forcetitle', False):
 502                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 503                         if self.params.get('forceurl', False):
 504                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 505                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 506                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 507                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 508                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 509
 510                         return
 511
 512                 try:
 513                         template_dict = dict(info_dict)
 514                         template_dict['epoch'] = unicode(long(time.time()))
 515                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 516                         filename = self.params['outtmpl'] % template_dict
 517                 except (ValueError, KeyError), err:
 518                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 519                         return
 520                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 521                         self.to_stderr(u'WARNING: file exists and will be skipped')
 522                         return
 523
 524                 try:
 525                         self.pmkdir(filename)
 526                 except (OSError, IOError), err:
 527                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
 528                         return
 529
 530                 try:
 531                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 532                 except (OSError, IOError), err:
 533                         raise UnavailableVideoError
 534                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 535                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 536                         return
 537                 except (ContentTooShortError, ), err:
 538                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 539                         return
 540
 541                 if success:
 542                         try:
 543                                 self.post_process(filename, info_dict)
 544                         except (PostProcessingError), err:
 545                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
 546                                 return
 547
 548         def download(self, url_list):
 549                 """Download a given list of URLs."""
 550                 if len(url_list) > 1 and self.fixed_template():
 551                         raise SameFileError(self.params['outtmpl'])
 552
 553                 for url in url_list:
 554                         suitable_found = False
 555                         for ie in self._ies:
 556                                 # Go to next InfoExtractor if not suitable
 557                                 if not ie.suitable(url):
 558                                         continue
 559
 560                                 # Suitable InfoExtractor found
 561                                 suitable_found = True
 562
 563                                 # Extract information from URL and process it
 564                                 ie.extract(url)
 565
 566                                 # Suitable InfoExtractor had been found; go to next URL
 567                                 break
 568
 569                         if not suitable_found:
 570                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 571
 572                 return self._download_retcode
 573
 574         def post_process(self, filename, ie_info):
 575                 """Run the postprocessing chain on the given file."""
 576                 info = dict(ie_info)
 577                 info['filepath'] = filename
 578                 for pp in self._pps:
 579                         info = pp.run(info)
 580                         if info is None:
 581                                 break
 582
 583         def _download_with_rtmpdump(self, filename, url, player_url):
 584                 self.report_destination(filename)
 585                 tmpfilename = self.temp_name(filename)
 586
 587                 # Check for rtmpdump first
 588                 try:
 589                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 590                 except (OSError, IOError):
 591                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 592                         return False
 593
 594                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 595                 # the connection was interrumpted and resuming appears to be
 596                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 597                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 598                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 599                 while retval == 2 or retval == 1:
 600                         prevsize = os.path.getsize(tmpfilename)
 601                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 602                         time.sleep(5.0) # This seems to be needed
 603                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 604                         cursize = os.path.getsize(tmpfilename)
 605                         if prevsize == cursize and retval == 1:
 606                                 break
 607                 if retval == 0:
 608                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 609                         self.try_rename(tmpfilename, filename)
 610                         return True
 611                 else:
 612                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 613                         return False
 614
 615         def _do_download(self, filename, url, player_url):
 616                 # Check file already present
 617                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 618                         self.report_file_already_downloaded(filename)
 619                         return True
 620
 621                 # Attempt to download using rtmpdump
 622                 if url.startswith('rtmp'):
 623                         return self._download_with_rtmpdump(filename, url, player_url)
 624
 625                 tmpfilename = self.temp_name(filename)
 626                 stream = None
 627                 open_mode = 'wb'
 628
 629                 # Do not include the Accept-Encoding header
 630                 headers = {'Youtubedl-no-compression': 'True'}
 631                 basic_request = urllib2.Request(url, None, headers)
 632                 request = urllib2.Request(url, None, headers)
 633
 634                 # Establish possible resume length
 635                 if os.path.isfile(tmpfilename):
 636                         resume_len = os.path.getsize(tmpfilename)
 637                 else:
 638                         resume_len = 0
 639
 640                 # Request parameters in case of being able to resume
 641                 if self.params.get('continuedl', False) and resume_len != 0:
 642                         self.report_resuming_byte(resume_len)
 643                         request.add_header('Range','bytes=%d-' % resume_len)
 644                         open_mode = 'ab'
 645
 646                 count = 0
 647                 retries = self.params.get('retries', 0)
 648                 while count <= retries:
 649                         # Establish connection
 650                         try:
 651                                 data = urllib2.urlopen(request)
 652                                 break
 653                         except (urllib2.HTTPError, ), err:
 654                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 655                                         # Unexpected HTTP error
 656                                         raise
 657                                 elif err.code == 416:
 658                                         # Unable to resume (requested range not satisfiable)
 659                                         try:
 660                                                 # Open the connection again without the range header
 661                                                 data = urllib2.urlopen(basic_request)
 662                                                 content_length = data.info()['Content-Length']
 663                                         except (urllib2.HTTPError, ), err:
 664                                                 if err.code < 500 or err.code >= 600:
 665                                                         raise
 666                                         else:
 667                                                 # Examine the reported length
 668                                                 if (content_length is not None and
 669                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
 670                                                         # The file had already been fully downloaded.
 671                                                         # Explanation to the above condition: in issue #175 it was revealed that
 672                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 673                                                         # changing the file size slightly and causing problems for some users. So
 674                                                         # I decided to implement a suggested change and consider the file
 675                                                         # completely downloaded if the file size differs less than 100 bytes from
 676                                                         # the one in the hard drive.
 677                                                         self.report_file_already_downloaded(filename)
 678                                                         self.try_rename(tmpfilename, filename)
 679                                                         return True
 680                                                 else:
 681                                                         # The length does not match, we start the download over
 682                                                         self.report_unable_to_resume()
 683                                                         open_mode = 'wb'
 684                                                         break
 685                         # Retry
 686                         count += 1
 687                         if count <= retries:
 688                                 self.report_retry(count, retries)
 689
 690                 if count > retries:
 691                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 692                         return False
 693
 694                 data_len = data.info().get('Content-length', None)
 695                 if data_len is not None:
 696                         data_len = long(data_len) + resume_len
 697                 data_len_str = self.format_bytes(data_len)
 698                 byte_counter = 0 + resume_len
 699                 block_size = 1024
 700                 start = time.time()
 701                 while True:
 702                         # Download and write
 703                         before = time.time()
 704                         data_block = data.read(block_size)
 705                         after = time.time()
 706                         if len(data_block) == 0:
 707                                 break
 708                         byte_counter += len(data_block)
 709
 710                         # Open file just in time
 711                         if stream is None:
 712                                 try:
 713                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 714                                         filename = self.undo_temp_name(tmpfilename)
 715                                         self.report_destination(filename)
 716                                 except (OSError, IOError), err:
 717                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 718                                         return False
 719                         try:
 720                                 stream.write(data_block)
 721                         except (IOError, OSError), err:
 722                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 723                                 return False
 724                         block_size = self.best_block_size(after - before, len(data_block))
 725
 726                         # Progress message
 727                         percent_str = self.calc_percent(byte_counter, data_len)
 728                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 729                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 730                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 731
 732                         # Apply rate limit
 733                         self.slow_down(start, byte_counter - resume_len)
 734
 735                 stream.close()
 736                 self.report_finish()
 737                 if data_len is not None and byte_counter != data_len:
 738                         raise ContentTooShortError(byte_counter, long(data_len))
 739                 self.try_rename(tmpfilename, filename)
 740                 return True
 741
 742 class InfoExtractor(object):
 743         """Information Extractor class.
 744
 745         Information extractors are the classes that, given a URL, extract
 746         information from the video (or videos) the URL refers to. This
 747         information includes the real video URL, the video title and simplified
 748         title, author and others. The information is stored in a dictionary
 749         which is then passed to the FileDownloader. The FileDownloader
 750         processes this information possibly downloading the video to the file
 751         system, among other possible outcomes. The dictionaries must include
 752         the following fields:
 753
 754         id:             Video identifier.
 755         url:            Final video URL.
 756         uploader:       Nickname of the video uploader.
 757         title:          Literal title.
 758         stitle:         Simplified title.
 759         ext:            Video filename extension.
 760         format:         Video format.
 761         player_url:     SWF Player URL (may be None).
 762
 763         The following fields are optional. Their primary purpose is to allow
 764         youtube-dl to serve as the backend for a video search function, such
 765         as the one in youtube2mp3.  They are only used when their respective
 766         forced printing functions are called:
 767
 768         thumbnail:      Full URL to a video thumbnail image.
 769         description:    One-line video description.
 770
 771         Subclasses of this one should re-define the _real_initialize() and
 772         _real_extract() methods, as well as the suitable() static method.
 773         Probably, they should also be instantiated and added to the main
 774         downloader.
 775         """
 776
 777         _ready = False
 778         _downloader = None
 779
 780         def __init__(self, downloader=None):
 781                 """Constructor. Receives an optional downloader."""
 782                 self._ready = False
 783                 self.set_downloader(downloader)
 784
 785         @staticmethod
 786         def suitable(url):
 787                 """Receives a URL and returns True if suitable for this IE."""
 788                 return False
 789
 790         def initialize(self):
 791                 """Initializes an instance (authentication, etc)."""
 792                 if not self._ready:
 793                         self._real_initialize()
 794                         self._ready = True
 795
 796         def extract(self, url):
 797                 """Extracts URL information and returns it in list of dicts."""
 798                 self.initialize()
 799                 return self._real_extract(url)
 800
 801         def set_downloader(self, downloader):
 802                 """Sets the downloader for this IE."""
 803                 self._downloader = downloader
 804
 805         def _real_initialize(self):
 806                 """Real initialization process. Redefine in subclasses."""
 807                 pass
 808
 809         def _real_extract(self, url):
 810                 """Real extraction process. Redefine in subclasses."""
 811                 pass
 812
 813 class YoutubeIE(InfoExtractor):
 814         """Information extractor for youtube.com."""
 815
 816         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 817         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 818         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 819         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 820         _NETRC_MACHINE = 'youtube'
 821         # Listed in order of quality
 822         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
 823         _video_extensions = {
 824                 '13': '3gp',
 825                 '17': 'mp4',
 826                 '18': 'mp4',
 827                 '22': 'mp4',
 828                 '37': 'mp4',
 829                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 830                 '43': 'webm',
 831                 '45': 'webm',
 832         }
 833
 834         @staticmethod
 835         def suitable(url):
 836                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 837
 838         def report_lang(self):
 839                 """Report attempt to set language."""
 840                 self._downloader.to_screen(u'[youtube] Setting language')
 841
 842         def report_login(self):
 843                 """Report attempt to log in."""
 844                 self._downloader.to_screen(u'[youtube] Logging in')
 845
 846         def report_age_confirmation(self):
 847                 """Report attempt to confirm age."""
 848                 self._downloader.to_screen(u'[youtube] Confirming age')
 849
 850         def report_video_webpage_download(self, video_id):
 851                 """Report attempt to download video webpage."""
 852                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 853
 854         def report_video_info_webpage_download(self, video_id):
 855                 """Report attempt to download video info webpage."""
 856                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 857
 858         def report_information_extraction(self, video_id):
 859                 """Report attempt to extract video information."""
 860                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 861
 862         def report_unavailable_format(self, video_id, format):
 863                 """Report extracted video URL."""
 864                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 865
 866         def report_rtmp_download(self):
 867                 """Indicate the download will use the RTMP protocol."""
 868                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 869
 870         def _real_initialize(self):
 871                 if self._downloader is None:
 872                         return
 873
 874                 username = None
 875                 password = None
 876                 downloader_params = self._downloader.params
 877
 878                 # Attempt to use provided username and password or .netrc data
 879                 if downloader_params.get('username', None) is not None:
 880                         username = downloader_params['username']
 881                         password = downloader_params['password']
 882                 elif downloader_params.get('usenetrc', False):
 883                         try:
 884                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 885                                 if info is not None:
 886                                         username = info[0]
 887                                         password = info[2]
 888                                 else:
 889                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 890                         except (IOError, netrc.NetrcParseError), err:
 891                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 892                                 return
 893
 894                 # Set language
 895                 request = urllib2.Request(self._LANG_URL)
 896                 try:
 897                         self.report_lang()
 898                         urllib2.urlopen(request).read()
 899                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 900                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 901                         return
 902
 903                 # No authentication to be performed
 904                 if username is None:
 905                         return
 906
 907                 # Log in
 908                 login_form = {
 909                                 'current_form': 'loginForm',
 910                                 'next':         '/',
 911                                 'action_login': 'Log In',
 912                                 'username':     username,
 913                                 'password':     password,
 914                                 }
 915                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 916                 try:
 917                         self.report_login()
 918                         login_results = urllib2.urlopen(request).read()
 919                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 920                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 921                                 return
 922                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 923                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 924                         return
 925
 926                 # Confirm age
 927                 age_form = {
 928                                 'next_url':             '/',
 929                                 'action_confirm':       'Confirm',
 930                                 }
 931                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 932                 try:
 933                         self.report_age_confirmation()
 934                         age_results = urllib2.urlopen(request).read()
 935                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 936                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 937                         return
 938
 939         def _real_extract(self, url):
 940                 # Extract video id from URL
 941                 mobj = re.match(self._VALID_URL, url)
 942                 if mobj is None:
 943                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 944                         return
 945                 video_id = mobj.group(2)
 946
 947                 # Get video webpage
 948                 self.report_video_webpage_download(video_id)
 949                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
 950                 try:
 951                         video_webpage = urllib2.urlopen(request).read()
 952                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 953                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 954                         return
 955
 956                 # Attempt to extract SWF player URL
 957                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 958                 if mobj is not None:
 959                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 960                 else:
 961                         player_url = None
 962
 963                 # Get video info
 964                 self.report_video_info_webpage_download(video_id)
 965                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 966                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 967                                            % (video_id, el_type))
 968                         request = urllib2.Request(video_info_url)
 969                         try:
 970                                 video_info_webpage = urllib2.urlopen(request).read()
 971                                 video_info = parse_qs(video_info_webpage)
 972                                 if 'token' in video_info:
 973                                         break
 974                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 975                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 976                                 return
 977                 if 'token' not in video_info:
 978                         if 'reason' in video_info:
 979                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 980                         else:
 981                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 982                         return
 983
 984                 # Start extracting information
 985                 self.report_information_extraction(video_id)
 986
 987                 # uploader
 988                 if 'author' not in video_info:
 989                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 990                         return
 991                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 992
 993                 # title
 994                 if 'title' not in video_info:
 995                         self._downloader.trouble(u'ERROR: unable to extract video title')
 996                         return
 997                 video_title = urllib.unquote_plus(video_info['title'][0])
 998                 video_title = video_title.decode('utf-8')
 999                 video_title = sanitize_title(video_title)
1000
1001                 # simplified title
1002                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1003                 simple_title = simple_title.strip(ur'_')
1004
1005                 # thumbnail image
1006                 if 'thumbnail_url' not in video_info:
1007                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1008                         video_thumbnail = ''
1009                 else:   # don't panic if we can't find it
1010                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1011
1012                 # upload date
1013                 upload_date = u'NA'
1014                 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
1015                 if mobj is not None:
1016                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1017                         format_expressions = ['%d %B %Y', '%B %d %Y']
1018                         for expression in format_expressions:
1019                                 try:
1020                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1021                                 except:
1022                                         pass
1023
1024                 # description
1025                 video_description = 'No description available.'
1026                 if self._downloader.params.get('forcedescription', False):
1027                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1028                         if mobj is not None:
1029                                 video_description = mobj.group(1)
1030
1031                 # token
1032                 video_token = urllib.unquote_plus(video_info['token'][0])
1033
1034                 # Decide which formats to download
1035                 req_format = self._downloader.params.get('format', None)
1036
1037                 if 'fmt_url_map' in video_info:
1038                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1039                         format_limit = self._downloader.params.get('format_limit', None)
1040                         if format_limit is not None and format_limit in self._available_formats:
1041                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1042                         else:
1043                                 format_list = self._available_formats
1044                         existing_formats = [x for x in format_list if x in url_map]
1045                         if len(existing_formats) == 0:
1046                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1047                                 return
1048                         if req_format is None:
1049                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1050                         elif req_format == '-1':
1051                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1052                         else:
1053                                 # Specific format
1054                                 if req_format not in url_map:
1055                                         self._downloader.trouble(u'ERROR: requested format not available')
1056                                         return
1057                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1058
1059                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1060                         self.report_rtmp_download()
1061                         video_url_list = [(None, video_info['conn'][0])]
1062
1063                 else:
1064                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1065                         return
1066
1067                 for format_param, video_real_url in video_url_list:
1068                         # At this point we have a new video
1069                         self._downloader.increment_downloads()
1070
1071                         # Extension
1072                         video_extension = self._video_extensions.get(format_param, 'flv')
1073
1074                         # Find the video URL in fmt_url_map or conn paramters
1075                         try:
1076                                 # Process video information
1077                                 self._downloader.process_info({
1078                                         'id':           video_id.decode('utf-8'),
1079                                         'url':          video_real_url.decode('utf-8'),
1080                                         'uploader':     video_uploader.decode('utf-8'),
1081                                         'upload_date':  upload_date,
1082                                         'title':        video_title,
1083                                         'stitle':       simple_title,
1084                                         'ext':          video_extension.decode('utf-8'),
1085                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1086                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1087                                         'description':  video_description.decode('utf-8'),
1088                                         'player_url':   player_url,
1089                                 })
1090                         except UnavailableVideoError, err:
1091                                 self._downloader.trouble(u'\nERROR: unable to download video')
1092
1093
1094 class MetacafeIE(InfoExtractor):
1095         """Information Extractor for metacafe.com."""
1096
1097         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1098         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1099         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1100         _youtube_ie = None
1101
1102         def __init__(self, youtube_ie, downloader=None):
1103                 InfoExtractor.__init__(self, downloader)
1104                 self._youtube_ie = youtube_ie
1105
1106         @staticmethod
1107         def suitable(url):
1108                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1109
1110         def report_disclaimer(self):
1111                 """Report disclaimer retrieval."""
1112                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1113
1114         def report_age_confirmation(self):
1115                 """Report attempt to confirm age."""
1116                 self._downloader.to_screen(u'[metacafe] Confirming age')
1117
1118         def report_download_webpage(self, video_id):
1119                 """Report webpage download."""
1120                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1121
1122         def report_extraction(self, video_id):
1123                 """Report information extraction."""
1124                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1125
1126         def _real_initialize(self):
1127                 # Retrieve disclaimer
1128                 request = urllib2.Request(self._DISCLAIMER)
1129                 try:
1130                         self.report_disclaimer()
1131                         disclaimer = urllib2.urlopen(request).read()
1132                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1133                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1134                         return
1135
1136                 # Confirm age
1137                 disclaimer_form = {
1138                         'filters': '0',
1139                         'submit': "Continue - I'm over 18",
1140                         }
1141                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1142                 try:
1143                         self.report_age_confirmation()
1144                         disclaimer = urllib2.urlopen(request).read()
1145                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1146                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1147                         return
1148
1149         def _real_extract(self, url):
1150                 # Extract id and simplified title from URL
1151                 mobj = re.match(self._VALID_URL, url)
1152                 if mobj is None:
1153                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1154                         return
1155
1156                 video_id = mobj.group(1)
1157
1158                 # Check if video comes from YouTube
1159                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1160                 if mobj2 is not None:
1161                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1162                         return
1163
1164                 # At this point we have a new video
1165                 self._downloader.increment_downloads()
1166
1167                 simple_title = mobj.group(2).decode('utf-8')
1168
1169                 # Retrieve video webpage to extract further information
1170                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1171                 try:
1172                         self.report_download_webpage(video_id)
1173                         webpage = urllib2.urlopen(request).read()
1174                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1175                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1176                         return
1177
1178                 # Extract URL, uploader and title from webpage
1179                 self.report_extraction(video_id)
1180                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1181                 if mobj is not None:
1182                         mediaURL = urllib.unquote(mobj.group(1))
1183                         video_extension = mediaURL[-3:]
1184
1185                         # Extract gdaKey if available
1186                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1187                         if mobj is None:
1188                                 video_url = mediaURL
1189                         else:
1190                                 gdaKey = mobj.group(1)
1191                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1192                 else:
1193                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1194                         if mobj is None:
1195                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1196                                 return
1197                         vardict = parse_qs(mobj.group(1))
1198                         if 'mediaData' not in vardict:
1199                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1200                                 return
1201                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1202                         if mobj is None:
1203                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1204                                 return
1205                         mediaURL = mobj.group(1).replace('\\/', '/')
1206                         video_extension = mediaURL[-3:]
1207                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1208
1209                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1210                 if mobj is None:
1211                         self._downloader.trouble(u'ERROR: unable to extract title')
1212                         return
1213                 video_title = mobj.group(1).decode('utf-8')
1214                 video_title = sanitize_title(video_title)
1215
1216                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1217                 if mobj is None:
1218                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1219                         return
1220                 video_uploader = mobj.group(1)
1221
1222                 try:
1223                         # Process video information
1224                         self._downloader.process_info({
1225                                 'id':           video_id.decode('utf-8'),
1226                                 'url':          video_url.decode('utf-8'),
1227                                 'uploader':     video_uploader.decode('utf-8'),
1228                                 'upload_date':  u'NA',
1229                                 'title':        video_title,
1230                                 'stitle':       simple_title,
1231                                 'ext':          video_extension.decode('utf-8'),
1232                                 'format':       u'NA',
1233                                 'player_url':   None,
1234                         })
1235                 except UnavailableVideoError:
1236                         self._downloader.trouble(u'\nERROR: unable to download video')
1237
1238
1239 class DailymotionIE(InfoExtractor):
1240         """Information Extractor for Dailymotion"""
1241
1242         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1243
1244         def __init__(self, downloader=None):
1245                 InfoExtractor.__init__(self, downloader)
1246
1247         @staticmethod
1248         def suitable(url):
1249                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1250
1251         def report_download_webpage(self, video_id):
1252                 """Report webpage download."""
1253                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1254
1255         def report_extraction(self, video_id):
1256                 """Report information extraction."""
1257                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1258
1259         def _real_initialize(self):
1260                 return
1261
1262         def _real_extract(self, url):
1263                 # Extract id and simplified title from URL
1264                 mobj = re.match(self._VALID_URL, url)
1265                 if mobj is None:
1266                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1267                         return
1268
1269                 # At this point we have a new video
1270                 self._downloader.increment_downloads()
1271                 video_id = mobj.group(1)
1272
1273                 simple_title = mobj.group(2).decode('utf-8')
1274                 video_extension = 'flv'
1275
1276                 # Retrieve video webpage to extract further information
1277                 request = urllib2.Request(url)
1278                 try:
1279                         self.report_download_webpage(video_id)
1280                         webpage = urllib2.urlopen(request).read()
1281                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1282                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1283                         return
1284
1285                 # Extract URL, uploader and title from webpage
1286                 self.report_extraction(video_id)
1287                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1288                 if mobj is None:
1289                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1290                         return
1291                 mediaURL = urllib.unquote(mobj.group(1))
1292
1293                 # if needed add http://www.dailymotion.com/ if relative URL
1294
1295                 video_url = mediaURL
1296
1297                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1298                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1299                 if mobj is None:
1300                         self._downloader.trouble(u'ERROR: unable to extract title')
1301                         return
1302                 video_title = mobj.group(1).decode('utf-8')
1303                 video_title = sanitize_title(video_title)
1304
1305                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1306                 if mobj is None:
1307                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1308                         return
1309                 video_uploader = mobj.group(1)
1310
1311                 try:
1312                         # Process video information
1313                         self._downloader.process_info({
1314                                 'id':           video_id.decode('utf-8'),
1315                                 'url':          video_url.decode('utf-8'),
1316                                 'uploader':     video_uploader.decode('utf-8'),
1317                                 'upload_date':  u'NA',
1318                                 'title':        video_title,
1319                                 'stitle':       simple_title,
1320                                 'ext':          video_extension.decode('utf-8'),
1321                                 'format':       u'NA',
1322                                 'player_url':   None,
1323                         })
1324                 except UnavailableVideoError:
1325                         self._downloader.trouble(u'\nERROR: unable to download video')
1326
1327 class GoogleIE(InfoExtractor):
1328         """Information extractor for video.google.com."""
1329
1330         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1331
1332         def __init__(self, downloader=None):
1333                 InfoExtractor.__init__(self, downloader)
1334
1335         @staticmethod
1336         def suitable(url):
1337                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1338
1339         def report_download_webpage(self, video_id):
1340                 """Report webpage download."""
1341                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1342
1343         def report_extraction(self, video_id):
1344                 """Report information extraction."""
1345                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1346
1347         def _real_initialize(self):
1348                 return
1349
1350         def _real_extract(self, url):
1351                 # Extract id from URL
1352                 mobj = re.match(self._VALID_URL, url)
1353                 if mobj is None:
1354                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1355                         return
1356
1357                 # At this point we have a new video
1358                 self._downloader.increment_downloads()
1359                 video_id = mobj.group(1)
1360
1361                 video_extension = 'mp4'
1362
1363                 # Retrieve video webpage to extract further information
1364                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1365                 try:
1366                         self.report_download_webpage(video_id)
1367                         webpage = urllib2.urlopen(request).read()
1368                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1369                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1370                         return
1371
1372                 # Extract URL, uploader, and title from webpage
1373                 self.report_extraction(video_id)
1374                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1375                 if mobj is None:
1376                         video_extension = 'flv'
1377                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1378                 if mobj is None:
1379                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1380                         return
1381                 mediaURL = urllib.unquote(mobj.group(1))
1382                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1383                 mediaURL = mediaURL.replace('\\x26', '\x26')
1384
1385                 video_url = mediaURL
1386
1387                 mobj = re.search(r'<title>(.*)</title>', webpage)
1388                 if mobj is None:
1389                         self._downloader.trouble(u'ERROR: unable to extract title')
1390                         return
1391                 video_title = mobj.group(1).decode('utf-8')
1392                 video_title = sanitize_title(video_title)
1393                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1394
1395                 # Extract video description
1396                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1397                 if mobj is None:
1398                         self._downloader.trouble(u'ERROR: unable to extract video description')
1399                         return
1400                 video_description = mobj.group(1).decode('utf-8')
1401                 if not video_description:
1402                         video_description = 'No description available.'
1403
1404                 # Extract video thumbnail
1405                 if self._downloader.params.get('forcethumbnail', False):
1406                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1407                         try:
1408                                 webpage = urllib2.urlopen(request).read()
1409                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1410                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1411                                 return
1412                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1413                         if mobj is None:
1414                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1415                                 return
1416                         video_thumbnail = mobj.group(1)
1417                 else:   # we need something to pass to process_info
1418                         video_thumbnail = ''
1419
1420
1421                 try:
1422                         # Process video information
1423                         self._downloader.process_info({
1424                                 'id':           video_id.decode('utf-8'),
1425                                 'url':          video_url.decode('utf-8'),
1426                                 'uploader':     u'NA',
1427                                 'upload_date':  u'NA',
1428                                 'title':        video_title,
1429                                 'stitle':       simple_title,
1430                                 'ext':          video_extension.decode('utf-8'),
1431                                 'format':       u'NA',
1432                                 'player_url':   None,
1433                         })
1434                 except UnavailableVideoError:
1435                         self._downloader.trouble(u'\nERROR: unable to download video')
1436
1437
1438 class PhotobucketIE(InfoExtractor):
1439         """Information extractor for photobucket.com."""
1440
1441         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1442
1443         def __init__(self, downloader=None):
1444                 InfoExtractor.__init__(self, downloader)
1445
1446         @staticmethod
1447         def suitable(url):
1448                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1449
1450         def report_download_webpage(self, video_id):
1451                 """Report webpage download."""
1452                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1453
1454         def report_extraction(self, video_id):
1455                 """Report information extraction."""
1456                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1457
1458         def _real_initialize(self):
1459                 return
1460
1461         def _real_extract(self, url):
1462                 # Extract id from URL
1463                 mobj = re.match(self._VALID_URL, url)
1464                 if mobj is None:
1465                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1466                         return
1467
1468                 # At this point we have a new video
1469                 self._downloader.increment_downloads()
1470                 video_id = mobj.group(1)
1471
1472                 video_extension = 'flv'
1473
1474                 # Retrieve video webpage to extract further information
1475                 request = urllib2.Request(url)
1476                 try:
1477                         self.report_download_webpage(video_id)
1478                         webpage = urllib2.urlopen(request).read()
1479                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1480                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1481                         return
1482
1483                 # Extract URL, uploader, and title from webpage
1484                 self.report_extraction(video_id)
1485                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1486                 if mobj is None:
1487                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1488                         return
1489                 mediaURL = urllib.unquote(mobj.group(1))
1490
1491                 video_url = mediaURL
1492
1493                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1494                 if mobj is None:
1495                         self._downloader.trouble(u'ERROR: unable to extract title')
1496                         return
1497                 video_title = mobj.group(1).decode('utf-8')
1498                 video_title = sanitize_title(video_title)
1499                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1500
1501                 video_uploader = mobj.group(2).decode('utf-8')
1502
1503                 try:
1504                         # Process video information
1505                         self._downloader.process_info({
1506                                 'id':           video_id.decode('utf-8'),
1507                                 'url':          video_url.decode('utf-8'),
1508                                 'uploader':     video_uploader,
1509                                 'upload_date':  u'NA',
1510                                 'title':        video_title,
1511                                 'stitle':       simple_title,
1512                                 'ext':          video_extension.decode('utf-8'),
1513                                 'format':       u'NA',
1514                                 'player_url':   None,
1515                         })
1516                 except UnavailableVideoError:
1517                         self._downloader.trouble(u'\nERROR: unable to download video')
1518
1519
1520 class YahooIE(InfoExtractor):
1521         """Information extractor for video.yahoo.com."""
1522
1523         # _VALID_URL matches all Yahoo! Video URLs
1524         # _VPAGE_URL matches only the extractable '/watch/' URLs
1525         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1526         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1527
1528         def __init__(self, downloader=None):
1529                 InfoExtractor.__init__(self, downloader)
1530
1531         @staticmethod
1532         def suitable(url):
1533                 return (re.match(YahooIE._VALID_URL, url) is not None)
1534
1535         def report_download_webpage(self, video_id):
1536                 """Report webpage download."""
1537                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1538
1539         def report_extraction(self, video_id):
1540                 """Report information extraction."""
1541                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1542
1543         def _real_initialize(self):
1544                 return
1545
1546         def _real_extract(self, url, new_video=True):
1547                 # Extract ID from URL
1548                 mobj = re.match(self._VALID_URL, url)
1549                 if mobj is None:
1550                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1551                         return
1552
1553                 # At this point we have a new video
1554                 self._downloader.increment_downloads()
1555                 video_id = mobj.group(2)
1556                 video_extension = 'flv'
1557
1558                 # Rewrite valid but non-extractable URLs as
1559                 # extractable English language /watch/ URLs
1560                 if re.match(self._VPAGE_URL, url) is None:
1561                         request = urllib2.Request(url)
1562                         try:
1563                                 webpage = urllib2.urlopen(request).read()
1564                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1565                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1566                                 return
1567
1568                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1569                         if mobj is None:
1570                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1571                                 return
1572                         yahoo_id = mobj.group(1)
1573
1574                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1575                         if mobj is None:
1576                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1577                                 return
1578                         yahoo_vid = mobj.group(1)
1579
1580                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1581                         return self._real_extract(url, new_video=False)
1582
1583                 # Retrieve video webpage to extract further information
1584                 request = urllib2.Request(url)
1585                 try:
1586                         self.report_download_webpage(video_id)
1587                         webpage = urllib2.urlopen(request).read()
1588                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1589                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1590                         return
1591
1592                 # Extract uploader and title from webpage
1593                 self.report_extraction(video_id)
1594                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1595                 if mobj is None:
1596                         self._downloader.trouble(u'ERROR: unable to extract video title')
1597                         return
1598                 video_title = mobj.group(1).decode('utf-8')
1599                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1600
1601                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1602                 if mobj is None:
1603                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1604                         return
1605                 video_uploader = mobj.group(1).decode('utf-8')
1606
1607                 # Extract video thumbnail
1608                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1609                 if mobj is None:
1610                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1611                         return
1612                 video_thumbnail = mobj.group(1).decode('utf-8')
1613
1614                 # Extract video description
1615                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1616                 if mobj is None:
1617                         self._downloader.trouble(u'ERROR: unable to extract video description')
1618                         return
1619                 video_description = mobj.group(1).decode('utf-8')
1620                 if not video_description: video_description = 'No description available.'
1621
1622                 # Extract video height and width
1623                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1624                 if mobj is None:
1625                         self._downloader.trouble(u'ERROR: unable to extract video height')
1626                         return
1627                 yv_video_height = mobj.group(1)
1628
1629                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1630                 if mobj is None:
1631                         self._downloader.trouble(u'ERROR: unable to extract video width')
1632                         return
1633                 yv_video_width = mobj.group(1)
1634
1635                 # Retrieve video playlist to extract media URL
1636                 # I'm not completely sure what all these options are, but we
1637                 # seem to need most of them, otherwise the server sends a 401.
1638                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1639                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1640                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1641                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1642                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1643                 try:
1644                         self.report_download_webpage(video_id)
1645                         webpage = urllib2.urlopen(request).read()
1646                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1647                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1648                         return
1649
1650                 # Extract media URL from playlist XML
1651                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1652                 if mobj is None:
1653                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1654                         return
1655                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1656                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1657
1658                 try:
1659                         # Process video information
1660                         self._downloader.process_info({
1661                                 'id':           video_id.decode('utf-8'),
1662                                 'url':          video_url,
1663                                 'uploader':     video_uploader,
1664                                 'upload_date':  u'NA',
1665                                 'title':        video_title,
1666                                 'stitle':       simple_title,
1667                                 'ext':          video_extension.decode('utf-8'),
1668                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1669                                 'description':  video_description,
1670                                 'thumbnail':    video_thumbnail,
1671                                 'description':  video_description,
1672                                 'player_url':   None,
1673                         })
1674                 except UnavailableVideoError:
1675                         self._downloader.trouble(u'\nERROR: unable to download video')
1676
1677
1678 class GenericIE(InfoExtractor):
1679         """Generic last-resort information extractor."""
1680
1681         def __init__(self, downloader=None):
1682                 InfoExtractor.__init__(self, downloader)
1683
1684         @staticmethod
1685         def suitable(url):
1686                 return True
1687
1688         def report_download_webpage(self, video_id):
1689                 """Report webpage download."""
1690                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1691                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1692
1693         def report_extraction(self, video_id):
1694                 """Report information extraction."""
1695                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1696
1697         def _real_initialize(self):
1698                 return
1699
1700         def _real_extract(self, url):
1701                 # At this point we have a new video
1702                 self._downloader.increment_downloads()
1703
1704                 video_id = url.split('/')[-1]
1705                 request = urllib2.Request(url)
1706                 try:
1707                         self.report_download_webpage(video_id)
1708                         webpage = urllib2.urlopen(request).read()
1709                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1710                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1711                         return
1712                 except ValueError, err:
1713                         # since this is the last-resort InfoExtractor, if
1714                         # this error is thrown, it'll be thrown here
1715                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1716                         return
1717
1718                 self.report_extraction(video_id)
1719                 # Start with something easy: JW Player in SWFObject
1720                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1721                 if mobj is None:
1722                         # Broaden the search a little bit
1723                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1724                 if mobj is None:
1725                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1726                         return
1727
1728                 # It's possible that one of the regexes
1729                 # matched, but returned an empty group:
1730                 if mobj.group(1) is None:
1731                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1732                         return
1733
1734                 video_url = urllib.unquote(mobj.group(1))
1735                 video_id  = os.path.basename(video_url)
1736
1737                 # here's a fun little line of code for you:
1738                 video_extension = os.path.splitext(video_id)[1][1:]
1739                 video_id        = os.path.splitext(video_id)[0]
1740
1741                 # it's tempting to parse this further, but you would
1742                 # have to take into account all the variations like
1743                 #   Video Title - Site Name
1744                 #   Site Name | Video Title
1745                 #   Video Title - Tagline | Site Name
1746                 # and so on and so forth; it's just not practical
1747                 mobj = re.search(r'<title>(.*)</title>', webpage)
1748                 if mobj is None:
1749                         self._downloader.trouble(u'ERROR: unable to extract title')
1750                         return
1751                 video_title = mobj.group(1).decode('utf-8')
1752                 video_title = sanitize_title(video_title)
1753                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1754
1755                 # video uploader is domain name
1756                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1757                 if mobj is None:
1758                         self._downloader.trouble(u'ERROR: unable to extract title')
1759                         return
1760                 video_uploader = mobj.group(1).decode('utf-8')
1761
1762                 try:
1763                         # Process video information
1764                         self._downloader.process_info({
1765                                 'id':           video_id.decode('utf-8'),
1766                                 'url':          video_url.decode('utf-8'),
1767                                 'uploader':     video_uploader,
1768                                 'upload_date':  u'NA',
1769                                 'title':        video_title,
1770                                 'stitle':       simple_title,
1771                                 'ext':          video_extension.decode('utf-8'),
1772                                 'format':       u'NA',
1773                                 'player_url':   None,
1774                         })
1775                 except UnavailableVideoError, err:
1776                         self._downloader.trouble(u'\nERROR: unable to download video')
1777
1778
1779 class YoutubeSearchIE(InfoExtractor):
1780         """Information Extractor for YouTube search queries."""
1781         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1782         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1783         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1784         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1785         _youtube_ie = None
1786         _max_youtube_results = 1000
1787
1788         def __init__(self, youtube_ie, downloader=None):
1789                 InfoExtractor.__init__(self, downloader)
1790                 self._youtube_ie = youtube_ie
1791
1792         @staticmethod
1793         def suitable(url):
1794                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1795
1796         def report_download_page(self, query, pagenum):
1797                 """Report attempt to download playlist page with given number."""
1798                 query = query.decode(preferredencoding())
1799                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1800
1801         def _real_initialize(self):
1802                 self._youtube_ie.initialize()
1803
1804         def _real_extract(self, query):
1805                 mobj = re.match(self._VALID_QUERY, query)
1806                 if mobj is None:
1807                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1808                         return
1809
1810                 prefix, query = query.split(':')
1811                 prefix = prefix[8:]
1812                 query  = query.encode('utf-8')
1813                 if prefix == '':
1814                         self._download_n_results(query, 1)
1815                         return
1816                 elif prefix == 'all':
1817                         self._download_n_results(query, self._max_youtube_results)
1818                         return
1819                 else:
1820                         try:
1821                                 n = long(prefix)
1822                                 if n <= 0:
1823                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1824                                         return
1825                                 elif n > self._max_youtube_results:
1826                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1827                                         n = self._max_youtube_results
1828                                 self._download_n_results(query, n)
1829                                 return
1830                         except ValueError: # parsing prefix as integer fails
1831                                 self._download_n_results(query, 1)
1832                                 return
1833
1834         def _download_n_results(self, query, n):
1835                 """Downloads a specified number of results for a query"""
1836
1837                 video_ids = []
1838                 already_seen = set()
1839                 pagenum = 1
1840
1841                 while True:
1842                         self.report_download_page(query, pagenum)
1843                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1844                         request = urllib2.Request(result_url)
1845                         try:
1846                                 page = urllib2.urlopen(request).read()
1847                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1848                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1849                                 return
1850
1851                         # Extract video identifiers
1852                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1853                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1854                                 if video_id not in already_seen:
1855                                         video_ids.append(video_id)
1856                                         already_seen.add(video_id)
1857                                         if len(video_ids) == n:
1858                                                 # Specified n videos reached
1859                                                 for id in video_ids:
1860                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1861                                                 return
1862
1863                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1864                                 for id in video_ids:
1865                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1866                                 return
1867
1868                         pagenum = pagenum + 1
1869
1870 class GoogleSearchIE(InfoExtractor):
1871         """Information Extractor for Google Video search queries."""
1872         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1873         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1874         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1875         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1876         _google_ie = None
1877         _max_google_results = 1000
1878
1879         def __init__(self, google_ie, downloader=None):
1880                 InfoExtractor.__init__(self, downloader)
1881                 self._google_ie = google_ie
1882
1883         @staticmethod
1884         def suitable(url):
1885                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1886
1887         def report_download_page(self, query, pagenum):
1888                 """Report attempt to download playlist page with given number."""
1889                 query = query.decode(preferredencoding())
1890                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1891
1892         def _real_initialize(self):
1893                 self._google_ie.initialize()
1894
1895         def _real_extract(self, query):
1896                 mobj = re.match(self._VALID_QUERY, query)
1897                 if mobj is None:
1898                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1899                         return
1900
1901                 prefix, query = query.split(':')
1902                 prefix = prefix[8:]
1903                 query  = query.encode('utf-8')
1904                 if prefix == '':
1905                         self._download_n_results(query, 1)
1906                         return
1907                 elif prefix == 'all':
1908                         self._download_n_results(query, self._max_google_results)
1909                         return
1910                 else:
1911                         try:
1912                                 n = long(prefix)
1913                                 if n <= 0:
1914                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1915                                         return
1916                                 elif n > self._max_google_results:
1917                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1918                                         n = self._max_google_results
1919                                 self._download_n_results(query, n)
1920                                 return
1921                         except ValueError: # parsing prefix as integer fails
1922                                 self._download_n_results(query, 1)
1923                                 return
1924
1925         def _download_n_results(self, query, n):
1926                 """Downloads a specified number of results for a query"""
1927
1928                 video_ids = []
1929                 already_seen = set()
1930                 pagenum = 1
1931
1932                 while True:
1933                         self.report_download_page(query, pagenum)
1934                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1935                         request = urllib2.Request(result_url)
1936                         try:
1937                                 page = urllib2.urlopen(request).read()
1938                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1939                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1940                                 return
1941
1942                         # Extract video identifiers
1943                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1944                                 video_id = mobj.group(1)
1945                                 if video_id not in already_seen:
1946                                         video_ids.append(video_id)
1947                                         already_seen.add(video_id)
1948                                         if len(video_ids) == n:
1949                                                 # Specified n videos reached
1950                                                 for id in video_ids:
1951                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1952                                                 return
1953
1954                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1955                                 for id in video_ids:
1956                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1957                                 return
1958
1959                         pagenum = pagenum + 1
1960
1961 class YahooSearchIE(InfoExtractor):
1962         """Information Extractor for Yahoo! Video search queries."""
1963         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1964         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1965         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1966         _MORE_PAGES_INDICATOR = r'\s*Next'
1967         _yahoo_ie = None
1968         _max_yahoo_results = 1000
1969
1970         def __init__(self, yahoo_ie, downloader=None):
1971                 InfoExtractor.__init__(self, downloader)
1972                 self._yahoo_ie = yahoo_ie
1973
1974         @staticmethod
1975         def suitable(url):
1976                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1977
1978         def report_download_page(self, query, pagenum):
1979                 """Report attempt to download playlist page with given number."""
1980                 query = query.decode(preferredencoding())
1981                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1982
1983         def _real_initialize(self):
1984                 self._yahoo_ie.initialize()
1985
1986         def _real_extract(self, query):
1987                 mobj = re.match(self._VALID_QUERY, query)
1988                 if mobj is None:
1989                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1990                         return
1991
1992                 prefix, query = query.split(':')
1993                 prefix = prefix[8:]
1994                 query  = query.encode('utf-8')
1995                 if prefix == '':
1996                         self._download_n_results(query, 1)
1997                         return
1998                 elif prefix == 'all':
1999                         self._download_n_results(query, self._max_yahoo_results)
2000                         return
2001                 else:
2002                         try:
2003                                 n = long(prefix)
2004                                 if n <= 0:
2005                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2006                                         return
2007                                 elif n > self._max_yahoo_results:
2008                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2009                                         n = self._max_yahoo_results
2010                                 self._download_n_results(query, n)
2011                                 return
2012                         except ValueError: # parsing prefix as integer fails
2013                                 self._download_n_results(query, 1)
2014                                 return
2015
2016         def _download_n_results(self, query, n):
2017                 """Downloads a specified number of results for a query"""
2018
2019                 video_ids = []
2020                 already_seen = set()
2021                 pagenum = 1
2022
2023                 while True:
2024                         self.report_download_page(query, pagenum)
2025                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2026                         request = urllib2.Request(result_url)
2027                         try:
2028                                 page = urllib2.urlopen(request).read()
2029                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2030                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2031                                 return
2032
2033                         # Extract video identifiers
2034                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2035                                 video_id = mobj.group(1)
2036                                 if video_id not in already_seen:
2037                                         video_ids.append(video_id)
2038                                         already_seen.add(video_id)
2039                                         if len(video_ids) == n:
2040                                                 # Specified n videos reached
2041                                                 for id in video_ids:
2042                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2043                                                 return
2044
2045                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2046                                 for id in video_ids:
2047                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2048                                 return
2049
2050                         pagenum = pagenum + 1
2051
2052 class YoutubePlaylistIE(InfoExtractor):
2053         """Information Extractor for YouTube playlists."""
2054
2055         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
2056         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2057         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2058         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2059         _youtube_ie = None
2060
2061         def __init__(self, youtube_ie, downloader=None):
2062                 InfoExtractor.__init__(self, downloader)
2063                 self._youtube_ie = youtube_ie
2064
2065         @staticmethod
2066         def suitable(url):
2067                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2068
2069         def report_download_page(self, playlist_id, pagenum):
2070                 """Report attempt to download playlist page with given number."""
2071                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2072
2073         def _real_initialize(self):
2074                 self._youtube_ie.initialize()
2075
2076         def _real_extract(self, url):
2077                 # Extract playlist id
2078                 mobj = re.match(self._VALID_URL, url)
2079                 if mobj is None:
2080                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2081                         return
2082
2083                 # Download playlist pages
2084                 playlist_id = mobj.group(1)
2085                 video_ids = []
2086                 pagenum = 1
2087
2088                 while True:
2089                         self.report_download_page(playlist_id, pagenum)
2090                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2091                         try:
2092                                 page = urllib2.urlopen(request).read()
2093                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2094                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2095                                 return
2096
2097                         # Extract video identifiers
2098                         ids_in_page = []
2099                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2100                                 if mobj.group(1) not in ids_in_page:
2101                                         ids_in_page.append(mobj.group(1))
2102                         video_ids.extend(ids_in_page)
2103
2104                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2105                                 break
2106                         pagenum = pagenum + 1
2107
2108                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2109                 playlistend = self._downloader.params.get('playlistend', -1)
2110                 video_ids = video_ids[playliststart:playlistend]
2111
2112                 for id in video_ids:
2113                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2114                 return
2115
2116 class YoutubeUserIE(InfoExtractor):
2117         """Information Extractor for YouTube users."""
2118
2119         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2120         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2121         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2122         _youtube_ie = None
2123
2124         def __init__(self, youtube_ie, downloader=None):
2125                 InfoExtractor.__init__(self, downloader)
2126                 self._youtube_ie = youtube_ie
2127
2128         @staticmethod
2129         def suitable(url):
2130                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2131
2132         def report_download_page(self, username):
2133                 """Report attempt to download user page."""
2134                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2135
2136         def _real_initialize(self):
2137                 self._youtube_ie.initialize()
2138
2139         def _real_extract(self, url):
2140                 # Extract username
2141                 mobj = re.match(self._VALID_URL, url)
2142                 if mobj is None:
2143                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2144                         return
2145
2146                 # Download user page
2147                 username = mobj.group(1)
2148                 video_ids = []
2149                 pagenum = 1
2150
2151                 self.report_download_page(username)
2152                 request = urllib2.Request(self._TEMPLATE_URL % (username))
2153                 try:
2154                         page = urllib2.urlopen(request).read()
2155                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2156                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2157                         return
2158
2159                 # Extract video identifiers
2160                 ids_in_page = []
2161
2162                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2163                         if mobj.group(1) not in ids_in_page:
2164                                 ids_in_page.append(mobj.group(1))
2165                 video_ids.extend(ids_in_page)
2166
2167                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2168                 playlistend = self._downloader.params.get('playlistend', -1)
2169                 video_ids = video_ids[playliststart:playlistend]
2170
2171                 for id in video_ids:
2172                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2173                 return
2174
2175 class DepositFilesIE(InfoExtractor):
2176         """Information extractor for depositfiles.com"""
2177
2178         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2179
2180         def __init__(self, downloader=None):
2181                 InfoExtractor.__init__(self, downloader)
2182
2183         @staticmethod
2184         def suitable(url):
2185                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2186
2187         def report_download_webpage(self, file_id):
2188                 """Report webpage download."""
2189                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2190
2191         def report_extraction(self, file_id):
2192                 """Report information extraction."""
2193                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2194
2195         def _real_initialize(self):
2196                 return
2197
2198         def _real_extract(self, url):
2199                 # At this point we have a new file
2200                 self._downloader.increment_downloads()
2201
2202                 file_id = url.split('/')[-1]
2203                 # Rebuild url in english locale
2204                 url = 'http://depositfiles.com/en/files/' + file_id
2205
2206                 # Retrieve file webpage with 'Free download' button pressed
2207                 free_download_indication = { 'gateway_result' : '1' }
2208                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2209                 try:
2210                         self.report_download_webpage(file_id)
2211                         webpage = urllib2.urlopen(request).read()
2212                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2213                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2214                         return
2215
2216                 # Search for the real file URL
2217                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2218                 if (mobj is None) or (mobj.group(1) is None):
2219                         # Try to figure out reason of the error.
2220                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2221                         if (mobj is not None) and (mobj.group(1) is not None):
2222                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2223                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2224                         else:
2225                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2226                         return
2227
2228                 file_url = mobj.group(1)
2229                 file_extension = os.path.splitext(file_url)[1][1:]
2230
2231                 # Search for file title
2232                 mobj = re.search(r'<b title="(.*?)">', webpage)
2233                 if mobj is None:
2234                         self._downloader.trouble(u'ERROR: unable to extract title')
2235                         return
2236                 file_title = mobj.group(1).decode('utf-8')
2237
2238                 try:
2239                         # Process file information
2240                         self._downloader.process_info({
2241                                 'id':           file_id.decode('utf-8'),
2242                                 'url':          file_url.decode('utf-8'),
2243                                 'uploader':     u'NA',
2244                                 'upload_date':  u'NA',
2245                                 'title':        file_title,
2246                                 'stitle':       file_title,
2247                                 'ext':          file_extension.decode('utf-8'),
2248                                 'format':       u'NA',
2249                                 'player_url':   None,
2250                         })
2251                 except UnavailableVideoError, err:
2252                         self._downloader.trouble(u'ERROR: unable to download file')
2253
2254 class PostProcessor(object):
2255         """Post Processor class.
2256
2257         PostProcessor objects can be added to downloaders with their
2258         add_post_processor() method. When the downloader has finished a
2259         successful download, it will take its internal chain of PostProcessors
2260         and start calling the run() method on each one of them, first with
2261         an initial argument and then with the returned value of the previous
2262         PostProcessor.
2263
2264         The chain will be stopped if one of them ever returns None or the end
2265         of the chain is reached.
2266
2267         PostProcessor objects follow a "mutual registration" process similar
2268         to InfoExtractor objects.
2269         """
2270
2271         _downloader = None
2272
2273         def __init__(self, downloader=None):
2274                 self._downloader = downloader
2275
2276         def set_downloader(self, downloader):
2277                 """Sets the downloader for this PP."""
2278                 self._downloader = downloader
2279
2280         def run(self, information):
2281                 """Run the PostProcessor.
2282
2283                 The "information" argument is a dictionary like the ones
2284                 composed by InfoExtractors. The only difference is that this
2285                 one has an extra field called "filepath" that points to the
2286                 downloaded file.
2287
2288                 When this method returns None, the postprocessing chain is
2289                 stopped. However, this method may return an information
2290                 dictionary that will be passed to the next postprocessing
2291                 object in the chain. It can be the one it received after
2292                 changing some fields.
2293
2294                 In addition, this method may raise a PostProcessingError
2295                 exception that will be taken into account by the downloader
2296                 it was called from.
2297                 """
2298                 return information # by default, do nothing
2299
2300 ### MAIN PROGRAM ###
2301 if __name__ == '__main__':
2302         try:
2303                 # Modules needed only when running the main program
2304                 import getpass
2305                 import optparse
2306
2307                 # Function to update the program file with the latest version from the repository.
2308                 def update_self(downloader, filename):
2309                         # Note: downloader only used for options
2310                         if not os.access(filename, os.W_OK):
2311                                 sys.exit('ERROR: no write permissions on %s' % filename)
2312
2313                         downloader.to_screen('Updating to latest stable version...')
2314                         try:
2315                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2316                                 latest_version = urllib.urlopen(latest_url).read().strip()
2317                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2318                                 newcontent = urllib.urlopen(prog_url).read()
2319                         except (IOError, OSError), err:
2320                                 sys.exit('ERROR: unable to download latest version')
2321                         try:
2322                                 stream = open(filename, 'w')
2323                                 stream.write(newcontent)
2324                                 stream.close()
2325                         except (IOError, OSError), err:
2326                                 sys.exit('ERROR: unable to overwrite current version')
2327                         downloader.to_screen('Updated to version %s' % latest_version)
2328
2329                 # Parse command line
2330                 parser = optparse.OptionParser(
2331                         usage='Usage: %prog [options] url...',
2332                         version='2010.12.09',
2333                         conflict_handler='resolve',
2334                 )
2335
2336                 parser.add_option('-h', '--help',
2337                                 action='help', help='print this help text and exit')
2338                 parser.add_option('-v', '--version',
2339                                 action='version', help='print program version and exit')
2340                 parser.add_option('-U', '--update',
2341                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2342                 parser.add_option('-i', '--ignore-errors',
2343                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2344                 parser.add_option('-r', '--rate-limit',
2345                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2346                 parser.add_option('-R', '--retries',
2347                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2348                 parser.add_option('--playlist-start',
2349                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2350                 parser.add_option('--playlist-end',
2351                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2352                 parser.add_option('--dump-user-agent',
2353                                 action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
2354
2355                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2356                 authentication.add_option('-u', '--username',
2357                                 dest='username', metavar='USERNAME', help='account username')
2358                 authentication.add_option('-p', '--password',
2359                                 dest='password', metavar='PASSWORD', help='account password')
2360                 authentication.add_option('-n', '--netrc',
2361                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2362                 parser.add_option_group(authentication)
2363
2364                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2365                 video_format.add_option('-f', '--format',
2366                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2367                 video_format.add_option('--all-formats',
2368                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2369                 video_format.add_option('--max-quality',
2370                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2371                 parser.add_option_group(video_format)
2372
2373                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2374                 verbosity.add_option('-q', '--quiet',
2375                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2376                 verbosity.add_option('-s', '--simulate',
2377                                 action='store_true', dest='simulate', help='do not download video', default=False)
2378                 verbosity.add_option('-g', '--get-url',
2379                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2380                 verbosity.add_option('-e', '--get-title',
2381                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2382                 verbosity.add_option('--get-thumbnail',
2383                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2384                 verbosity.add_option('--get-description',
2385                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2386                 verbosity.add_option('--no-progress',
2387                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2388                 verbosity.add_option('--console-title',
2389                                 action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
2390                 parser.add_option_group(verbosity)
2391
2392                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2393                 filesystem.add_option('-t', '--title',
2394                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2395                 filesystem.add_option('-l', '--literal',
2396                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2397                 filesystem.add_option('-A', '--auto-number',
2398                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2399                 filesystem.add_option('-o', '--output',
2400                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2401                 filesystem.add_option('-a', '--batch-file',
2402                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2403                 filesystem.add_option('-w', '--no-overwrites',
2404                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2405                 filesystem.add_option('-c', '--continue',
2406                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2407                 filesystem.add_option('--cookies',
2408                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2409                 filesystem.add_option('--no-part',
2410                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
2411                 parser.add_option_group(filesystem)
2412
2413                 (opts, args) = parser.parse_args()
2414
2415                 # Open appropriate CookieJar
2416                 if opts.cookiefile is None:
2417                         jar = cookielib.CookieJar()
2418                 else:
2419                         try:
2420                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2421                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2422                                         jar.load()
2423                         except (IOError, OSError), err:
2424                                 sys.exit(u'ERROR: unable to open cookie file')
2425
2426                 # Dump user agent
2427                 if opts.dump_user_agent:
2428                         print std_headers['User-Agent']
2429                         sys.exit(0)
2430
2431                 # General configuration
2432                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2433                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2434                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2435
2436                 # Batch file verification
2437                 batchurls = []
2438                 if opts.batchfile is not None:
2439                         try:
2440                                 if opts.batchfile == '-':
2441                                         batchfd = sys.stdin
2442                                 else:
2443                                         batchfd = open(opts.batchfile, 'r')
2444                                 batchurls = batchfd.readlines()
2445                                 batchurls = [x.strip() for x in batchurls]
2446                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2447                         except IOError:
2448                                 sys.exit(u'ERROR: batch file could not be read')
2449                 all_urls = batchurls + args
2450
2451                 # Conflicting, missing and erroneous options
2452                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2453                         parser.error(u'using .netrc conflicts with giving username/password')
2454                 if opts.password is not None and opts.username is None:
2455                         parser.error(u'account username missing')
2456                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2457                         parser.error(u'using output template conflicts with using title, literal title or auto number')
2458                 if opts.usetitle and opts.useliteral:
2459                         parser.error(u'using title conflicts with using literal title')
2460                 if opts.username is not None and opts.password is None:
2461                         opts.password = getpass.getpass(u'Type account password and press return:')
2462                 if opts.ratelimit is not None:
2463                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2464                         if numeric_limit is None:
2465                                 parser.error(u'invalid rate limit specified')
2466                         opts.ratelimit = numeric_limit
2467                 if opts.retries is not None:
2468                         try:
2469                                 opts.retries = long(opts.retries)
2470                         except (TypeError, ValueError), err:
2471                                 parser.error(u'invalid retry count specified')
2472                 try:
2473                         opts.playliststart = long(opts.playliststart)
2474                         if opts.playliststart <= 0:
2475                                 raise ValueError
2476                 except (TypeError, ValueError), err:
2477                         parser.error(u'invalid playlist start number specified')
2478                 try:
2479                         opts.playlistend = long(opts.playlistend)
2480                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2481                                 raise ValueError
2482                 except (TypeError, ValueError), err:
2483                         parser.error(u'invalid playlist end number specified')
2484
2485                 # Information extractors
2486                 youtube_ie = YoutubeIE()
2487                 metacafe_ie = MetacafeIE(youtube_ie)
2488                 dailymotion_ie = DailymotionIE()
2489                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2490                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2491                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2492                 google_ie = GoogleIE()
2493                 google_search_ie = GoogleSearchIE(google_ie)
2494                 photobucket_ie = PhotobucketIE()
2495                 yahoo_ie = YahooIE()
2496                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2497                 deposit_files_ie = DepositFilesIE()
2498                 generic_ie = GenericIE()
2499
2500                 # File downloader
2501                 fd = FileDownloader({
2502                         'usenetrc': opts.usenetrc,
2503                         'username': opts.username,
2504                         'password': opts.password,
2505                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2506                         'forceurl': opts.geturl,
2507                         'forcetitle': opts.gettitle,
2508                         'forcethumbnail': opts.getthumbnail,
2509                         'forcedescription': opts.getdescription,
2510                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2511                         'format': opts.format,
2512                         'format_limit': opts.format_limit,
2513                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2514                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2515                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2516                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2517                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2518                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2519                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2520                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2521                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2522                                 or u'%(id)s.%(ext)s'),
2523                         'ignoreerrors': opts.ignoreerrors,
2524                         'ratelimit': opts.ratelimit,
2525                         'nooverwrites': opts.nooverwrites,
2526                         'retries': opts.retries,
2527                         'continuedl': opts.continue_dl,
2528                         'noprogress': opts.noprogress,
2529                         'playliststart': opts.playliststart,
2530                         'playlistend': opts.playlistend,
2531                         'logtostderr': opts.outtmpl == '-',
2532                         'consoletitle': opts.consoletitle,
2533                         'nopart': opts.nopart,
2534                         })
2535                 fd.add_info_extractor(youtube_search_ie)
2536                 fd.add_info_extractor(youtube_pl_ie)
2537                 fd.add_info_extractor(youtube_user_ie)
2538                 fd.add_info_extractor(metacafe_ie)
2539                 fd.add_info_extractor(dailymotion_ie)
2540                 fd.add_info_extractor(youtube_ie)
2541                 fd.add_info_extractor(google_ie)
2542                 fd.add_info_extractor(google_search_ie)
2543                 fd.add_info_extractor(photobucket_ie)
2544                 fd.add_info_extractor(yahoo_ie)
2545                 fd.add_info_extractor(yahoo_search_ie)
2546                 fd.add_info_extractor(deposit_files_ie)
2547
2548                 # This must come last since it's the
2549                 # fallback if none of the others work
2550                 fd.add_info_extractor(generic_ie)
2551
2552                 # Update version
2553                 if opts.update_self:
2554                         update_self(fd, sys.argv[0])
2555
2556                 # Maybe do nothing
2557                 if len(all_urls) < 1:
2558                         if not opts.update_self:
2559                                 parser.error(u'you must provide at least one URL')
2560                         else:
2561                                 sys.exit()
2562                 retcode = fd.download(all_urls)
2563
2564                 # Dump cookie jar if requested
2565                 if opts.cookiefile is not None:
2566                         try:
2567                                 jar.save()
2568                         except (IOError, OSError), err:
2569                                 sys.exit(u'ERROR: unable to save cookie jar')
2570
2571                 sys.exit(retcode)
2572
2573         except DownloadError:
2574                 sys.exit(1)
2575         except SameFileError:
2576                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2577         except KeyboardInterrupt:
2578                 sys.exit(u'\nERROR: Interrupted by user')