2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
30 # parse_qs was moved from the cgi module to the urlparse module recently.
32 from urlparse import parse_qs
34 from cgi import parse_qs
37 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
38 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
39 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
40 'Accept-Encoding': 'gzip, deflate',
41 'Accept-Language': 'en-us,en;q=0.5',
44 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # Inner generator probes the locale once; only its first yielded
    # value is consumed below.
    def yield_preferredencoding():
        # NOTE(review): the probe/fallback body (encoding test, 'UTF-8'
        # fallback, the yield loop) is not visible in this chunk.
        pref = locale.getpreferredencoding()
    # .next() is the Python 2 iterator protocol: pull the first (and only
    # meaningful) value out of the generator.
    return yield_preferredencoding().next()
62 def htmlentity_transform(matchobj):
63 """Transforms an HTML entity to a Unicode character.
65 This function receives a match object and is intended to be used with
66 the re.sub() function.
68 entity = matchobj.group(1)
70 # Known non-numeric HTML entity
71 if entity in htmlentitydefs.name2codepoint:
72 return unichr(htmlentitydefs.name2codepoint[entity])
75 mobj = re.match(ur'(?u)#(x?\d+)', entity)
77 numstr = mobj.group(1)
78 if numstr.startswith(u'x'):
80 numstr = u'0%s' % numstr
83 return unichr(long(numstr, base))
85 # Unknown entity in name, return its literal representation
86 return (u'&%s;' % entity)
88 def sanitize_title(utitle):
89 """Sanitizes a video title so it could be used as part of a filename."""
90 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
91 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
        # On Windows, stdout must be switched to binary mode or the
        # downloaded data would be corrupted by newline translation.
        # NOTE(review): this branch sits under an elided
        # "if filename == u'-':" stdout special-case.
        if sys.platform == 'win32':
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    Thrown by FileDownloader objects that are not configured to continue
    on errors; instances carry the appropriate error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that several of the
    requested downloads would end up in the very same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this to indicate that the
    postprocessing task failed.
    """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that the site does not
    offer for that particular video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when the file they downloaded is
    smaller than the size the server announced first, which indicates the
    connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Record the observed and the announced sizes (both byte counts)."""
        self.downloaded, self.expected = downloaded, expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """
        # NOTE(review): the two returns below belong to a deflate(data)
        # helper whose def line is not visible in this chunk.
        # Raw deflate stream (no zlib header): negative wbits skips the
        # header check.
        return zlib.decompress(data, -zlib.MAX_WBITS)
        # Fallback for zlib-wrapped deflate data.
        return zlib.decompress(data)

    def http_request(self, req):
        # Apply every standard header to the outgoing request.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        # Internal opt-out: strip the marker header (and Accept-encoding)
        # before the request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

    def http_response(self, req, resp):
        # gzip: wrap the raw body in a GzipFile and rebuild the response
        # object so callers transparently read decompressed data.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate: decompress via the helper above, same rebuild dance.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    """

    # Class-level placeholders; real per-instance values are set in
    # __init__ below.
    _download_retcode = None
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._download_retcode = 0
        self._num_downloads = 0
        # Messages go to stderr when 'logtostderr' is set, stdout otherwise
        # (bool indexes the two-element list).
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Build the list of progressively longer directory prefixes.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):

    def format_bytes(bytes):
        # Render a byte count with a one-letter binary-magnitude suffix,
        # e.g. '1.50M'.
        if type(bytes) is str:
        # Magnitude index: 0 = bytes, 1 = KiB, 2 = MiB, ...
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        # Fixed-width (6 chars) percentage string for the progress bar.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        # Estimate remaining time from the average rate so far.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        # Average download speed formatted as '<size>/s', fixed width.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        # Adapt the read block size towards ~1 block/second, clamped to
        # [previous/2, previous*2] and an absolute 4 MB ceiling.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        number = float(matchobj.group(1))
        # Empty suffix maps to index 0 ('b') = multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE keeps a reference back to us.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            # skip_eol selects u'' instead of the newline terminator so
            # progress lines can be overwritten with '\r'.
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            # Swallow console-encoding failures only when the caller asked
            # for it; otherwise re-raise.
            if not ignore_encoding_errors:

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm escape sequence: OSC 0 ; <title> BEL
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # "Fixed" means it contains no %(field)s substitutions, so every
        # download would land in the same file.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # ignoreerrors: remember the failure for the final exit code.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough that the average rate drops back to
            # the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # No .part file for stdout, when disabled, or when the target
        # exists but is not a regular file (e.g. a device).
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
        return filename + u'.part'

    def undo_temp_name(self, filename):
        """Strip the .part suffix produced by temp_name(), if present."""
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]

    def try_rename(self, old_filename, new_filename):
        """Rename the .part file to its final name, reporting failures."""
        if old_filename == new_filename:
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
        # '\r' + skip_eol rewrites the same console line on each update.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a filename-free message when the name cannot be
            # encoded for the console.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings go to real stdout regardless of logtostderr.
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

            # Expand the output template with the video info plus two
            # synthetic fields.
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')

        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')

            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))

            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            # Local filesystem errors mean the video itself could not be
            # saved in this format.
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template would force every URL into the same file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it

                # Suitable InfoExtractor had been found; go to next URL

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # Each post processor receives the info dict augmented with the
        # final file path.
        info['filepath'] = filename

    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to rtmpdump."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); '-k 1' is only added after a code-1 exit.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
            # Success path: report final size and promote the .part file.
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)

    def _do_download(self, filename, url, player_url):
        """Download url to filename over HTTP, resuming and retrying."""
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request is kept Range-free for the 416 recovery path below.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
                data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            self.try_rename(tmpfilename, filename)
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
            # 5xx: count the attempt and retry.
            self.report_retry(count, retries)

            self.trouble(u'ERROR: giving up after %s retries' % retries)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Content-length covers only the remaining range; add what we
            # already have on disk.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
            data_block = data.read(block_size)
            if len(data_block) == 0:
            byte_counter += len(data_block)

            # Open file just in time
                (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                # sanitize_open may have tweaked the name; recompute the
                # final filename from it.
                filename = self.undo_temp_name(tmpfilename)
                self.report_destination(filename)
            except (OSError, IOError), err:
                self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
            block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

        # NOTE(review): the docstring below belongs to the suitable(url)
        # static method, whose def line is not visible in this chunk.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 captures the scheme/host/path prefix (optional); group 2 is
    # the video id; the conditional (?(1).+)? requires trailing context
    # only when the prefix matched.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    # Maps format code -> filename extension ('flv' is the fallback).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever

        # NOTE(review): this return belongs to the suitable(url) static
        # method, whose def line is not visible in this chunk.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are a warning, not a fatal error.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (forces English pages so regexes below match).
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # The login form re-appearing in the response means the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age
        age_form = {
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JSON-style backslash escapes in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one returns a
        # token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                # parse_qs maps every key to a *list* of values.
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse every run of non-allowed characters
        # to a single underscore.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, normalized to YYYYMMDD when one of the known
        # formats parses.
        mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description (only fetched when it will actually be printed)
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1)

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'fmt_url_map' in video_info:
            # fmt_url_map is 'code|url,code|url,...'
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            # RTMP downloads have no format code.
            video_url_list = [(None, video_info['conn'][0])]

            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn paramters

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': player_url,
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor workflow: confirm Metacafe's family-filter age gate once in
# _real_initialize, then per URL either hand YouTube-hosted videos off to a
# YoutubeIE instance or scrape the watch page for media URL, title, uploader.
1086 class MetacafeIE(InfoExtractor):
1087 """Information Extractor for metacafe.com."""
# Group 1 of _VALID_URL is the video id, group 2 the URL's title slug.
1089 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1090 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1091 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# youtube_ie: a YoutubeIE instance used to delegate "yt-<id>" videos that
# Metacafe merely wraps around YouTube.
1094 def __init__(self, youtube_ie, downloader=None):
1095 InfoExtractor.__init__(self, downloader)
1096 self._youtube_ie = youtube_ie
1100 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1102 def report_disclaimer(self):
1103 """Report disclaimer retrieval."""
1104 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1106 def report_age_confirmation(self):
1107 """Report attempt to confirm age."""
1108 self._downloader.to_screen(u'[metacafe] Confirming age')
1110 def report_download_webpage(self, video_id):
1111 """Report webpage download."""
1112 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1114 def report_extraction(self, video_id):
1115 """Report information extraction."""
1116 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the disclaimer page, then POSTs the family-filter form so later
# requests are allowed past the age gate. Both responses are read but the
# bodies themselves are not inspected.
1118 def _real_initialize(self):
1119 # Retrieve disclaimer
1120 request = urllib2.Request(self._DISCLAIMER)
1122 self.report_disclaimer()
1123 disclaimer = urllib2.urlopen(request).read()
1124 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1125 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1131 'submit': "Continue - I'm over 18",
1133 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1135 self.report_age_confirmation()
1136 disclaimer = urllib2.urlopen(request).read()
1137 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1138 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1141 def _real_extract(self, url):
1142 # Extract id and simplified title from URL
1143 mobj = re.match(self._VALID_URL, url)
1145 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1148 video_id = mobj.group(1)
1150 # Check if video comes from YouTube
1151 mobj2 = re.match(r'^yt-(.*)$', video_id)
1152 if mobj2 is not None:
# Delegate "yt-<id>" wrappers entirely to the YouTube extractor.
1153 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1156 # At this point we have a new video
1157 self._downloader.increment_downloads()
1159 simple_title = mobj.group(2).decode('utf-8')
1161 # Retrieve video webpage to extract further information
1162 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1164 self.report_download_webpage(video_id)
1165 webpage = urllib2.urlopen(request).read()
1166 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1167 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1170 # Extract URL, uploader and title from webpage
1171 self.report_extraction(video_id)
# Preferred path: the page exposes the media URL directly as "&mediaURL=...".
1172 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1173 if mobj is not None:
1174 mediaURL = urllib.unquote(mobj.group(1))
# Extension is taken from the URL's last three characters (e.g. "flv").
1175 video_extension = mediaURL[-3:]
1177 # Extract gdaKey if available
1178 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1180 video_url = mediaURL
1182 gdaKey = mobj.group(1)
# Some videos require the gdaKey token appended as the __gda__ parameter.
1183 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars value as a query string and pull the
# media URL and key out of its "mediaData" JSON-like payload.
1185 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1187 self._downloader.trouble(u'ERROR: unable to extract media URL')
1189 vardict = parse_qs(mobj.group(1))
1190 if 'mediaData' not in vardict:
1191 self._downloader.trouble(u'ERROR: unable to extract media URL')
1193 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1195 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Undo JSON escaping of forward slashes before using the URL.
1197 mediaURL = mobj.group(1).replace('\\/', '/')
1198 video_extension = mediaURL[-3:]
1199 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1201 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1203 self._downloader.trouble(u'ERROR: unable to extract title')
1205 video_title = mobj.group(1).decode('utf-8')
1206 video_title = sanitize_title(video_title)
1208 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1210 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1212 video_uploader = mobj.group(1)
1215 # Process video information
1216 self._downloader.process_info({
1217 'id': video_id.decode('utf-8'),
1218 'url': video_url.decode('utf-8'),
1219 'uploader': video_uploader.decode('utf-8'),
1220 'upload_date': u'NA',
1221 'title': video_title,
1222 'stitle': simple_title,
1223 'ext': video_extension.decode('utf-8'),
1227 except UnavailableVideoError:
1228 self._downloader.trouble(u'\nERROR: unable to download video')
# Scrapes a Dailymotion video page: media URL comes from the Flash player's
# addVariable("video", ...) call, title from the <title> tag, uploader from
# the owner <div>. Downloads are always treated as FLV.
1231 class DailymotionIE(InfoExtractor):
1232 """Information Extractor for Dailymotion"""
# Group 1 of _VALID_URL is the video id, group 2 the URL's title slug.
1234 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1236 def __init__(self, downloader=None):
1237 InfoExtractor.__init__(self, downloader)
1241 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1243 def report_download_webpage(self, video_id):
1244 """Report webpage download."""
1245 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1247 def report_extraction(self, video_id):
1248 """Report information extraction."""
1249 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No initialization needed for Dailymotion.
1251 def _real_initialize(self):
1254 def _real_extract(self, url):
1255 # Extract id and simplified title from URL
1256 mobj = re.match(self._VALID_URL, url)
1258 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1261 # At this point we have a new video
1262 self._downloader.increment_downloads()
1263 video_id = mobj.group(1)
1265 simple_title = mobj.group(2).decode('utf-8')
# Extension is hard-coded; the Flash player serves FLV.
1266 video_extension = 'flv'
1268 # Retrieve video webpage to extract further information
1269 request = urllib2.Request(url)
1271 self.report_download_webpage(video_id)
1272 webpage = urllib2.urlopen(request).read()
1273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1274 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1277 # Extract URL, uploader and title from webpage
1278 self.report_extraction(video_id)
# The player is configured via JS: addVariable("video", "<escaped URL>").
1279 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1281 self._downloader.trouble(u'ERROR: unable to extract media URL')
1283 mediaURL = urllib.unquote(mobj.group(1))
1285 # if needed add http://www.dailymotion.com/ if relative URL
1287 video_url = mediaURL
1289 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
# Title is the <title> text with the leading "Dailymotion - " prefix stripped.
1290 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1292 self._downloader.trouble(u'ERROR: unable to extract title')
1294 video_title = mobj.group(1).decode('utf-8')
1295 video_title = sanitize_title(video_title)
1297 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1299 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1301 video_uploader = mobj.group(1)
1304 # Process video information
1305 self._downloader.process_info({
1306 'id': video_id.decode('utf-8'),
1307 'url': video_url.decode('utf-8'),
1308 'uploader': video_uploader.decode('utf-8'),
1309 'upload_date': u'NA',
1310 'title': video_title,
1311 'stitle': simple_title,
1312 'ext': video_extension.decode('utf-8'),
1316 except UnavailableVideoError:
1317 self._downloader.trouble(u'\nERROR: unable to download video')
# Scrapes a Google Video page. Tries the MP4 "download_url" first; when that
# is absent falls back to the FLV player URL embedded as escaped JavaScript
# ("videoUrl\x3d..."). Thumbnail extraction requires a second request and is
# only performed when the 'forcethumbnail' option is set.
1319 class GoogleIE(InfoExtractor):
1320 """Information extractor for video.google.com."""
# Group 1 of _VALID_URL is the docid video identifier.
1322 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1324 def __init__(self, downloader=None):
1325 InfoExtractor.__init__(self, downloader)
1329 return (re.match(GoogleIE._VALID_URL, url) is not None)
1331 def report_download_webpage(self, video_id):
1332 """Report webpage download."""
1333 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1335 def report_extraction(self, video_id):
1336 """Report information extraction."""
1337 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No initialization needed for Google Video.
1339 def _real_initialize(self):
1342 def _real_extract(self, url):
1343 # Extract id from URL
1344 mobj = re.match(self._VALID_URL, url)
1346 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1349 # At this point we have a new video
1350 self._downloader.increment_downloads()
1351 video_id = mobj.group(1)
# Optimistic default; downgraded to 'flv' below if no MP4 download_url exists.
1353 video_extension = 'mp4'
1355 # Retrieve video webpage to extract further information
1356 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1358 self.report_download_webpage(video_id)
1359 webpage = urllib2.urlopen(request).read()
1360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1361 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1364 # Extract URL, uploader, and title from webpage
1365 self.report_extraction(video_id)
# Preferred: the page advertises a direct MP4 link as download_url:'...'.
1366 mobj = re.search(r"download_url:'([^']+)'", webpage)
# Fallback: FLV player URL, JS-escaped ("\x3d" for '=', "\x26" for '&').
1368 video_extension = 'flv'
1369 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1371 self._downloader.trouble(u'ERROR: unable to extract media URL')
1373 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JavaScript hex escapes left in the unquoted URL.
1374 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1375 mediaURL = mediaURL.replace('\\x26', '\x26')
1377 video_url = mediaURL
1379 mobj = re.search(r'<title>(.*)</title>', webpage)
1381 self._downloader.trouble(u'ERROR: unable to extract title')
1383 video_title = mobj.group(1).decode('utf-8')
1384 video_title = sanitize_title(video_title)
# Collapse characters outside [A-Za-z0-9] into underscores for filenames.
1385 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1387 # Extract video description
1388 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1390 self._downloader.trouble(u'ERROR: unable to extract video description')
1392 video_description = mobj.group(1).decode('utf-8')
1393 if not video_description:
1394 video_description = 'No description available.'
1396 # Extract video thumbnail
# Thumbnail lives on the search-results page, so it costs an extra request;
# only fetched when the user explicitly asked for thumbnails.
1397 if self._downloader.params.get('forcethumbnail', False):
1398 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1400 webpage = urllib2.urlopen(request).read()
1401 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1402 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1404 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1406 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1408 video_thumbnail = mobj.group(1)
1409 else: # we need something to pass to process_info
1410 video_thumbnail = ''
1414 # Process video information
1415 self._downloader.process_info({
1416 'id': video_id.decode('utf-8'),
1417 'url': video_url.decode('utf-8'),
1419 'upload_date': u'NA',
1420 'title': video_title,
1421 'stitle': simple_title,
1422 'ext': video_extension.decode('utf-8'),
1426 except UnavailableVideoError:
1427 self._downloader.trouble(u'\nERROR: unable to download video')
# Scrapes a Photobucket FLV page: media URL comes from the ?file= parameter
# of the <link rel="video_src"> tag; title and uploader are both parsed out
# of the "<title> ... video by ... - Photobucket" pattern.
1430 class PhotobucketIE(InfoExtractor):
1431 """Information extractor for photobucket.com."""
# Group 1 of _VALID_URL is the ?current=<name>.flv file name used as the id.
1433 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1435 def __init__(self, downloader=None):
1436 InfoExtractor.__init__(self, downloader)
1440 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1442 def report_download_webpage(self, video_id):
1443 """Report webpage download."""
1444 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1446 def report_extraction(self, video_id):
1447 """Report information extraction."""
1448 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No initialization needed for Photobucket.
1450 def _real_initialize(self):
1453 def _real_extract(self, url):
1454 # Extract id from URL
1455 mobj = re.match(self._VALID_URL, url)
1457 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1460 # At this point we have a new video
1461 self._downloader.increment_downloads()
1462 video_id = mobj.group(1)
# _VALID_URL only matches *.flv resources, so the extension is fixed.
1464 video_extension = 'flv'
1466 # Retrieve video webpage to extract further information
1467 request = urllib2.Request(url)
1469 self.report_download_webpage(video_id)
1470 webpage = urllib2.urlopen(request).read()
1471 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1472 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1475 # Extract URL, uploader, and title from webpage
1476 self.report_extraction(video_id)
1477 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1479 self._downloader.trouble(u'ERROR: unable to extract media URL')
1481 mediaURL = urllib.unquote(mobj.group(1))
1483 video_url = mediaURL
# One regex yields both title (group 1) and uploader (group 2).
1485 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1487 self._downloader.trouble(u'ERROR: unable to extract title')
1489 video_title = mobj.group(1).decode('utf-8')
1490 video_title = sanitize_title(video_title)
1491 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1493 video_uploader = mobj.group(2).decode('utf-8')
1496 # Process video information
1497 self._downloader.process_info({
1498 'id': video_id.decode('utf-8'),
1499 'url': video_url.decode('utf-8'),
1500 'uploader': video_uploader,
1501 'upload_date': u'NA',
1502 'title': video_title,
1503 'stitle': simple_title,
1504 'ext': video_extension.decode('utf-8'),
1508 except UnavailableVideoError:
1509 self._downloader.trouble(u'\nERROR: unable to download video')
# Two-step extractor for Yahoo! Video. Non-/watch/ URLs are first fetched to
# recover the ("id", "vid") pair, rewritten to the canonical /watch/ form,
# and re-processed recursively (new_video=False). The media URL itself comes
# from a playlist XML served by cosmos.bcst.yahoo.com.
1512 class YahooIE(InfoExtractor):
1513 """Information extractor for video.yahoo.com."""
1515 # _VALID_URL matches all Yahoo! Video URLs
1516 # _VPAGE_URL matches only the extractable '/watch/' URLs
1517 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1518 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1520 def __init__(self, downloader=None):
1521 InfoExtractor.__init__(self, downloader)
1525 return (re.match(YahooIE._VALID_URL, url) is not None)
1527 def report_download_webpage(self, video_id):
1528 """Report webpage download."""
1529 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1531 def report_extraction(self, video_id):
1532 """Report information extraction."""
1533 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No initialization needed for Yahoo! Video.
1535 def _real_initialize(self):
# new_video: False on the recursive second pass after URL rewriting, so the
# rewritten /watch/ URL is processed without being treated as another video.
1538 def _real_extract(self, url, new_video=True):
1539 # Extract ID from URL
1540 mobj = re.match(self._VALID_URL, url)
1542 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1545 # At this point we have a new video
1546 self._downloader.increment_downloads()
1547 video_id = mobj.group(2)
1548 video_extension = 'flv'
1550 # Rewrite valid but non-extractable URLs as
1551 # extractable English language /watch/ URLs
1552 if re.match(self._VPAGE_URL, url) is None:
1553 request = urllib2.Request(url)
1555 webpage = urllib2.urlopen(request).read()
1556 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1557 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1560 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1562 self._downloader.trouble(u'ERROR: Unable to extract id field')
1564 yahoo_id = mobj.group(1)
1566 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1568 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1570 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/<vid>/<id> URL.
1572 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1573 return self._real_extract(url, new_video=False)
1575 # Retrieve video webpage to extract further information
1576 request = urllib2.Request(url)
1578 self.report_download_webpage(video_id)
1579 webpage = urllib2.urlopen(request).read()
1580 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1581 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1584 # Extract uploader and title from webpage
1585 self.report_extraction(video_id)
1586 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1588 self._downloader.trouble(u'ERROR: unable to extract video title')
1590 video_title = mobj.group(1).decode('utf-8')
1591 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1593 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1595 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) is the "(people|profile)" path segment of the href;
# the displayed uploader name is captured by group(2). Looks wrong — verify.
1597 video_uploader = mobj.group(1).decode('utf-8')
1599 # Extract video thumbnail
1600 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1602 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1604 video_thumbnail = mobj.group(1).decode('utf-8')
1606 # Extract video description
1607 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1609 self._downloader.trouble(u'ERROR: unable to extract video description')
1611 video_description = mobj.group(1).decode('utf-8')
1612 if not video_description: video_description = 'No description available.'
1614 # Extract video height and width
# Dimensions are required by the playlist request below (vidH/vidW params).
1615 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1617 self._downloader.trouble(u'ERROR: unable to extract video height')
1619 yv_video_height = mobj.group(1)
1621 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1623 self._downloader.trouble(u'ERROR: unable to extract video width')
1625 yv_video_width = mobj.group(1)
1627 # Retrieve video playlist to extract media URL
1628 # I'm not completely sure what all these options are, but we
1629 # seem to need most of them, otherwise the server sends a 401.
1630 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1631 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1632 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1633 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1634 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1636 self.report_download_webpage(video_id)
1637 webpage = urllib2.urlopen(request).read()
1638 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1639 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1642 # Extract media URL from playlist XML
1643 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1645 self._downloader.trouble(u'ERROR: Unable to extract media URL')
# Final URL is APP + FULLPATH, unquoted and with HTML entities resolved.
1647 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1648 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1651 # Process video information
1652 self._downloader.process_info({
1653 'id': video_id.decode('utf-8'),
1655 'uploader': video_uploader,
1656 'upload_date': u'NA',
1657 'title': video_title,
1658 'stitle': simple_title,
1659 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this literal;
# Python keeps the later entries, so the two lines below win. Also,
# video_thumbnail is already unicode (decoded at extraction above), so the
# extra .decode('utf-8') on the first 'thumbnail' entry is redundant and can
# raise for non-ASCII values — the duplicates should be removed.
1660 'thumbnail': video_thumbnail.decode('utf-8'),
1661 'description': video_description,
1662 'thumbnail': video_thumbnail,
1663 'description': video_description,
1666 except UnavailableVideoError:
1667 self._downloader.trouble(u'\nERROR: unable to download video')
# Last-resort extractor for arbitrary pages: looks for a JW Player/SWFObject
# "file=" flashvar, then a broader "file="/"source=" parameter, and derives
# id, extension and uploader heuristically from the URL itself.
1670 class GenericIE(InfoExtractor):
1671 """Generic last-resort information extractor."""
1673 def __init__(self, downloader=None):
1674 InfoExtractor.__init__(self, downloader)
1680 def report_download_webpage(self, video_id):
1681 """Report webpage download."""
# Warn loudly: reaching this extractor means no specific one matched.
1682 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1683 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1685 def report_extraction(self, video_id):
1686 """Report information extraction."""
1687 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# No initialization needed for the generic extractor.
1689 def _real_initialize(self):
1692 def _real_extract(self, url):
1693 # At this point we have a new video
1694 self._downloader.increment_downloads()
# Provisional id: last path segment; refined from the media URL below.
1696 video_id = url.split('/')[-1]
1697 request = urllib2.Request(url)
1699 self.report_download_webpage(video_id)
1700 webpage = urllib2.urlopen(request).read()
1701 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1702 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1704 except ValueError, err:
1705 # since this is the last-resort InfoExtractor, if
1706 # this error is thrown, it'll be thrown here
1707 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1710 self.report_extraction(video_id)
1711 # Start with something easy: JW Player in SWFObject
1712 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1714 # Broaden the search a little bit
1715 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1717 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1720 # It's possible that one of the regexes
1721 # matched, but returned an empty group:
1722 if mobj.group(1) is None:
1723 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1726 video_url = urllib.unquote(mobj.group(1))
1727 video_id = os.path.basename(video_url)
1729 # here's a fun little line of code for you:
# Split "name.ext" once: extension without the dot, then id without the ext.
1730 video_extension = os.path.splitext(video_id)[1][1:]
1731 video_id = os.path.splitext(video_id)[0]
1733 # it's tempting to parse this further, but you would
1734 # have to take into account all the variations like
1735 # Video Title - Site Name
1736 # Site Name | Video Title
1737 # Video Title - Tagline | Site Name
1738 # and so on and so forth; it's just not practical
1739 mobj = re.search(r'<title>(.*)</title>', webpage)
1741 self._downloader.trouble(u'ERROR: unable to extract title')
1743 video_title = mobj.group(1).decode('utf-8')
1744 video_title = sanitize_title(video_title)
1745 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1747 # video uploader is domain name
1748 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): message says "title" but this guard covers the
# uploader/domain match — looks like a copy-paste slip in the error text.
1750 self._downloader.trouble(u'ERROR: unable to extract title')
1752 video_uploader = mobj.group(1).decode('utf-8')
1755 # Process video information
1756 self._downloader.process_info({
1757 'id': video_id.decode('utf-8'),
1758 'url': video_url.decode('utf-8'),
1759 'uploader': video_uploader,
1760 'upload_date': u'NA',
1761 'title': video_title,
1762 'stitle': simple_title,
1763 'ext': video_extension.decode('utf-8'),
1767 except UnavailableVideoError, err:
1768 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles "ytsearch[N|all]:<terms>" queries: parses the requested result
# count from the prefix, then pages through YouTube search results, feeding
# each found video id to the wrapped YoutubeIE.
1771 class YoutubeSearchIE(InfoExtractor):
1772 """Information Extractor for YouTube search queries."""
1773 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1774 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1775 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1776 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on results; matches what the site will actually return.
1778 _max_youtube_results = 1000
# youtube_ie: the YoutubeIE instance that performs the actual extraction.
1780 def __init__(self, youtube_ie, downloader=None):
1781 InfoExtractor.__init__(self, downloader)
1782 self._youtube_ie = youtube_ie
1786 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1788 def report_download_page(self, query, pagenum):
1789 """Report attempt to download playlist page with given number."""
1790 query = query.decode(preferredencoding())
1791 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1793 def _real_initialize(self):
1794 self._youtube_ie.initialize()
# Prefix semantics: "ytsearch:" -> 1 result, "ytsearchall:" -> the maximum,
# "ytsearchN:" -> N results (clamped to the maximum).
1796 def _real_extract(self, query):
1797 mobj = re.match(self._VALID_QUERY, query)
1799 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1802 prefix, query = query.split(':')
1804 query = query.encode('utf-8')
1806 self._download_n_results(query, 1)
1808 elif prefix == 'all':
1809 self._download_n_results(query, self._max_youtube_results)
1815 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1817 elif n > self._max_youtube_results:
1818 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1819 n = self._max_youtube_results
1820 self._download_n_results(query, n)
1822 except ValueError: # parsing prefix as integer fails
1823 self._download_n_results(query, 1)
1826 def _download_n_results(self, query, n):
1827 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids, since a result page links each video
# multiple times.
1830 already_seen = set()
1834 self.report_download_page(query, pagenum)
1835 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1836 request = urllib2.Request(result_url)
1838 page = urllib2.urlopen(request).read()
1839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1840 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1843 # Extract video identifiers
1844 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# The match is 'href="/watch?v=ID"'; split on '=' takes the id ([2]) and
# [:-1] strips the trailing double quote.
1845 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1846 if video_id not in already_seen:
1847 video_ids.append(video_id)
1848 already_seen.add(video_id)
1849 if len(video_ids) == n:
1850 # Specified n videos reached
1851 for id in video_ids:
1852 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: fewer than n results exist; extract what was collected.
1855 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1856 for id in video_ids:
1857 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1860 pagenum = pagenum + 1
# Handles "gvsearch[N|all]:<terms>" queries; same structure as
# YoutubeSearchIE but pages through Google Video search results and feeds
# docids to the wrapped GoogleIE.
1862 class GoogleSearchIE(InfoExtractor):
1863 """Information Extractor for Google Video search queries."""
1864 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1865 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Group 1 of _VIDEO_INDICATOR captures the docid directly.
1866 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1867 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1869 _max_google_results = 1000
# google_ie: the GoogleIE instance that performs the actual extraction.
1871 def __init__(self, google_ie, downloader=None):
1872 InfoExtractor.__init__(self, downloader)
1873 self._google_ie = google_ie
1877 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1879 def report_download_page(self, query, pagenum):
1880 """Report attempt to download playlist page with given number."""
1881 query = query.decode(preferredencoding())
1882 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1884 def _real_initialize(self):
1885 self._google_ie.initialize()
# Prefix semantics: "gvsearch:" -> 1 result, "gvsearchall:" -> the maximum,
# "gvsearchN:" -> N results (clamped to the maximum).
1887 def _real_extract(self, query):
1888 mobj = re.match(self._VALID_QUERY, query)
1890 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1893 prefix, query = query.split(':')
1895 query = query.encode('utf-8')
1897 self._download_n_results(query, 1)
1899 elif prefix == 'all':
1900 self._download_n_results(query, self._max_google_results)
1906 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1908 elif n > self._max_google_results:
1909 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1910 n = self._max_google_results
1911 self._download_n_results(query, n)
1913 except ValueError: # parsing prefix as integer fails
1914 self._download_n_results(query, 1)
1917 def _download_n_results(self, query, n):
1918 """Downloads a specified number of results for a query"""
# already_seen deduplicates docids repeated within a results page.
1921 already_seen = set()
1925 self.report_download_page(query, pagenum)
1926 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1927 request = urllib2.Request(result_url)
1929 page = urllib2.urlopen(request).read()
1930 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1931 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1934 # Extract video identifiers
1935 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1936 video_id = mobj.group(1)
1937 if video_id not in already_seen:
1938 video_ids.append(video_id)
1939 already_seen.add(video_id)
1940 if len(video_ids) == n:
1941 # Specified n videos reached
1942 for id in video_ids:
1943 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link: fewer than n results exist; extract what was collected.
1946 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1947 for id in video_ids:
1948 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1951 pagenum = pagenum + 1
# Handles "yvsearch[N|all]:<terms>" queries; same structure as
# YoutubeSearchIE but pages through Yahoo! Video search results and feeds
# watch-path ids to the wrapped YahooIE.
1953 class YahooSearchIE(InfoExtractor):
1954 """Information Extractor for Yahoo! Video search queries."""
1955 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1956 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# Group 1 captures the "<vid>/<id>" watch path used to rebuild the URL.
1957 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1958 _MORE_PAGES_INDICATOR = r'\s*Next'
1960 _max_yahoo_results = 1000
# yahoo_ie: the YahooIE instance that performs the actual extraction.
1962 def __init__(self, yahoo_ie, downloader=None):
1963 InfoExtractor.__init__(self, downloader)
1964 self._yahoo_ie = yahoo_ie
1968 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1970 def report_download_page(self, query, pagenum):
1971 """Report attempt to download playlist page with given number."""
1972 query = query.decode(preferredencoding())
1973 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1975 def _real_initialize(self):
1976 self._yahoo_ie.initialize()
# Prefix semantics: "yvsearch:" -> 1 result, "yvsearchall:" -> the maximum,
# "yvsearchN:" -> N results (clamped to the maximum).
1978 def _real_extract(self, query):
1979 mobj = re.match(self._VALID_QUERY, query)
1981 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1984 prefix, query = query.split(':')
1986 query = query.encode('utf-8')
1988 self._download_n_results(query, 1)
1990 elif prefix == 'all':
1991 self._download_n_results(query, self._max_yahoo_results)
1997 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1999 elif n > self._max_yahoo_results:
2000 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2001 n = self._max_yahoo_results
2002 self._download_n_results(query, n)
2004 except ValueError: # parsing prefix as integer fails
2005 self._download_n_results(query, 1)
2008 def _download_n_results(self, query, n):
2009 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids repeated within a results page.
2012 already_seen = set()
2016 self.report_download_page(query, pagenum)
2017 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2018 request = urllib2.Request(result_url)
2020 page = urllib2.urlopen(request).read()
2021 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2022 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2025 # Extract video identifiers
2026 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2027 video_id = mobj.group(1)
2028 if video_id not in already_seen:
2029 video_ids.append(video_id)
2030 already_seen.add(video_id)
2031 if len(video_ids) == n:
2032 # Specified n videos reached
2033 for id in video_ids:
2034 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link: fewer than n results exist; extract what was collected.
2037 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2038 for id in video_ids:
2039 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2042 pagenum = pagenum + 1
# Information extractor for YouTube playlist / my_playlists / user-list
# URLs: collects every video id across all playlist pages, applies the
# --playlist-start/--playlist-end window, then hands each id to YoutubeIE.
# NOTE(review): this listing is line-sampled -- `suitable` headers,
# `try:` lines, loop setup and the loop-exit `break` fall in the gaps.
2044 class YoutubePlaylistIE(InfoExtractor):
2045 """Information Extractor for YouTube playlists."""
2047 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
2048 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2049 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2050 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2053 def __init__(self, youtube_ie, downloader=None):
2054 InfoExtractor.__init__(self, downloader)
# Actual per-video extraction is delegated to this YoutubeIE.
2055 self._youtube_ie = youtube_ie
2059 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2061 def report_download_page(self, playlist_id, pagenum):
2062 """Report attempt to download playlist page with given number."""
2063 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2065 def _real_initialize(self):
2066 self._youtube_ie.initialize()
2068 def _real_extract(self, url):
2069 # Extract playlist id
2070 mobj = re.match(self._VALID_URL, url)
2072 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2075 # Download playlist pages
2076 playlist_id = mobj.group(1)
2081 self.report_download_page(playlist_id, pagenum)
2082 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2084 page = urllib2.urlopen(request).read()
2085 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2086 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2089 # Extract video identifiers
# Per-page de-duplication only; cross-page duplicates are kept as-is.
2091 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2092 if mobj.group(1) not in ids_in_page:
2093 ids_in_page.append(mobj.group(1))
2094 video_ids.extend(ids_in_page)
# Absence of a "Next" link means the last page was reached.
2096 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2098 pagenum = pagenum + 1
# Apply the user-requested playlist window (1-based start -> 0-based slice;
# playlistend of -1 drops the final element of the slice -- see also the
# identical logic in YoutubeUserIE).
2100 playliststart = self._downloader.params.get('playliststart', 1) - 1
2101 playlistend = self._downloader.params.get('playlistend', -1)
2102 video_ids = video_ids[playliststart:playlistend]
2104 for id in video_ids:
2105 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Information extractor for youtube.com/user/<name> pages: fetches the
# user's GData feed, collects video ids, applies the playlist window and
# delegates each id to YoutubeIE.
# NOTE(review): line-sampled listing -- the `suitable` header, `try:`
# lines and list initializations fall in the gaps between numbered lines.
2108 class YoutubeUserIE(InfoExtractor):
2109 """Information Extractor for YouTube users."""
2111 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2112 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# Greedy `(.*)` capture is acknowledged as too loose by the original author.
2113 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2116 def __init__(self, youtube_ie, downloader=None):
2117 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this YoutubeIE.
2118 self._youtube_ie = youtube_ie
2122 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2124 def report_download_page(self, username):
2125 """Report attempt to download user page."""
2126 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2128 def _real_initialize(self):
2129 self._youtube_ie.initialize()
2131 def _real_extract(self, url):
2133 mobj = re.match(self._VALID_URL, url)
2135 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2138 # Download user page
2139 username = mobj.group(1)
2143 self.report_download_page(username)
2144 request = urllib2.Request(self._TEMPLATE_URL % (username))
2146 page = urllib2.urlopen(request).read()
2147 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2148 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2151 # Extract video identifiers
2154 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2155 if mobj.group(1) not in ids_in_page:
2156 ids_in_page.append(mobj.group(1))
2157 video_ids.extend(ids_in_page)
# Same playlist-window slicing as YoutubePlaylistIE: 1-based start becomes
# a 0-based slice index; playlistend -1 trims the last collected element.
2159 playliststart = self._downloader.params.get('playliststart', 1) - 1
2160 playlistend = self._downloader.params.get('playlistend', -1)
2161 video_ids = video_ids[playliststart:playlistend]
2163 for id in video_ids:
2164 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Information extractor for depositfiles.com file pages.  Unlike the
# search/playlist IEs above it downloads the file itself (one download
# per URL), scraping the real fileshare URL out of the "Free download"
# response page.
# NOTE(review): line-sampled listing -- `suitable` header, `try:` lines,
# `return` statements and two dict entries of process_info() fall in the
# gaps between the numbered lines.
2167 class DepositFilesIE(InfoExtractor):
2168 """Information extractor for depositfiles.com"""
# `(?#locale)` is a regex comment: the `../` component it annotates is a
# two-character locale path segment such as "en/".
2170 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2172 def __init__(self, downloader=None):
2173 InfoExtractor.__init__(self, downloader)
2177 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2179 def report_download_webpage(self, file_id):
2180 """Report webpage download."""
2181 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2183 def report_extraction(self, file_id):
2184 """Report information extraction."""
2185 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2187 def _real_initialize(self):
2190 def _real_extract(self, url):
2191 # At this point we have a new file
2192 self._downloader.increment_downloads()
2194 file_id = url.split('/')[-1]
2195 # Rebuild url in english locale
2196 url = 'http://depositfiles.com/en/files/' + file_id
2198 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
2199 free_download_indication = { 'gateway_result' : '1' }
2200 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2202 self.report_download_webpage(file_id)
2203 webpage = urllib2.urlopen(request).read()
2204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2208 # Search for the real file URL
2209 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2210 if (mobj is None) or (mobj.group(1) is None):
2211 # Try to figure out reason of the error.
# The site explains download restrictions in a <strong>Attention...</strong>
# blurb; surface that text instead of a generic failure when present.
2212 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2213 if (mobj is not None) and (mobj.group(1) is not None):
2214 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2215 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2217 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2220 file_url = mobj.group(1)
# Extension without the leading dot, e.g. "zip".
2221 file_extension = os.path.splitext(file_url)[1][1:]
2223 # Search for file title
2224 mobj = re.search(r'<b title="(.*?)">', webpage)
2226 self._downloader.trouble(u'ERROR: unable to extract title')
2228 file_title = mobj.group(1).decode('utf-8')
2231 # Process file information
2232 self._downloader.process_info({
2233 'id': file_id.decode('utf-8'),
2234 'url': file_url.decode('utf-8'),
2236 'upload_date': u'NA',
2237 'title': file_title,
2238 'stitle': file_title,
2239 'ext': file_extension.decode('utf-8'),
2243 except UnavailableVideoError, err:
2244 self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
	"""Base class for all post-processing steps.

	Instances are registered on a downloader with its
	add_post_processor() method.  After each successful download the
	downloader walks its chain of PostProcessors, calling run() on every
	one: the first receives an initial information dictionary, each
	subsequent one receives the value returned by its predecessor.

	The chain stops as soon as a processor returns None, or when the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects: they remember the downloader they serve.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors, except that it carries an extra
		"filepath" field pointing at the downloaded file.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it to the next processor in the chain.

		This method may also raise a PostProcessingError exception,
		which the downloader takes into account.

		The base implementation performs no processing at all: it
		simply hands the dictionary back unchanged.
		"""
		return information
# Script entry point: builds the option parser, configures cookies and the
# urllib2 opener, assembles all InfoExtractors onto a FileDownloader, and
# runs the downloads.
# NOTE(review): line-sampled listing -- the `import` lines after the
# "Modules needed" comment, the outer `try:` that the trailing
# DownloadError/SameFileError/KeyboardInterrupt handlers belong to, and
# several `else:`/`try:`/`sys.exit(retcode)` lines fall in the gaps.
2292 ### MAIN PROGRAM ###
2293 if __name__ == '__main__':
2295 # Modules needed only when running the main program
2299 # Function to update the program file with the latest version from bitbucket.org
# Self-update: fetch LATEST_VERSION, then overwrite this script in place
# with the matching tagged build from GitHub.
# NOTE(review): the file is opened in text mode 'w' and no stream.close()
# is visible in this excerpt -- presumably it sits in the sampled-out line;
# verify against the full source.
2300 def update_self(downloader, filename):
2301 # Note: downloader only used for options
2302 if not os.access (filename, os.W_OK):
2303 sys.exit('ERROR: no write permissions on %s' % filename)
2305 downloader.to_screen('Updating to latest stable version...')
2306 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2307 latest_version = urllib.urlopen(latest_url).read().strip()
2308 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2309 newcontent = urllib.urlopen(prog_url).read()
2310 stream = open(filename, 'w')
2311 stream.write(newcontent)
2313 downloader.to_screen('Updated to version %s' % latest_version)
2315 # Parse command line
# conflict_handler='resolve' lets -h/-v be redefined below with short help.
2316 parser = optparse.OptionParser(
2317 usage='Usage: %prog [options] url...',
2318 version='2010.12.09',
2319 conflict_handler='resolve',
2322 parser.add_option('-h', '--help',
2323 action='help', help='print this help text and exit')
2324 parser.add_option('-v', '--version',
2325 action='version', help='print program version and exit')
2326 parser.add_option('-U', '--update',
2327 action='store_true', dest='update_self', help='update this program to latest stable version')
2328 parser.add_option('-i', '--ignore-errors',
2329 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2330 parser.add_option('-r', '--rate-limit',
2331 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2332 parser.add_option('-R', '--retries',
2333 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2334 parser.add_option('--playlist-start',
2335 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2336 parser.add_option('--playlist-end',
2337 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2338 parser.add_option('--dump-user-agent',
2339 action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
2341 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2342 authentication.add_option('-u', '--username',
2343 dest='username', metavar='USERNAME', help='account username')
2344 authentication.add_option('-p', '--password',
2345 dest='password', metavar='PASSWORD', help='account password')
2346 authentication.add_option('-n', '--netrc',
2347 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2348 parser.add_option_group(authentication)
2350 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2351 video_format.add_option('-f', '--format',
2352 action='store', dest='format', metavar='FORMAT', help='video format code')
# --all-formats reuses dest='format' with sentinel '-1' (checked when
# building the outtmpl default below).
2353 video_format.add_option('--all-formats',
2354 action='store_const', dest='format', help='download all available video formats', const='-1')
2355 video_format.add_option('--max-quality',
2356 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2357 parser.add_option_group(video_format)
2359 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2360 verbosity.add_option('-q', '--quiet',
2361 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2362 verbosity.add_option('-s', '--simulate',
2363 action='store_true', dest='simulate', help='do not download video', default=False)
2364 verbosity.add_option('-g', '--get-url',
2365 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2366 verbosity.add_option('-e', '--get-title',
2367 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2368 verbosity.add_option('--get-thumbnail',
2369 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2370 verbosity.add_option('--get-description',
2371 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2372 verbosity.add_option('--no-progress',
2373 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2374 verbosity.add_option('--console-title',
2375 action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
2376 parser.add_option_group(verbosity)
2378 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2379 filesystem.add_option('-t', '--title',
2380 action='store_true', dest='usetitle', help='use title in file name', default=False)
2381 filesystem.add_option('-l', '--literal',
2382 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2383 filesystem.add_option('-A', '--auto-number',
2384 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2385 filesystem.add_option('-o', '--output',
2386 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2387 filesystem.add_option('-a', '--batch-file',
2388 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2389 filesystem.add_option('-w', '--no-overwrites',
2390 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2391 filesystem.add_option('-c', '--continue',
2392 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2393 filesystem.add_option('--cookies',
2394 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2395 filesystem.add_option('--no-part',
2396 action='store_true', dest='nopart', help='do not use .part files', default=False)
2397 parser.add_option_group(filesystem)
2399 (opts, args) = parser.parse_args()
2401 # Open appropriate CookieJar
# Without --cookies: in-memory jar only; with it: a Mozilla-format jar
# loaded from disk when the file already exists and is readable.
2402 if opts.cookiefile is None:
2403 jar = cookielib.CookieJar()
2406 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2407 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2409 except (IOError, OSError), err:
2410 sys.exit(u'ERROR: unable to open cookie file')
2413 if opts.dump_user_agent:
2414 print std_headers['User-Agent']
2417 # General configuration
# Install a global opener so every urllib2 call in the IEs shares the
# proxy handler, the cookie jar and the project's YoutubeDLHandler.
2418 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2419 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2420 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2422 # Batch file verification
2424 if opts.batchfile is not None:
2426 if opts.batchfile == '-':
2429 batchfd = open(opts.batchfile, 'r')
2430 batchurls = batchfd.readlines()
2431 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and lines starting with a comment marker (#, / or ;).
2432 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2434 sys.exit(u'ERROR: batch file could not be read')
2435 all_urls = batchurls + args
2437 # Conflicting, missing and erroneous options
2438 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2439 parser.error(u'using .netrc conflicts with giving username/password')
2440 if opts.password is not None and opts.username is None:
2441 parser.error(u'account username missing')
2442 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2443 parser.error(u'using output template conflicts with using title, literal title or auto number')
2444 if opts.usetitle and opts.useliteral:
2445 parser.error(u'using title conflicts with using literal title')
2446 if opts.username is not None and opts.password is None:
2447 opts.password = getpass.getpass(u'Type account password and press return:')
2448 if opts.ratelimit is not None:
2449 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2450 if numeric_limit is None:
2451 parser.error(u'invalid rate limit specified')
2452 opts.ratelimit = numeric_limit
2453 if opts.retries is not None:
2455 opts.retries = long(opts.retries)
2456 except (TypeError, ValueError), err:
2457 parser.error(u'invalid retry count specified')
2459 opts.playliststart = long(opts.playliststart)
2460 if opts.playliststart <= 0:
2462 except (TypeError, ValueError), err:
2463 parser.error(u'invalid playlist start number specified')
2465 opts.playlistend = long(opts.playlistend)
2466 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2468 except (TypeError, ValueError), err:
2469 parser.error(u'invalid playlist end number specified')
2471 # Information extractors
# Search/playlist/user IEs wrap the concrete site IEs they delegate to.
2472 youtube_ie = YoutubeIE()
2473 metacafe_ie = MetacafeIE(youtube_ie)
2474 dailymotion_ie = DailymotionIE()
2475 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2476 youtube_user_ie = YoutubeUserIE(youtube_ie)
2477 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2478 google_ie = GoogleIE()
2479 google_search_ie = GoogleSearchIE(google_ie)
2480 photobucket_ie = PhotobucketIE()
2481 yahoo_ie = YahooIE()
2482 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2483 deposit_files_ie = DepositFilesIE()
2484 generic_ie = GenericIE()
2487 fd = FileDownloader({
2488 'usenetrc': opts.usenetrc,
2489 'username': opts.username,
2490 'password': opts.password,
# Any of the --get-* flags implies both quiet and simulate.
2491 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2492 'forceurl': opts.geturl,
2493 'forcetitle': opts.gettitle,
2494 'forcethumbnail': opts.getthumbnail,
2495 'forcedescription': opts.getdescription,
2496 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2497 'format': opts.format,
2498 'format_limit': opts.format_limit,
# Output template: explicit -o wins; otherwise pick a default based on
# --all-formats ('-1'), title/literal-title and auto-number flags, falling
# through to plain '%(id)s.%(ext)s'.
2499 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2500 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2501 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2502 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2503 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2504 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2505 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2506 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2507 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2508 or u'%(id)s.%(ext)s'),
2509 'ignoreerrors': opts.ignoreerrors,
2510 'ratelimit': opts.ratelimit,
2511 'nooverwrites': opts.nooverwrites,
2512 'retries': opts.retries,
2513 'continuedl': opts.continue_dl,
2514 'noprogress': opts.noprogress,
2515 'playliststart': opts.playliststart,
2516 'playlistend': opts.playlistend,
# '-o -' streams the video to stdout, so logging must move to stderr.
2517 'logtostderr': opts.outtmpl == '-',
2518 'consoletitle': opts.consoletitle,
2519 'nopart': opts.nopart,
# Registration order matters: more specific IEs (search, playlist, user)
# must be consulted before the site IEs they wrap.
2521 fd.add_info_extractor(youtube_search_ie)
2522 fd.add_info_extractor(youtube_pl_ie)
2523 fd.add_info_extractor(youtube_user_ie)
2524 fd.add_info_extractor(metacafe_ie)
2525 fd.add_info_extractor(dailymotion_ie)
2526 fd.add_info_extractor(youtube_ie)
2527 fd.add_info_extractor(google_ie)
2528 fd.add_info_extractor(google_search_ie)
2529 fd.add_info_extractor(photobucket_ie)
2530 fd.add_info_extractor(yahoo_ie)
2531 fd.add_info_extractor(yahoo_search_ie)
2532 fd.add_info_extractor(deposit_files_ie)
2534 # This must come last since it's the
2535 # fallback if none of the others work
2536 fd.add_info_extractor(generic_ie)
2539 if opts.update_self:
2540 update_self(fd, sys.argv[0])
2543 if len(all_urls) < 1:
2544 if not opts.update_self:
2545 parser.error(u'you must provide at least one URL')
2548 retcode = fd.download(all_urls)
2550 # Dump cookie jar if requested
2551 if opts.cookiefile is not None:
2554 except (IOError, OSError), err:
2555 sys.exit(u'ERROR: unable to save cookie jar')
# Handlers for the outer try: wrapped around the whole run (the `try:`
# line itself is sampled out of this excerpt).
2559 except DownloadError:
2561 except SameFileError:
2562 sys.exit(u'ERROR: fixed output name but more than one file to download')
2563 except KeyboardInterrupt:
2564 sys.exit(u'\nERROR: Interrupted by user')