youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # Author: Vasyl' Vavrychuk
   7 # Author: Witold Baryluk
   8 # License: Public domain code
   9 import cookielib
  10 import ctypes
  11 import datetime
  12 import email.utils
  13 import gzip
  14 import htmlentitydefs
  15 import httplib
  16 import locale
  17 import math
  18 import netrc
  19 import os
  20 import os.path
  21 import re
  22 import socket
  23 import string
  24 import StringIO
  25 import subprocess
  26 import sys
  27 import time
  28 import urllib
  29 import urllib2
  30 import zlib
  31
  32 # parse_qs was moved from the cgi module to the urlparse module recently.
  33 try:
  34         from urlparse import parse_qs
  35 except ImportError:
  36         from cgi import parse_qs
  37
  38 std_headers = {
  39         'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
  40         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  41         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  42         'Accept-Encoding': 'gzip, deflate',
  43         'Accept-Language': 'en-us,en;q=0.5',
  44 }
  45
  46 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  47
  48 def preferredencoding():
  49         """Get preferred encoding.
  50
  51         Returns the best encoding scheme for the system, based on
  52         locale.getpreferredencoding() and some further tweaks.
  53         """
  54         def yield_preferredencoding():
  55                 try:
  56                         pref = locale.getpreferredencoding()
  57                         u'TEST'.encode(pref)
  58                 except:
  59                         pref = 'UTF-8'
  60                 while True:
  61                         yield pref
  62         return yield_preferredencoding().next()
  63
  64 def htmlentity_transform(matchobj):
  65         """Transforms an HTML entity to a Unicode character.
  66
  67         This function receives a match object and is intended to be used with
  68         the re.sub() function.
  69         """
  70         entity = matchobj.group(1)
  71
  72         # Known non-numeric HTML entity
  73         if entity in htmlentitydefs.name2codepoint:
  74                 return unichr(htmlentitydefs.name2codepoint[entity])
  75
  76         # Unicode character
  77         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  78         if mobj is not None:
  79                 numstr = mobj.group(1)
  80                 if numstr.startswith(u'x'):
  81                         base = 16
  82                         numstr = u'0%s' % numstr
  83                 else:
  84                         base = 10
  85                 return unichr(long(numstr, base))
  86
  87         # Unknown entity in name, return its literal representation
  88         return (u'&%s;' % entity)
  89
  90 def sanitize_title(utitle):
  91         """Sanitizes a video title so it could be used as part of a filename."""
  92         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  93         return utitle.replace(unicode(os.sep), u'%')
  94
  95 def sanitize_open(filename, open_mode):
  96         """Try to open the given filename, and slightly tweak it if this fails.
  97
  98         Attempts to open the given filename. If this fails, it tries to change
  99         the filename slightly, step by step, until it's either able to open it
 100         or it fails and raises a final exception, like the standard open()
 101         function.
 102
 103         It returns the tuple (stream, definitive_file_name).
 104         """
 105         try:
 106                 if filename == u'-':
 107                         if sys.platform == 'win32':
 108                                 import msvcrt
 109                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 110                         return (sys.stdout, filename)
 111                 stream = open(filename, open_mode)
 112                 return (stream, filename)
 113         except (IOError, OSError), err:
 114                 # In case of error, try to remove win32 forbidden chars
 115                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 116
 117                 # An exception here should be caught in the caller
 118                 stream = open(filename, open_mode)
 119                 return (stream, filename)
 120
 121 def timeconvert(timestr):
 122     """Convert RFC 2822 defined time string into system timestamp"""
 123     timestamp = None
 124     timetuple = email.utils.parsedate_tz(timestr)
 125     if timetuple is not None:
 126         timestamp = email.utils.mktime_tz(timetuple)
 127     return timestamp
 128
 129 class DownloadError(Exception):
 130         """Download Error exception.
 131
 132         This exception may be thrown by FileDownloader objects if they are not
 133         configured to continue on errors. They will contain the appropriate
 134         error message.
 135         """
 136         pass
 137
 138 class SameFileError(Exception):
 139         """Same File exception.
 140
 141         This exception will be thrown by FileDownloader objects if they detect
 142         multiple files would have to be downloaded to the same file on disk.
 143         """
 144         pass
 145
 146 class PostProcessingError(Exception):
 147         """Post Processing exception.
 148
 149         This exception may be raised by PostProcessor's .run() method to
 150         indicate an error in the postprocessing task.
 151         """
 152         pass
 153
 154 class UnavailableVideoError(Exception):
 155         """Unavailable Format exception.
 156
 157         This exception will be thrown when a video is requested
 158         in a format that is not available for that video.
 159         """
 160         pass
 161
 162 class ContentTooShortError(Exception):
 163         """Content Too Short exception.
 164
 165         This exception may be raised by FileDownloader objects when a file they
 166         download is too small for what the server announced first, indicating
 167         the connection was probably interrupted.
 168         """
 169         # Both in bytes
 170         downloaded = None
 171         expected = None
 172
 173         def __init__(self, downloaded, expected):
 174                 self.downloaded = downloaded
 175                 self.expected = expected
 176
 177 class YoutubeDLHandler(urllib2.HTTPHandler):
 178         """Handler for HTTP requests and responses.
 179
 180         This class, when installed with an OpenerDirector, automatically adds
 181         the standard headers to every HTTP request and handles gzipped and
 182         deflated responses from web servers. If compression is to be avoided in
 183         a particular request, the original request in the program code only has
 184         to include the HTTP header "Youtubedl-No-Compression", which will be
 185         removed before making the real request.
 186
 187         Part of this code was copied from:
 188
 189           http://techknack.net/python-urllib2-handlers/
 190
 191         Andrew Rowls, the author of that code, agreed to release it to the
 192         public domain.
 193         """
 194
 195         @staticmethod
 196         def deflate(data):
 197                 try:
 198                         return zlib.decompress(data, -zlib.MAX_WBITS)
 199                 except zlib.error:
 200                         return zlib.decompress(data)
 201
 202         @staticmethod
 203         def addinfourl_wrapper(stream, headers, url, code):
 204                 if hasattr(urllib2.addinfourl, 'getcode'):
 205                         return urllib2.addinfourl(stream, headers, url, code)
 206                 ret = urllib2.addinfourl(stream, headers, url)
 207                 ret.code = code
 208                 return ret
 209
 210         def http_request(self, req):
 211                 for h in std_headers:
 212                         if h in req.headers:
 213                                 del req.headers[h]
 214                         req.add_header(h, std_headers[h])
 215                 if 'Youtubedl-no-compression' in req.headers:
 216                         if 'Accept-encoding' in req.headers:
 217                                 del req.headers['Accept-encoding']
 218                         del req.headers['Youtubedl-no-compression']
 219                 return req
 220
 221         def http_response(self, req, resp):
 222                 old_resp = resp
 223                 # gzip
 224                 if resp.headers.get('Content-encoding', '') == 'gzip':
 225                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 226                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 227                         resp.msg = old_resp.msg
 228                 # deflate
 229                 if resp.headers.get('Content-encoding', '') == 'deflate':
 230                         gz = StringIO.StringIO(self.deflate(resp.read()))
 231                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 232                         resp.msg = old_resp.msg
 233                 return resp
 234
 235 class FileDownloader(object):
 236         """File Downloader class.
 237
        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information; that is the task of the
        InfoExtractors, so it has to pass the URL to one of them.
 244
        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.
 252
 253         File downloaders accept a lot of parameters. In order not to saturate
 254         the object constructor with arguments, it receives a dictionary of
 255         options instead. These options are available through the params
 256         attribute for the InfoExtractors to use. The FileDownloader also
 257         registers itself as the downloader in charge for the InfoExtractors
 258         that are added to it, so this is a "mutual registration".
 259
 260         Available options:
 261
 262         username:         Username for authentication purposes.
 263         password:         Password for authentication purposes.
 264         usenetrc:         Use netrc for authentication instead.
 265         quiet:            Do not print messages to stdout.
 266         forceurl:         Force printing final URL.
 267         forcetitle:       Force printing title.
 268         forcethumbnail:   Force printing thumbnail URL.
 269         forcedescription: Force printing description.
 270         forcefilename:    Force printing final filename.
 271         simulate:         Do not download the video files.
 272         format:           Video format code.
 273         format_limit:     Highest quality format to try.
 274         outtmpl:          Template for output names.
 275         ignoreerrors:     Do not stop on download errors.
 276         ratelimit:        Download speed limit, in bytes/sec.
 277         nooverwrites:     Prevent overwriting files.
 278         retries:          Number of times to retry for HTTP error 5xx
 279         continuedl:       Try to continue downloads if possible.
 280         noprogress:       Do not print the progress bar.
 281         playliststart:    Playlist item to start at.
 282         playlistend:      Playlist item to end at.
 283         logtostderr:      Log messages to stderr instead of stdout.
 284         consoletitle:     Display progress in console window's titlebar.
 285         nopart:           Do not use temporary .part files.
 286         updatetime:       Use the Last-modified header to set output file timestamps.
 287         """
 288
 289         params = None
 290         _ies = []
 291         _pps = []
 292         _download_retcode = None
 293         _num_downloads = None
 294         _screen_file = None
 295
 296         def __init__(self, params):
 297                 """Create a FileDownloader object with the given options."""
 298                 self._ies = []
 299                 self._pps = []
 300                 self._download_retcode = 0
 301                 self._num_downloads = 0
 302                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 303                 self.params = params
 304
 305         @staticmethod
 306         def pmkdir(filename):
 307                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 308                 components = filename.split(os.sep)
 309                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 310                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 311                 for dir in aggregate:
 312                         if not os.path.exists(dir):
 313                                 os.mkdir(dir)
 314
 315         @staticmethod
 316         def format_bytes(bytes):
 317                 if bytes is None:
 318                         return 'N/A'
 319                 if type(bytes) is str:
 320                         bytes = float(bytes)
 321                 if bytes == 0.0:
 322                         exponent = 0
 323                 else:
 324                         exponent = long(math.log(bytes, 1024.0))
 325                 suffix = 'bkMGTPEZY'[exponent]
 326                 converted = float(bytes) / float(1024**exponent)
 327                 return '%.2f%s' % (converted, suffix)
 328
 329         @staticmethod
 330         def calc_percent(byte_counter, data_len):
 331                 if data_len is None:
 332                         return '---.-%'
 333                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 334
 335         @staticmethod
 336         def calc_eta(start, now, total, current):
 337                 if total is None:
 338                         return '--:--'
 339                 dif = now - start
 340                 if current == 0 or dif < 0.001: # One millisecond
 341                         return '--:--'
 342                 rate = float(current) / dif
 343                 eta = long((float(total) - float(current)) / rate)
 344                 (eta_mins, eta_secs) = divmod(eta, 60)
 345                 if eta_mins > 99:
 346                         return '--:--'
 347                 return '%02d:%02d' % (eta_mins, eta_secs)
 348
 349         @staticmethod
 350         def calc_speed(start, now, bytes):
 351                 dif = now - start
 352                 if bytes == 0 or dif < 0.001: # One millisecond
 353                         return '%10s' % '---b/s'
 354                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 355
 356         @staticmethod
 357         def best_block_size(elapsed_time, bytes):
 358                 new_min = max(bytes / 2.0, 1.0)
 359                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 360                 if elapsed_time < 0.001:
 361                         return long(new_max)
 362                 rate = bytes / elapsed_time
 363                 if rate > new_max:
 364                         return long(new_max)
 365                 if rate < new_min:
 366                         return long(new_min)
 367                 return long(rate)
 368
 369         @staticmethod
 370         def parse_bytes(bytestr):
 371                 """Parse a string indicating a byte quantity into a long integer."""
 372                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 373                 if matchobj is None:
 374                         return None
 375                 number = float(matchobj.group(1))
 376                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 377                 return long(round(number * multiplier))
 378
 379         def add_info_extractor(self, ie):
 380                 """Add an InfoExtractor object to the end of the list."""
 381                 self._ies.append(ie)
 382                 ie.set_downloader(self)
 383
 384         def add_post_processor(self, pp):
 385                 """Add a PostProcessor object to the end of the chain."""
 386                 self._pps.append(pp)
 387                 pp.set_downloader(self)
 388
 389         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 390                 """Print message to stdout if not in quiet mode."""
 391                 try:
 392                         if not self.params.get('quiet', False):
 393                                 terminator = [u'\n', u''][skip_eol]
 394                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 395                         self._screen_file.flush()
 396                 except (UnicodeEncodeError), err:
 397                         if not ignore_encoding_errors:
 398                                 raise
 399
 400         def to_stderr(self, message):
 401                 """Print message to stderr."""
 402                 print >>sys.stderr, message.encode(preferredencoding())
 403
 404         def to_cons_title(self, message):
 405                 """Set console/terminal window title to message."""
 406                 if not self.params.get('consoletitle', False):
 407                         return
 408                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 409                         # c_wchar_p() might not be necessary if `message` is
 410                         # already of type unicode()
 411                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 412                 elif 'TERM' in os.environ:
 413                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 414
 415         def fixed_template(self):
 416                 """Checks if the output template is fixed."""
 417                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 418
 419         def trouble(self, message=None):
 420                 """Determine action to take when a download problem appears.
 421
 422                 Depending on if the downloader has been configured to ignore
 423                 download errors or not, this method may throw an exception or
 424                 not when errors are found, after printing the message.
 425                 """
 426                 if message is not None:
 427                         self.to_stderr(message)
 428                 if not self.params.get('ignoreerrors', False):
 429                         raise DownloadError(message)
 430                 self._download_retcode = 1
 431
 432         def slow_down(self, start_time, byte_counter):
 433                 """Sleep if the download speed is over the rate limit."""
 434                 rate_limit = self.params.get('ratelimit', None)
 435                 if rate_limit is None or byte_counter == 0:
 436                         return
 437                 now = time.time()
 438                 elapsed = now - start_time
 439                 if elapsed <= 0.0:
 440                         return
 441                 speed = float(byte_counter) / elapsed
 442                 if speed > rate_limit:
 443                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 444
 445         def temp_name(self, filename):
 446                 """Returns a temporary filename for the given filename."""
 447                 if self.params.get('nopart', False) or filename == u'-' or \
 448                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 449                         return filename
 450                 return filename + u'.part'
 451
 452         def undo_temp_name(self, filename):
 453                 if filename.endswith(u'.part'):
 454                         return filename[:-len(u'.part')]
 455                 return filename
 456
 457         def try_rename(self, old_filename, new_filename):
 458                 try:
 459                         if old_filename == new_filename:
 460                                 return
 461                         os.rename(old_filename, new_filename)
 462                 except (IOError, OSError), err:
 463                         self.trouble(u'ERROR: unable to rename file')
 464
 465         def try_utime(self, filename, last_modified_hdr):
 466                 """Try to set the last-modified time of the given file."""
 467                 if last_modified_hdr is None:
 468                         return
 469                 if not os.path.isfile(filename):
 470                         return
 471                 timestr = last_modified_hdr
 472                 if timestr is None:
 473                         return
 474                 filetime = timeconvert(timestr)
 475                 if filetime is None:
 476                         return
 477                 try:
 478                         os.utime(filename,(time.time(), filetime))
 479                 except:
 480                         pass
 481
 482         def report_destination(self, filename):
 483                 """Report destination filename."""
 484                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 485
 486         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 487                 """Report download progress."""
 488                 if self.params.get('noprogress', False):
 489                         return
 490                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 491                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 492                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 493                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 494
 495         def report_resuming_byte(self, resume_len):
 496                 """Report attempt to resume at given byte."""
 497                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 498
 499         def report_retry(self, count, retries):
 500                 """Report retry in case of HTTP error 5xx"""
 501                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 502
 503         def report_file_already_downloaded(self, file_name):
 504                 """Report file has already been fully downloaded."""
 505                 try:
 506                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 507                 except (UnicodeEncodeError), err:
 508                         self.to_screen(u'[download] The file has already been downloaded')
 509
 510         def report_unable_to_resume(self):
 511                 """Report it was impossible to resume download."""
 512                 self.to_screen(u'[download] Unable to resume')
 513
 514         def report_finish(self):
 515                 """Report download finished."""
 516                 if self.params.get('noprogress', False):
 517                         self.to_screen(u'[download] Download completed')
 518                 else:
 519                         self.to_screen(u'')
 520
 521         def increment_downloads(self):
 522                 """Increment the ordinal that assigns a number to each file."""
 523                 self._num_downloads += 1
 524
 525         def prepare_filename(self, info_dict):
 526                 """Generate the output filename."""
 527                 try:
 528                         template_dict = dict(info_dict)
 529                         template_dict['epoch'] = unicode(long(time.time()))
 530                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 531                         filename = self.params['outtmpl'] % template_dict
 532                         return filename
 533                 except (ValueError, KeyError), err:
 534                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 535                         return None
 536
 537         def process_info(self, info_dict):
 538                 """Process a single dictionary returned by an InfoExtractor."""
 539                 filename = self.prepare_filename(info_dict)
 540                 # Do nothing else if in simulate mode
 541                 if self.params.get('simulate', False):
 542                         # Forced printings
 543                         if self.params.get('forcetitle', False):
 544                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 545                         if self.params.get('forceurl', False):
 546                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 547                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 548                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 549                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 550                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 551                         if self.params.get('forcefilename', False) and filename is not None:
 552                                 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 553
 554                         return
 555
 556                 if filename is None:
 557                         return
 558                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 559                         self.to_stderr(u'WARNING: file exists and will be skipped')
 560                         return
 561
 562                 try:
 563                         self.pmkdir(filename)
 564                 except (OSError, IOError), err:
 565                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
 566                         return
 567
 568                 try:
 569                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 570                 except (OSError, IOError), err:
 571                         raise UnavailableVideoError
 572                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 573                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 574                         return
 575                 except (ContentTooShortError, ), err:
 576                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 577                         return
 578
 579                 if success:
 580                         try:
 581                                 self.post_process(filename, info_dict)
 582                         except (PostProcessingError), err:
 583                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
 584                                 return
 585
 586         def download(self, url_list):
 587                 """Download a given list of URLs."""
 588                 if len(url_list) > 1 and self.fixed_template():
 589                         raise SameFileError(self.params['outtmpl'])
 590
 591                 for url in url_list:
 592                         suitable_found = False
 593                         for ie in self._ies:
 594                                 # Go to next InfoExtractor if not suitable
 595                                 if not ie.suitable(url):
 596                                         continue
 597
 598                                 # Suitable InfoExtractor found
 599                                 suitable_found = True
 600
 601                                 # Extract information from URL and process it
 602                                 ie.extract(url)
 603
 604                                 # Suitable InfoExtractor had been found; go to next URL
 605                                 break
 606
 607                         if not suitable_found:
 608                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 609
 610                 return self._download_retcode
 611
 612         def post_process(self, filename, ie_info):
 613                 """Run the postprocessing chain on the given file."""
 614                 info = dict(ie_info)
 615                 info['filepath'] = filename
 616                 for pp in self._pps:
 617                         info = pp.run(info)
 618                         if info is None:
 619                                 break
 620
 621         def _download_with_rtmpdump(self, filename, url, player_url):
 622                 self.report_destination(filename)
 623                 tmpfilename = self.temp_name(filename)
 624
 625                 # Check for rtmpdump first
 626                 try:
 627                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 628                 except (OSError, IOError):
 629                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 630                         return False
 631
 632                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 633                 # the connection was interrumpted and resuming appears to be
 634                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 635                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 636                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 637                 while retval == 2 or retval == 1:
 638                         prevsize = os.path.getsize(tmpfilename)
 639                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 640                         time.sleep(5.0) # This seems to be needed
 641                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 642                         cursize = os.path.getsize(tmpfilename)
 643                         if prevsize == cursize and retval == 1:
 644                                 break
 645                 if retval == 0:
 646                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 647                         self.try_rename(tmpfilename, filename)
 648                         return True
 649                 else:
 650                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 651                         return False
 652
 653         def _do_download(self, filename, url, player_url):
 654                 # Check file already present
 655                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 656                         self.report_file_already_downloaded(filename)
 657                         return True
 658
 659                 # Attempt to download using rtmpdump
 660                 if url.startswith('rtmp'):
 661                         return self._download_with_rtmpdump(filename, url, player_url)
 662
 663                 tmpfilename = self.temp_name(filename)
 664                 stream = None
 665                 open_mode = 'wb'
 666
 667                 # Do not include the Accept-Encoding header
 668                 headers = {'Youtubedl-no-compression': 'True'}
 669                 basic_request = urllib2.Request(url, None, headers)
 670                 request = urllib2.Request(url, None, headers)
 671
 672                 # Establish possible resume length
 673                 if os.path.isfile(tmpfilename):
 674                         resume_len = os.path.getsize(tmpfilename)
 675                 else:
 676                         resume_len = 0
 677
 678                 # Request parameters in case of being able to resume
 679                 if self.params.get('continuedl', False) and resume_len != 0:
 680                         self.report_resuming_byte(resume_len)
 681                         request.add_header('Range','bytes=%d-' % resume_len)
 682                         open_mode = 'ab'
 683
 684                 count = 0
 685                 retries = self.params.get('retries', 0)
 686                 while count <= retries:
 687                         # Establish connection
 688                         try:
 689                                 data = urllib2.urlopen(request)
 690                                 break
 691                         except (urllib2.HTTPError, ), err:
 692                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 693                                         # Unexpected HTTP error
 694                                         raise
 695                                 elif err.code == 416:
 696                                         # Unable to resume (requested range not satisfiable)
 697                                         try:
 698                                                 # Open the connection again without the range header
 699                                                 data = urllib2.urlopen(basic_request)
 700                                                 content_length = data.info()['Content-Length']
 701                                         except (urllib2.HTTPError, ), err:
 702                                                 if err.code < 500 or err.code >= 600:
 703                                                         raise
 704                                         else:
 705                                                 # Examine the reported length
 706                                                 if (content_length is not None and
 707                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
 708                                                         # The file had already been fully downloaded.
 709                                                         # Explanation to the above condition: in issue #175 it was revealed that
 710                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 711                                                         # changing the file size slightly and causing problems for some users. So
 712                                                         # I decided to implement a suggested change and consider the file
 713                                                         # completely downloaded if the file size differs less than 100 bytes from
 714                                                         # the one in the hard drive.
 715                                                         self.report_file_already_downloaded(filename)
 716                                                         self.try_rename(tmpfilename, filename)
 717                                                         return True
 718                                                 else:
 719                                                         # The length does not match, we start the download over
 720                                                         self.report_unable_to_resume()
 721                                                         open_mode = 'wb'
 722                                                         break
 723                         # Retry
 724                         count += 1
 725                         if count <= retries:
 726                                 self.report_retry(count, retries)
 727
 728                 if count > retries:
 729                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 730                         return False
 731
 732                 data_len = data.info().get('Content-length', None)
 733                 if data_len is not None:
 734                         data_len = long(data_len) + resume_len
 735                 data_len_str = self.format_bytes(data_len)
 736                 byte_counter = 0 + resume_len
 737                 block_size = 1024
 738                 start = time.time()
 739                 while True:
 740                         # Download and write
 741                         before = time.time()
 742                         data_block = data.read(block_size)
 743                         after = time.time()
 744                         if len(data_block) == 0:
 745                                 break
 746                         byte_counter += len(data_block)
 747
 748                         # Open file just in time
 749                         if stream is None:
 750                                 try:
 751                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 752                                         filename = self.undo_temp_name(tmpfilename)
 753                                         self.report_destination(filename)
 754                                 except (OSError, IOError), err:
 755                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 756                                         return False
 757                         try:
 758                                 stream.write(data_block)
 759                         except (IOError, OSError), err:
 760                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 761                                 return False
 762                         block_size = self.best_block_size(after - before, len(data_block))
 763
 764                         # Progress message
 765                         percent_str = self.calc_percent(byte_counter, data_len)
 766                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 767                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 768                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 769
 770                         # Apply rate limit
 771                         self.slow_down(start, byte_counter - resume_len)
 772
 773                 stream.close()
 774                 self.report_finish()
 775                 if data_len is not None and byte_counter != data_len:
 776                         raise ContentTooShortError(byte_counter, long(data_len))
 777                 self.try_rename(tmpfilename, filename)
 778
 779                 # Update file modification time
 780                 if self.params.get('updatetime', True):
 781                         self.try_utime(filename, data.info().get('last-modified', None))
 782
 783                 return True
 784
 785 class InfoExtractor(object):
 786         """Information Extractor class.
 787
 788         Information extractors are the classes that, given a URL, extract
 789         information from the video (or videos) the URL refers to. This
 790         information includes the real video URL, the video title and simplified
 791         title, author and others. The information is stored in a dictionary
 792         which is then passed to the FileDownloader. The FileDownloader
 793         processes this information possibly downloading the video to the file
 794         system, among other possible outcomes. The dictionaries must include
 795         the following fields:
 796
 797         id:             Video identifier.
 798         url:            Final video URL.
 799         uploader:       Nickname of the video uploader.
 800         title:          Literal title.
 801         stitle:         Simplified title.
 802         ext:            Video filename extension.
 803         format:         Video format.
 804         player_url:     SWF Player URL (may be None).
 805
 806         The following fields are optional. Their primary purpose is to allow
 807         youtube-dl to serve as the backend for a video search function, such
 808         as the one in youtube2mp3.  They are only used when their respective
 809         forced printing functions are called:
 810
 811         thumbnail:      Full URL to a video thumbnail image.
 812         description:    One-line video description.
 813
 814         Subclasses of this one should re-define the _real_initialize() and
 815         _real_extract() methods, as well as the suitable() static method.
 816         Probably, they should also be instantiated and added to the main
 817         downloader.
 818         """
 819
 820         _ready = False
 821         _downloader = None
 822
 823         def __init__(self, downloader=None):
 824                 """Constructor. Receives an optional downloader."""
 825                 self._ready = False
 826                 self.set_downloader(downloader)
 827
 828         @staticmethod
 829         def suitable(url):
 830                 """Receives a URL and returns True if suitable for this IE."""
 831                 return False
 832
 833         def initialize(self):
 834                 """Initializes an instance (authentication, etc)."""
 835                 if not self._ready:
 836                         self._real_initialize()
 837                         self._ready = True
 838
 839         def extract(self, url):
 840                 """Extracts URL information and returns it in list of dicts."""
 841                 self.initialize()
 842                 return self._real_extract(url)
 843
 844         def set_downloader(self, downloader):
 845                 """Sets the downloader for this IE."""
 846                 self._downloader = downloader
 847
 848         def _real_initialize(self):
 849                 """Real initialization process. Redefine in subclasses."""
 850                 pass
 851
 852         def _real_extract(self, url):
 853                 """Real extraction process. Redefine in subclasses."""
 854                 pass
 855
 856 class YoutubeIE(InfoExtractor):
 857         """Information extractor for youtube.com."""
 858
 859         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 860         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 861         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 862         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 863         _NETRC_MACHINE = 'youtube'
 864         # Listed in order of quality
 865         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
 866         _video_extensions = {
 867                 '13': '3gp',
 868                 '17': 'mp4',
 869                 '18': 'mp4',
 870                 '22': 'mp4',
 871                 '37': 'mp4',
 872                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 873                 '43': 'webm',
 874                 '45': 'webm',
 875         }
 876
 877         @staticmethod
 878         def suitable(url):
 879                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 880
 881         def report_lang(self):
 882                 """Report attempt to set language."""
 883                 self._downloader.to_screen(u'[youtube] Setting language')
 884
 885         def report_login(self):
 886                 """Report attempt to log in."""
 887                 self._downloader.to_screen(u'[youtube] Logging in')
 888
 889         def report_age_confirmation(self):
 890                 """Report attempt to confirm age."""
 891                 self._downloader.to_screen(u'[youtube] Confirming age')
 892
 893         def report_video_webpage_download(self, video_id):
 894                 """Report attempt to download video webpage."""
 895                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 896
 897         def report_video_info_webpage_download(self, video_id):
 898                 """Report attempt to download video info webpage."""
 899                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 900
 901         def report_information_extraction(self, video_id):
 902                 """Report attempt to extract video information."""
 903                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 904
 905         def report_unavailable_format(self, video_id, format):
 906                 """Report extracted video URL."""
 907                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 908
 909         def report_rtmp_download(self):
 910                 """Indicate the download will use the RTMP protocol."""
 911                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 912
 913         def _real_initialize(self):
 914                 if self._downloader is None:
 915                         return
 916
 917                 username = None
 918                 password = None
 919                 downloader_params = self._downloader.params
 920
 921                 # Attempt to use provided username and password or .netrc data
 922                 if downloader_params.get('username', None) is not None:
 923                         username = downloader_params['username']
 924                         password = downloader_params['password']
 925                 elif downloader_params.get('usenetrc', False):
 926                         try:
 927                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 928                                 if info is not None:
 929                                         username = info[0]
 930                                         password = info[2]
 931                                 else:
 932                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 933                         except (IOError, netrc.NetrcParseError), err:
 934                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 935                                 return
 936
 937                 # Set language
 938                 request = urllib2.Request(self._LANG_URL)
 939                 try:
 940                         self.report_lang()
 941                         urllib2.urlopen(request).read()
 942                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 943                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 944                         return
 945
 946                 # No authentication to be performed
 947                 if username is None:
 948                         return
 949
 950                 # Log in
 951                 login_form = {
 952                                 'current_form': 'loginForm',
 953                                 'next':         '/',
 954                                 'action_login': 'Log In',
 955                                 'username':     username,
 956                                 'password':     password,
 957                                 }
 958                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 959                 try:
 960                         self.report_login()
 961                         login_results = urllib2.urlopen(request).read()
 962                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 963                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 964                                 return
 965                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 966                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 967                         return
 968
 969                 # Confirm age
 970                 age_form = {
 971                                 'next_url':             '/',
 972                                 'action_confirm':       'Confirm',
 973                                 }
 974                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 975                 try:
 976                         self.report_age_confirmation()
 977                         age_results = urllib2.urlopen(request).read()
 978                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 979                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 980                         return
 981
 982         def _real_extract(self, url):
 983                 # Extract video id from URL
 984                 mobj = re.match(self._VALID_URL, url)
 985                 if mobj is None:
 986                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 987                         return
 988                 video_id = mobj.group(2)
 989
 990                 # Get video webpage
 991                 self.report_video_webpage_download(video_id)
 992                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
 993                 try:
 994                         video_webpage = urllib2.urlopen(request).read()
 995                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 996                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 997                         return
 998
 999                 # Attempt to extract SWF player URL
1000                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1001                 if mobj is not None:
1002                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1003                 else:
1004                         player_url = None
1005
1006                 # Get video info
1007                 self.report_video_info_webpage_download(video_id)
1008                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1009                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1010                                            % (video_id, el_type))
1011                         request = urllib2.Request(video_info_url)
1012                         try:
1013                                 video_info_webpage = urllib2.urlopen(request).read()
1014                                 video_info = parse_qs(video_info_webpage)
1015                                 if 'token' in video_info:
1016                                         break
1017                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1018                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1019                                 return
1020                 if 'token' not in video_info:
1021                         if 'reason' in video_info:
1022                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1023                         else:
1024                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1025                         return
1026
1027                 # Start extracting information
1028                 self.report_information_extraction(video_id)
1029
1030                 # uploader
1031                 if 'author' not in video_info:
1032                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1033                         return
1034                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1035
1036                 # title
1037                 if 'title' not in video_info:
1038                         self._downloader.trouble(u'ERROR: unable to extract video title')
1039                         return
1040                 video_title = urllib.unquote_plus(video_info['title'][0])
1041                 video_title = video_title.decode('utf-8')
1042                 video_title = sanitize_title(video_title)
1043
1044                 # simplified title
1045                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1046                 simple_title = simple_title.strip(ur'_')
1047
1048                 # thumbnail image
1049                 if 'thumbnail_url' not in video_info:
1050                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1051                         video_thumbnail = ''
1052                 else:   # don't panic if we can't find it
1053                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1054
1055                 # upload date
1056                 upload_date = u'NA'
1057                 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
1058                 if mobj is not None:
1059                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1060                         format_expressions = ['%d %B %Y', '%B %d %Y']
1061                         for expression in format_expressions:
1062                                 try:
1063                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1064                                 except:
1065                                         pass
1066
1067                 # description
1068                 video_description = 'No description available.'
1069                 if self._downloader.params.get('forcedescription', False):
1070                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1071                         if mobj is not None:
1072                                 video_description = mobj.group(1)
1073
1074                 # token
1075                 video_token = urllib.unquote_plus(video_info['token'][0])
1076
1077                 # Decide which formats to download
1078                 req_format = self._downloader.params.get('format', None)
1079
1080                 if 'fmt_url_map' in video_info:
1081                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1082                         format_limit = self._downloader.params.get('format_limit', None)
1083                         if format_limit is not None and format_limit in self._available_formats:
1084                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1085                         else:
1086                                 format_list = self._available_formats
1087                         existing_formats = [x for x in format_list if x in url_map]
1088                         if len(existing_formats) == 0:
1089                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1090                                 return
1091                         if req_format is None:
1092                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1093                         elif req_format == '-1':
1094                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1095                         else:
1096                                 # Specific format
1097                                 if req_format not in url_map:
1098                                         self._downloader.trouble(u'ERROR: requested format not available')
1099                                         return
1100                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1101
1102                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1103                         self.report_rtmp_download()
1104                         video_url_list = [(None, video_info['conn'][0])]
1105
1106                 else:
1107                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1108                         return
1109
1110                 for format_param, video_real_url in video_url_list:
1111                         # At this point we have a new video
1112                         self._downloader.increment_downloads()
1113
1114                         # Extension
1115                         video_extension = self._video_extensions.get(format_param, 'flv')
1116
1117                         # Find the video URL in fmt_url_map or conn paramters
1118                         try:
1119                                 # Process video information
1120                                 self._downloader.process_info({
1121                                         'id':           video_id.decode('utf-8'),
1122                                         'url':          video_real_url.decode('utf-8'),
1123                                         'uploader':     video_uploader.decode('utf-8'),
1124                                         'upload_date':  upload_date,
1125                                         'title':        video_title,
1126                                         'stitle':       simple_title,
1127                                         'ext':          video_extension.decode('utf-8'),
1128                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1129                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1130                                         'description':  video_description.decode('utf-8'),
1131                                         'player_url':   player_url,
1132                                 })
1133                         except UnavailableVideoError, err:
1134                                 self._downloader.trouble(u'\nERROR: unable to download video')
1135
1136
1137 class MetacafeIE(InfoExtractor):
1138         """Information Extractor for metacafe.com."""
1139
1140         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1141         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1142         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1143         _youtube_ie = None
1144
1145         def __init__(self, youtube_ie, downloader=None):
1146                 InfoExtractor.__init__(self, downloader)
1147                 self._youtube_ie = youtube_ie
1148
1149         @staticmethod
1150         def suitable(url):
1151                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1152
1153         def report_disclaimer(self):
1154                 """Report disclaimer retrieval."""
1155                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1156
1157         def report_age_confirmation(self):
1158                 """Report attempt to confirm age."""
1159                 self._downloader.to_screen(u'[metacafe] Confirming age')
1160
1161         def report_download_webpage(self, video_id):
1162                 """Report webpage download."""
1163                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1164
1165         def report_extraction(self, video_id):
1166                 """Report information extraction."""
1167                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1168
1169         def _real_initialize(self):
1170                 # Retrieve disclaimer
1171                 request = urllib2.Request(self._DISCLAIMER)
1172                 try:
1173                         self.report_disclaimer()
1174                         disclaimer = urllib2.urlopen(request).read()
1175                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1176                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1177                         return
1178
1179                 # Confirm age
1180                 disclaimer_form = {
1181                         'filters': '0',
1182                         'submit': "Continue - I'm over 18",
1183                         }
1184                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1185                 try:
1186                         self.report_age_confirmation()
1187                         disclaimer = urllib2.urlopen(request).read()
1188                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1189                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1190                         return
1191
1192         def _real_extract(self, url):
1193                 # Extract id and simplified title from URL
1194                 mobj = re.match(self._VALID_URL, url)
1195                 if mobj is None:
1196                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1197                         return
1198
1199                 video_id = mobj.group(1)
1200
1201                 # Check if video comes from YouTube
1202                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1203                 if mobj2 is not None:
1204                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1205                         return
1206
1207                 # At this point we have a new video
1208                 self._downloader.increment_downloads()
1209
1210                 simple_title = mobj.group(2).decode('utf-8')
1211
1212                 # Retrieve video webpage to extract further information
1213                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1214                 try:
1215                         self.report_download_webpage(video_id)
1216                         webpage = urllib2.urlopen(request).read()
1217                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1218                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1219                         return
1220
1221                 # Extract URL, uploader and title from webpage
1222                 self.report_extraction(video_id)
1223                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1224                 if mobj is not None:
1225                         mediaURL = urllib.unquote(mobj.group(1))
1226                         video_extension = mediaURL[-3:]
1227
1228                         # Extract gdaKey if available
1229                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1230                         if mobj is None:
1231                                 video_url = mediaURL
1232                         else:
1233                                 gdaKey = mobj.group(1)
1234                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1235                 else:
1236                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1237                         if mobj is None:
1238                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1239                                 return
1240                         vardict = parse_qs(mobj.group(1))
1241                         if 'mediaData' not in vardict:
1242                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1243                                 return
1244                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1245                         if mobj is None:
1246                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1247                                 return
1248                         mediaURL = mobj.group(1).replace('\\/', '/')
1249                         video_extension = mediaURL[-3:]
1250                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1251
1252                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1253                 if mobj is None:
1254                         self._downloader.trouble(u'ERROR: unable to extract title')
1255                         return
1256                 video_title = mobj.group(1).decode('utf-8')
1257                 video_title = sanitize_title(video_title)
1258
1259                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1260                 if mobj is None:
1261                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1262                         return
1263                 video_uploader = mobj.group(1)
1264
1265                 try:
1266                         # Process video information
1267                         self._downloader.process_info({
1268                                 'id':           video_id.decode('utf-8'),
1269                                 'url':          video_url.decode('utf-8'),
1270                                 'uploader':     video_uploader.decode('utf-8'),
1271                                 'upload_date':  u'NA',
1272                                 'title':        video_title,
1273                                 'stitle':       simple_title,
1274                                 'ext':          video_extension.decode('utf-8'),
1275                                 'format':       u'NA',
1276                                 'player_url':   None,
1277                         })
1278                 except UnavailableVideoError:
1279                         self._downloader.trouble(u'\nERROR: unable to download video')
1280
1281
1282 class DailymotionIE(InfoExtractor):
1283         """Information Extractor for Dailymotion"""
1284
1285         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1286
1287         def __init__(self, downloader=None):
1288                 InfoExtractor.__init__(self, downloader)
1289
1290         @staticmethod
1291         def suitable(url):
1292                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1293
1294         def report_download_webpage(self, video_id):
1295                 """Report webpage download."""
1296                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1297
1298         def report_extraction(self, video_id):
1299                 """Report information extraction."""
1300                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1301
1302         def _real_initialize(self):
1303                 return
1304
1305         def _real_extract(self, url):
1306                 # Extract id and simplified title from URL
1307                 mobj = re.match(self._VALID_URL, url)
1308                 if mobj is None:
1309                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1310                         return
1311
1312                 # At this point we have a new video
1313                 self._downloader.increment_downloads()
1314                 video_id = mobj.group(1)
1315
1316                 simple_title = mobj.group(2).decode('utf-8')
1317                 video_extension = 'flv'
1318
1319                 # Retrieve video webpage to extract further information
1320                 request = urllib2.Request(url)
1321                 try:
1322                         self.report_download_webpage(video_id)
1323                         webpage = urllib2.urlopen(request).read()
1324                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1325                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1326                         return
1327
1328                 # Extract URL, uploader and title from webpage
1329                 self.report_extraction(video_id)
1330                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1331                 if mobj is None:
1332                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1333                         return
1334                 mediaURL = urllib.unquote(mobj.group(1))
1335
1336                 # if needed add http://www.dailymotion.com/ if relative URL
1337
1338                 video_url = mediaURL
1339
1340                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1341                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1342                 if mobj is None:
1343                         self._downloader.trouble(u'ERROR: unable to extract title')
1344                         return
1345                 video_title = mobj.group(1).decode('utf-8')
1346                 video_title = sanitize_title(video_title)
1347
1348                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1349                 if mobj is None:
1350                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1351                         return
1352                 video_uploader = mobj.group(1)
1353
1354                 try:
1355                         # Process video information
1356                         self._downloader.process_info({
1357                                 'id':           video_id.decode('utf-8'),
1358                                 'url':          video_url.decode('utf-8'),
1359                                 'uploader':     video_uploader.decode('utf-8'),
1360                                 'upload_date':  u'NA',
1361                                 'title':        video_title,
1362                                 'stitle':       simple_title,
1363                                 'ext':          video_extension.decode('utf-8'),
1364                                 'format':       u'NA',
1365                                 'player_url':   None,
1366                         })
1367                 except UnavailableVideoError:
1368                         self._downloader.trouble(u'\nERROR: unable to download video')
1369
1370 class GoogleIE(InfoExtractor):
1371         """Information extractor for video.google.com."""
1372
1373         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1374
1375         def __init__(self, downloader=None):
1376                 InfoExtractor.__init__(self, downloader)
1377
1378         @staticmethod
1379         def suitable(url):
1380                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1381
1382         def report_download_webpage(self, video_id):
1383                 """Report webpage download."""
1384                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1385
1386         def report_extraction(self, video_id):
1387                 """Report information extraction."""
1388                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1389
1390         def _real_initialize(self):
1391                 return
1392
1393         def _real_extract(self, url):
1394                 # Extract id from URL
1395                 mobj = re.match(self._VALID_URL, url)
1396                 if mobj is None:
1397                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1398                         return
1399
1400                 # At this point we have a new video
1401                 self._downloader.increment_downloads()
1402                 video_id = mobj.group(1)
1403
1404                 video_extension = 'mp4'
1405
1406                 # Retrieve video webpage to extract further information
1407                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1408                 try:
1409                         self.report_download_webpage(video_id)
1410                         webpage = urllib2.urlopen(request).read()
1411                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1413                         return
1414
1415                 # Extract URL, uploader, and title from webpage
1416                 self.report_extraction(video_id)
1417                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1418                 if mobj is None:
1419                         video_extension = 'flv'
1420                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1421                 if mobj is None:
1422                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1423                         return
1424                 mediaURL = urllib.unquote(mobj.group(1))
1425                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1426                 mediaURL = mediaURL.replace('\\x26', '\x26')
1427
1428                 video_url = mediaURL
1429
1430                 mobj = re.search(r'<title>(.*)</title>', webpage)
1431                 if mobj is None:
1432                         self._downloader.trouble(u'ERROR: unable to extract title')
1433                         return
1434                 video_title = mobj.group(1).decode('utf-8')
1435                 video_title = sanitize_title(video_title)
1436                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1437
1438                 # Extract video description
1439                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1440                 if mobj is None:
1441                         self._downloader.trouble(u'ERROR: unable to extract video description')
1442                         return
1443                 video_description = mobj.group(1).decode('utf-8')
1444                 if not video_description:
1445                         video_description = 'No description available.'
1446
1447                 # Extract video thumbnail
1448                 if self._downloader.params.get('forcethumbnail', False):
1449                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1450                         try:
1451                                 webpage = urllib2.urlopen(request).read()
1452                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1454                                 return
1455                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1456                         if mobj is None:
1457                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1458                                 return
1459                         video_thumbnail = mobj.group(1)
1460                 else:   # we need something to pass to process_info
1461                         video_thumbnail = ''
1462
1463
1464                 try:
1465                         # Process video information
1466                         self._downloader.process_info({
1467                                 'id':           video_id.decode('utf-8'),
1468                                 'url':          video_url.decode('utf-8'),
1469                                 'uploader':     u'NA',
1470                                 'upload_date':  u'NA',
1471                                 'title':        video_title,
1472                                 'stitle':       simple_title,
1473                                 'ext':          video_extension.decode('utf-8'),
1474                                 'format':       u'NA',
1475                                 'player_url':   None,
1476                         })
1477                 except UnavailableVideoError:
1478                         self._downloader.trouble(u'\nERROR: unable to download video')
1479
1480
1481 class PhotobucketIE(InfoExtractor):
1482         """Information extractor for photobucket.com."""
1483
1484         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1485
1486         def __init__(self, downloader=None):
1487                 InfoExtractor.__init__(self, downloader)
1488
1489         @staticmethod
1490         def suitable(url):
1491                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1492
1493         def report_download_webpage(self, video_id):
1494                 """Report webpage download."""
1495                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1496
1497         def report_extraction(self, video_id):
1498                 """Report information extraction."""
1499                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1500
1501         def _real_initialize(self):
1502                 return
1503
1504         def _real_extract(self, url):
1505                 # Extract id from URL
1506                 mobj = re.match(self._VALID_URL, url)
1507                 if mobj is None:
1508                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1509                         return
1510
1511                 # At this point we have a new video
1512                 self._downloader.increment_downloads()
1513                 video_id = mobj.group(1)
1514
1515                 video_extension = 'flv'
1516
1517                 # Retrieve video webpage to extract further information
1518                 request = urllib2.Request(url)
1519                 try:
1520                         self.report_download_webpage(video_id)
1521                         webpage = urllib2.urlopen(request).read()
1522                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1523                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1524                         return
1525
1526                 # Extract URL, uploader, and title from webpage
1527                 self.report_extraction(video_id)
1528                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1529                 if mobj is None:
1530                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1531                         return
1532                 mediaURL = urllib.unquote(mobj.group(1))
1533
1534                 video_url = mediaURL
1535
1536                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1537                 if mobj is None:
1538                         self._downloader.trouble(u'ERROR: unable to extract title')
1539                         return
1540                 video_title = mobj.group(1).decode('utf-8')
1541                 video_title = sanitize_title(video_title)
1542                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1543
1544                 video_uploader = mobj.group(2).decode('utf-8')
1545
1546                 try:
1547                         # Process video information
1548                         self._downloader.process_info({
1549                                 'id':           video_id.decode('utf-8'),
1550                                 'url':          video_url.decode('utf-8'),
1551                                 'uploader':     video_uploader,
1552                                 'upload_date':  u'NA',
1553                                 'title':        video_title,
1554                                 'stitle':       simple_title,
1555                                 'ext':          video_extension.decode('utf-8'),
1556                                 'format':       u'NA',
1557                                 'player_url':   None,
1558                         })
1559                 except UnavailableVideoError:
1560                         self._downloader.trouble(u'\nERROR: unable to download video')
1561
1562
1563 class YahooIE(InfoExtractor):
1564         """Information extractor for video.yahoo.com."""
1565
1566         # _VALID_URL matches all Yahoo! Video URLs
1567         # _VPAGE_URL matches only the extractable '/watch/' URLs
1568         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1569         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1570
1571         def __init__(self, downloader=None):
1572                 InfoExtractor.__init__(self, downloader)
1573
1574         @staticmethod
1575         def suitable(url):
1576                 return (re.match(YahooIE._VALID_URL, url) is not None)
1577
1578         def report_download_webpage(self, video_id):
1579                 """Report webpage download."""
1580                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1581
1582         def report_extraction(self, video_id):
1583                 """Report information extraction."""
1584                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1585
1586         def _real_initialize(self):
1587                 return
1588
1589         def _real_extract(self, url, new_video=True):
1590                 # Extract ID from URL
1591                 mobj = re.match(self._VALID_URL, url)
1592                 if mobj is None:
1593                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1594                         return
1595
1596                 # At this point we have a new video
1597                 self._downloader.increment_downloads()
1598                 video_id = mobj.group(2)
1599                 video_extension = 'flv'
1600
1601                 # Rewrite valid but non-extractable URLs as
1602                 # extractable English language /watch/ URLs
1603                 if re.match(self._VPAGE_URL, url) is None:
1604                         request = urllib2.Request(url)
1605                         try:
1606                                 webpage = urllib2.urlopen(request).read()
1607                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1608                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1609                                 return
1610
1611                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1612                         if mobj is None:
1613                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1614                                 return
1615                         yahoo_id = mobj.group(1)
1616
1617                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1618                         if mobj is None:
1619                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1620                                 return
1621                         yahoo_vid = mobj.group(1)
1622
1623                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1624                         return self._real_extract(url, new_video=False)
1625
1626                 # Retrieve video webpage to extract further information
1627                 request = urllib2.Request(url)
1628                 try:
1629                         self.report_download_webpage(video_id)
1630                         webpage = urllib2.urlopen(request).read()
1631                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1632                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1633                         return
1634
1635                 # Extract uploader and title from webpage
1636                 self.report_extraction(video_id)
1637                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1638                 if mobj is None:
1639                         self._downloader.trouble(u'ERROR: unable to extract video title')
1640                         return
1641                 video_title = mobj.group(1).decode('utf-8')
1642                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1643
1644                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1645                 if mobj is None:
1646                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1647                         return
1648                 video_uploader = mobj.group(1).decode('utf-8')
1649
1650                 # Extract video thumbnail
1651                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1652                 if mobj is None:
1653                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1654                         return
1655                 video_thumbnail = mobj.group(1).decode('utf-8')
1656
1657                 # Extract video description
1658                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1659                 if mobj is None:
1660                         self._downloader.trouble(u'ERROR: unable to extract video description')
1661                         return
1662                 video_description = mobj.group(1).decode('utf-8')
1663                 if not video_description: video_description = 'No description available.'
1664
1665                 # Extract video height and width
1666                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1667                 if mobj is None:
1668                         self._downloader.trouble(u'ERROR: unable to extract video height')
1669                         return
1670                 yv_video_height = mobj.group(1)
1671
1672                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1673                 if mobj is None:
1674                         self._downloader.trouble(u'ERROR: unable to extract video width')
1675                         return
1676                 yv_video_width = mobj.group(1)
1677
1678                 # Retrieve video playlist to extract media URL
1679                 # I'm not completely sure what all these options are, but we
1680                 # seem to need most of them, otherwise the server sends a 401.
1681                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1682                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1683                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1684                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1685                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1686                 try:
1687                         self.report_download_webpage(video_id)
1688                         webpage = urllib2.urlopen(request).read()
1689                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1690                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1691                         return
1692
1693                 # Extract media URL from playlist XML
1694                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1695                 if mobj is None:
1696                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1697                         return
1698                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1699                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1700
1701                 try:
1702                         # Process video information
1703                         self._downloader.process_info({
1704                                 'id':           video_id.decode('utf-8'),
1705                                 'url':          video_url,
1706                                 'uploader':     video_uploader,
1707                                 'upload_date':  u'NA',
1708                                 'title':        video_title,
1709                                 'stitle':       simple_title,
1710                                 'ext':          video_extension.decode('utf-8'),
1711                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1712                                 'description':  video_description,
1713                                 'thumbnail':    video_thumbnail,
1714                                 'description':  video_description,
1715                                 'player_url':   None,
1716                         })
1717                 except UnavailableVideoError:
1718                         self._downloader.trouble(u'\nERROR: unable to download video')
1719
1720
1721 class GenericIE(InfoExtractor):
1722         """Generic last-resort information extractor."""
1723
1724         def __init__(self, downloader=None):
1725                 InfoExtractor.__init__(self, downloader)
1726
1727         @staticmethod
1728         def suitable(url):
1729                 return True
1730
1731         def report_download_webpage(self, video_id):
1732                 """Report webpage download."""
1733                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1734                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1735
1736         def report_extraction(self, video_id):
1737                 """Report information extraction."""
1738                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1739
1740         def _real_initialize(self):
1741                 return
1742
1743         def _real_extract(self, url):
1744                 # At this point we have a new video
1745                 self._downloader.increment_downloads()
1746
1747                 video_id = url.split('/')[-1]
1748                 request = urllib2.Request(url)
1749                 try:
1750                         self.report_download_webpage(video_id)
1751                         webpage = urllib2.urlopen(request).read()
1752                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1753                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1754                         return
1755                 except ValueError, err:
1756                         # since this is the last-resort InfoExtractor, if
1757                         # this error is thrown, it'll be thrown here
1758                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1759                         return
1760
1761                 self.report_extraction(video_id)
1762                 # Start with something easy: JW Player in SWFObject
1763                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1764                 if mobj is None:
1765                         # Broaden the search a little bit
1766                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1767                 if mobj is None:
1768                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1769                         return
1770
1771                 # It's possible that one of the regexes
1772                 # matched, but returned an empty group:
1773                 if mobj.group(1) is None:
1774                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1775                         return
1776
1777                 video_url = urllib.unquote(mobj.group(1))
1778                 video_id  = os.path.basename(video_url)
1779
1780                 # here's a fun little line of code for you:
1781                 video_extension = os.path.splitext(video_id)[1][1:]
1782                 video_id        = os.path.splitext(video_id)[0]
1783
1784                 # it's tempting to parse this further, but you would
1785                 # have to take into account all the variations like
1786                 #   Video Title - Site Name
1787                 #   Site Name | Video Title
1788                 #   Video Title - Tagline | Site Name
1789                 # and so on and so forth; it's just not practical
1790                 mobj = re.search(r'<title>(.*)</title>', webpage)
1791                 if mobj is None:
1792                         self._downloader.trouble(u'ERROR: unable to extract title')
1793                         return
1794                 video_title = mobj.group(1).decode('utf-8')
1795                 video_title = sanitize_title(video_title)
1796                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1797
1798                 # video uploader is domain name
1799                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1800                 if mobj is None:
1801                         self._downloader.trouble(u'ERROR: unable to extract title')
1802                         return
1803                 video_uploader = mobj.group(1).decode('utf-8')
1804
1805                 try:
1806                         # Process video information
1807                         self._downloader.process_info({
1808                                 'id':           video_id.decode('utf-8'),
1809                                 'url':          video_url.decode('utf-8'),
1810                                 'uploader':     video_uploader,
1811                                 'upload_date':  u'NA',
1812                                 'title':        video_title,
1813                                 'stitle':       simple_title,
1814                                 'ext':          video_extension.decode('utf-8'),
1815                                 'format':       u'NA',
1816                                 'player_url':   None,
1817                         })
1818                 except UnavailableVideoError, err:
1819                         self._downloader.trouble(u'\nERROR: unable to download video')
1820
1821
1822 class YoutubeSearchIE(InfoExtractor):
1823         """Information Extractor for YouTube search queries."""
1824         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1825         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1826         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1827         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1828         _youtube_ie = None
1829         _max_youtube_results = 1000
1830
1831         def __init__(self, youtube_ie, downloader=None):
1832                 InfoExtractor.__init__(self, downloader)
1833                 self._youtube_ie = youtube_ie
1834
1835         @staticmethod
1836         def suitable(url):
1837                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1838
1839         def report_download_page(self, query, pagenum):
1840                 """Report attempt to download playlist page with given number."""
1841                 query = query.decode(preferredencoding())
1842                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1843
1844         def _real_initialize(self):
1845                 self._youtube_ie.initialize()
1846
1847         def _real_extract(self, query):
1848                 mobj = re.match(self._VALID_QUERY, query)
1849                 if mobj is None:
1850                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1851                         return
1852
1853                 prefix, query = query.split(':')
1854                 prefix = prefix[8:]
1855                 query  = query.encode('utf-8')
1856                 if prefix == '':
1857                         self._download_n_results(query, 1)
1858                         return
1859                 elif prefix == 'all':
1860                         self._download_n_results(query, self._max_youtube_results)
1861                         return
1862                 else:
1863                         try:
1864                                 n = long(prefix)
1865                                 if n <= 0:
1866                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1867                                         return
1868                                 elif n > self._max_youtube_results:
1869                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1870                                         n = self._max_youtube_results
1871                                 self._download_n_results(query, n)
1872                                 return
1873                         except ValueError: # parsing prefix as integer fails
1874                                 self._download_n_results(query, 1)
1875                                 return
1876
1877         def _download_n_results(self, query, n):
1878                 """Downloads a specified number of results for a query"""
1879
1880                 video_ids = []
1881                 already_seen = set()
1882                 pagenum = 1
1883
1884                 while True:
1885                         self.report_download_page(query, pagenum)
1886                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1887                         request = urllib2.Request(result_url)
1888                         try:
1889                                 page = urllib2.urlopen(request).read()
1890                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1891                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1892                                 return
1893
1894                         # Extract video identifiers
1895                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1896                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1897                                 if video_id not in already_seen:
1898                                         video_ids.append(video_id)
1899                                         already_seen.add(video_id)
1900                                         if len(video_ids) == n:
1901                                                 # Specified n videos reached
1902                                                 for id in video_ids:
1903                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1904                                                 return
1905
1906                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1907                                 for id in video_ids:
1908                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1909                                 return
1910
1911                         pagenum = pagenum + 1
1912
1913 class GoogleSearchIE(InfoExtractor):
1914         """Information Extractor for Google Video search queries."""
1915         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1916         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1917         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1918         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1919         _google_ie = None
1920         _max_google_results = 1000
1921
1922         def __init__(self, google_ie, downloader=None):
1923                 InfoExtractor.__init__(self, downloader)
1924                 self._google_ie = google_ie
1925
1926         @staticmethod
1927         def suitable(url):
1928                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1929
1930         def report_download_page(self, query, pagenum):
1931                 """Report attempt to download playlist page with given number."""
1932                 query = query.decode(preferredencoding())
1933                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1934
1935         def _real_initialize(self):
1936                 self._google_ie.initialize()
1937
1938         def _real_extract(self, query):
1939                 mobj = re.match(self._VALID_QUERY, query)
1940                 if mobj is None:
1941                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1942                         return
1943
1944                 prefix, query = query.split(':')
1945                 prefix = prefix[8:]
1946                 query  = query.encode('utf-8')
1947                 if prefix == '':
1948                         self._download_n_results(query, 1)
1949                         return
1950                 elif prefix == 'all':
1951                         self._download_n_results(query, self._max_google_results)
1952                         return
1953                 else:
1954                         try:
1955                                 n = long(prefix)
1956                                 if n <= 0:
1957                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1958                                         return
1959                                 elif n > self._max_google_results:
1960                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1961                                         n = self._max_google_results
1962                                 self._download_n_results(query, n)
1963                                 return
1964                         except ValueError: # parsing prefix as integer fails
1965                                 self._download_n_results(query, 1)
1966                                 return
1967
1968         def _download_n_results(self, query, n):
1969                 """Downloads a specified number of results for a query"""
1970
1971                 video_ids = []
1972                 already_seen = set()
1973                 pagenum = 1
1974
1975                 while True:
1976                         self.report_download_page(query, pagenum)
1977                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1978                         request = urllib2.Request(result_url)
1979                         try:
1980                                 page = urllib2.urlopen(request).read()
1981                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1982                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1983                                 return
1984
1985                         # Extract video identifiers
1986                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1987                                 video_id = mobj.group(1)
1988                                 if video_id not in already_seen:
1989                                         video_ids.append(video_id)
1990                                         already_seen.add(video_id)
1991                                         if len(video_ids) == n:
1992                                                 # Specified n videos reached
1993                                                 for id in video_ids:
1994                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1995                                                 return
1996
1997                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1998                                 for id in video_ids:
1999                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2000                                 return
2001
2002                         pagenum = pagenum + 1
2003
2004 class YahooSearchIE(InfoExtractor):
2005         """Information Extractor for Yahoo! Video search queries."""
2006         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2007         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2008         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2009         _MORE_PAGES_INDICATOR = r'\s*Next'
2010         _yahoo_ie = None
2011         _max_yahoo_results = 1000
2012
2013         def __init__(self, yahoo_ie, downloader=None):
2014                 InfoExtractor.__init__(self, downloader)
2015                 self._yahoo_ie = yahoo_ie
2016
2017         @staticmethod
2018         def suitable(url):
2019                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2020
2021         def report_download_page(self, query, pagenum):
2022                 """Report attempt to download playlist page with given number."""
2023                 query = query.decode(preferredencoding())
2024                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2025
2026         def _real_initialize(self):
2027                 self._yahoo_ie.initialize()
2028
2029         def _real_extract(self, query):
2030                 mobj = re.match(self._VALID_QUERY, query)
2031                 if mobj is None:
2032                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2033                         return
2034
2035                 prefix, query = query.split(':')
2036                 prefix = prefix[8:]
2037                 query  = query.encode('utf-8')
2038                 if prefix == '':
2039                         self._download_n_results(query, 1)
2040                         return
2041                 elif prefix == 'all':
2042                         self._download_n_results(query, self._max_yahoo_results)
2043                         return
2044                 else:
2045                         try:
2046                                 n = long(prefix)
2047                                 if n <= 0:
2048                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2049                                         return
2050                                 elif n > self._max_yahoo_results:
2051                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2052                                         n = self._max_yahoo_results
2053                                 self._download_n_results(query, n)
2054                                 return
2055                         except ValueError: # parsing prefix as integer fails
2056                                 self._download_n_results(query, 1)
2057                                 return
2058
2059         def _download_n_results(self, query, n):
2060                 """Downloads a specified number of results for a query"""
2061
2062                 video_ids = []
2063                 already_seen = set()
2064                 pagenum = 1
2065
2066                 while True:
2067                         self.report_download_page(query, pagenum)
2068                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2069                         request = urllib2.Request(result_url)
2070                         try:
2071                                 page = urllib2.urlopen(request).read()
2072                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2073                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2074                                 return
2075
2076                         # Extract video identifiers
2077                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2078                                 video_id = mobj.group(1)
2079                                 if video_id not in already_seen:
2080                                         video_ids.append(video_id)
2081                                         already_seen.add(video_id)
2082                                         if len(video_ids) == n:
2083                                                 # Specified n videos reached
2084                                                 for id in video_ids:
2085                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2086                                                 return
2087
2088                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2089                                 for id in video_ids:
2090                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2091                                 return
2092
2093                         pagenum = pagenum + 1
2094
2095 class YoutubePlaylistIE(InfoExtractor):
2096         """Information Extractor for YouTube playlists."""
2097
2098         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
2099         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2100         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2101         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2102         _youtube_ie = None
2103
2104         def __init__(self, youtube_ie, downloader=None):
2105                 InfoExtractor.__init__(self, downloader)
2106                 self._youtube_ie = youtube_ie
2107
2108         @staticmethod
2109         def suitable(url):
2110                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2111
2112         def report_download_page(self, playlist_id, pagenum):
2113                 """Report attempt to download playlist page with given number."""
2114                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2115
2116         def _real_initialize(self):
2117                 self._youtube_ie.initialize()
2118
2119         def _real_extract(self, url):
2120                 # Extract playlist id
2121                 mobj = re.match(self._VALID_URL, url)
2122                 if mobj is None:
2123                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2124                         return
2125
2126                 # Download playlist pages
2127                 playlist_id = mobj.group(1)
2128                 video_ids = []
2129                 pagenum = 1
2130
2131                 while True:
2132                         self.report_download_page(playlist_id, pagenum)
2133                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2134                         try:
2135                                 page = urllib2.urlopen(request).read()
2136                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2137                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2138                                 return
2139
2140                         # Extract video identifiers
2141                         ids_in_page = []
2142                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2143                                 if mobj.group(1) not in ids_in_page:
2144                                         ids_in_page.append(mobj.group(1))
2145                         video_ids.extend(ids_in_page)
2146
2147                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2148                                 break
2149                         pagenum = pagenum + 1
2150
2151                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2152                 playlistend = self._downloader.params.get('playlistend', -1)
2153                 video_ids = video_ids[playliststart:playlistend]
2154
2155                 for id in video_ids:
2156                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2157                 return
2158
2159 class YoutubeUserIE(InfoExtractor):
2160         """Information Extractor for YouTube users."""
2161
2162         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2163         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2164         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2165         _youtube_ie = None
2166
2167         def __init__(self, youtube_ie, downloader=None):
2168                 InfoExtractor.__init__(self, downloader)
2169                 self._youtube_ie = youtube_ie
2170
2171         @staticmethod
2172         def suitable(url):
2173                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2174
2175         def report_download_page(self, username):
2176                 """Report attempt to download user page."""
2177                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2178
2179         def _real_initialize(self):
2180                 self._youtube_ie.initialize()
2181
2182         def _real_extract(self, url):
2183                 # Extract username
2184                 mobj = re.match(self._VALID_URL, url)
2185                 if mobj is None:
2186                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2187                         return
2188
2189                 # Download user page
2190                 username = mobj.group(1)
2191                 video_ids = []
2192                 pagenum = 1
2193
2194                 self.report_download_page(username)
2195                 request = urllib2.Request(self._TEMPLATE_URL % (username))
2196                 try:
2197                         page = urllib2.urlopen(request).read()
2198                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2199                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2200                         return
2201
2202                 # Extract video identifiers
2203                 ids_in_page = []
2204
2205                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2206                         if mobj.group(1) not in ids_in_page:
2207                                 ids_in_page.append(mobj.group(1))
2208                 video_ids.extend(ids_in_page)
2209
2210                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2211                 playlistend = self._downloader.params.get('playlistend', -1)
2212                 video_ids = video_ids[playliststart:playlistend]
2213
2214                 for id in video_ids:
2215                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2216                 return
2217
2218 class DepositFilesIE(InfoExtractor):
2219         """Information extractor for depositfiles.com"""
2220
2221         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2222
2223         def __init__(self, downloader=None):
2224                 InfoExtractor.__init__(self, downloader)
2225
2226         @staticmethod
2227         def suitable(url):
2228                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2229
2230         def report_download_webpage(self, file_id):
2231                 """Report webpage download."""
2232                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2233
2234         def report_extraction(self, file_id):
2235                 """Report information extraction."""
2236                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2237
2238         def _real_initialize(self):
2239                 return
2240
2241         def _real_extract(self, url):
2242                 # At this point we have a new file
2243                 self._downloader.increment_downloads()
2244
2245                 file_id = url.split('/')[-1]
2246                 # Rebuild url in english locale
2247                 url = 'http://depositfiles.com/en/files/' + file_id
2248
2249                 # Retrieve file webpage with 'Free download' button pressed
2250                 free_download_indication = { 'gateway_result' : '1' }
2251                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2252                 try:
2253                         self.report_download_webpage(file_id)
2254                         webpage = urllib2.urlopen(request).read()
2255                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2256                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2257                         return
2258
2259                 # Search for the real file URL
2260                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2261                 if (mobj is None) or (mobj.group(1) is None):
2262                         # Try to figure out reason of the error.
2263                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2264                         if (mobj is not None) and (mobj.group(1) is not None):
2265                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2266                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2267                         else:
2268                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2269                         return
2270
2271                 file_url = mobj.group(1)
2272                 file_extension = os.path.splitext(file_url)[1][1:]
2273
2274                 # Search for file title
2275                 mobj = re.search(r'<b title="(.*?)">', webpage)
2276                 if mobj is None:
2277                         self._downloader.trouble(u'ERROR: unable to extract title')
2278                         return
2279                 file_title = mobj.group(1).decode('utf-8')
2280
2281                 try:
2282                         # Process file information
2283                         self._downloader.process_info({
2284                                 'id':           file_id.decode('utf-8'),
2285                                 'url':          file_url.decode('utf-8'),
2286                                 'uploader':     u'NA',
2287                                 'upload_date':  u'NA',
2288                                 'title':        file_title,
2289                                 'stitle':       file_title,
2290                                 'ext':          file_extension.decode('utf-8'),
2291                                 'format':       u'NA',
2292                                 'player_url':   None,
2293                         })
2294                 except UnavailableVideoError, err:
2295                         self._downloader.trouble(u'ERROR: unable to download file')
2296
class PostProcessor(object):
        """Base class for post processors.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. After a successful
        download it calls run() on each of them in turn: the first one
        receives an initial argument and every following one receives
        whatever the previous PostProcessor returned.

        The chain ends when a PostProcessor returns None or when there are
        no more PostProcessors left.

        As with InfoExtractor objects, registration is mutual: the
        PostProcessor also keeps a reference to its downloader.
        """

        # Downloader this post processor is attached to, if any
        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this PP to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Process one downloaded file.

                "information" is a dictionary like those built by
                InfoExtractors, with one additional field, "filepath",
                pointing to the downloaded file.

                Returning None stops the postprocessing chain. Returning an
                information dictionary (possibly the received one with some
                fields changed) passes it on to the next object in the chain.

                A PostProcessingError exception may also be raised; the
                calling downloader will take it into account.
                """
                # The base implementation passes the dictionary through untouched
                return information
2342
2343 ### MAIN PROGRAM ###
2344 if __name__ == '__main__':
2345         try:
2346                 # Modules needed only when running the main program
2347                 import getpass
2348                 import optparse
2349
2350                 # Function to update the program file with the latest version from the repository.
2351                 def update_self(downloader, filename):
2352                         # Note: downloader only used for options
2353                         if not os.access(filename, os.W_OK):
2354                                 sys.exit('ERROR: no write permissions on %s' % filename)
2355
2356                         downloader.to_screen('Updating to latest stable version...')
2357                         try:
2358                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2359                                 latest_version = urllib.urlopen(latest_url).read().strip()
2360                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2361                                 newcontent = urllib.urlopen(prog_url).read()
2362                         except (IOError, OSError), err:
2363                                 sys.exit('ERROR: unable to download latest version')
2364                         try:
2365                                 stream = open(filename, 'w')
2366                                 stream.write(newcontent)
2367                                 stream.close()
2368                         except (IOError, OSError), err:
2369                                 sys.exit('ERROR: unable to overwrite current version')
2370                         downloader.to_screen('Updated to version %s' % latest_version)
2371
2372                 # Parse command line: declare every supported option, then parse the arguments
2373                 parser = optparse.OptionParser(
2374                         usage='Usage: %prog [options] url...',
2375                         version='2010.12.09',
2376                         conflict_handler='resolve',  # lets the explicit --help/--version options below replace optparse's automatic ones
2377                 )
2378
2379                 parser.add_option('-h', '--help',
2380                                 action='help', help='print this help text and exit')
2381                 parser.add_option('-v', '--version',
2382                                 action='version', help='print program version and exit')  # prints the version string given to OptionParser above
2383                 parser.add_option('-U', '--update',
2384                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2385                 parser.add_option('-i', '--ignore-errors',
2386                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2387                 parser.add_option('-r', '--rate-limit',
2388                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2389                 parser.add_option('-R', '--retries',
2390                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)  # command-line value arrives as a string; converted with long() after parsing
2391                 parser.add_option('--playlist-start',
2392                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)  # validated as a positive integer after parsing
2393                 parser.add_option('--playlist-end',
2394                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)  # -1 is the "last video" sentinel
2395                 parser.add_option('--dump-user-agent',
2396                                 action='store_true', dest='dump_user_agent',
2397                                 help='display the current browser identification', default=False)
2398
2399                 authentication = optparse.OptionGroup(parser, 'Authentication Options')  # groups only affect how --help is laid out
2400                 authentication.add_option('-u', '--username',
2401                                 dest='username', metavar='USERNAME', help='account username')
2402                 authentication.add_option('-p', '--password',
2403                                 dest='password', metavar='PASSWORD', help='account password')
2404                 authentication.add_option('-n', '--netrc',
2405                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2406                 parser.add_option_group(authentication)
2407
2408                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2409                 video_format.add_option('-f', '--format',
2410                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2411                 video_format.add_option('--all-formats',
2412                                 action='store_const', dest='format', help='download all available video formats', const='-1')  # shares dest='format' with -f and stores the sentinel '-1'
2413                 video_format.add_option('--max-quality',
2414                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2415                 parser.add_option_group(video_format)
2416
2417                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2418                 verbosity.add_option('-q', '--quiet',
2419                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2420                 verbosity.add_option('-s', '--simulate',
2421                                 action='store_true', dest='simulate', help='do not download video', default=False)
2422                 verbosity.add_option('-g', '--get-url',
2423                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2424                 verbosity.add_option('-e', '--get-title',
2425                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2426                 verbosity.add_option('--get-thumbnail',
2427                                 action='store_true', dest='getthumbnail',
2428                                 help='simulate, quiet but print thumbnail URL', default=False)
2429                 verbosity.add_option('--get-description',
2430                                 action='store_true', dest='getdescription',
2431                                 help='simulate, quiet but print video description', default=False)
2432                 verbosity.add_option('--get-filename',
2433                                 action='store_true', dest='getfilename',
2434                                 help='simulate, quiet but print output filename', default=False)
2435                 verbosity.add_option('--no-progress',
2436                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2437                 verbosity.add_option('--console-title',
2438                                 action='store_true', dest='consoletitle',
2439                                 help='display progress in console titlebar', default=False)
2440                 parser.add_option_group(verbosity)
2441
2442                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2443                 filesystem.add_option('-t', '--title',
2444                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2445                 filesystem.add_option('-l', '--literal',
2446                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2447                 filesystem.add_option('-A', '--auto-number',
2448                                 action='store_true', dest='autonumber',
2449                                 help='number downloaded files starting from 00000', default=False)
2450                 filesystem.add_option('-o', '--output',
2451                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2452                 filesystem.add_option('-a', '--batch-file',
2453                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2454                 filesystem.add_option('-w', '--no-overwrites',
2455                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2456                 filesystem.add_option('-c', '--continue',
2457                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2458                 filesystem.add_option('--cookies',
2459                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2460                 filesystem.add_option('--no-part',
2461                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
2462                 filesystem.add_option('--no-mtime',
2463                                 action='store_false', dest='updatetime',
2464                                 help='do not use the Last-modified header to set the file modification time', default=True)  # updatetime stays True unless --no-mtime is given
2465                 parser.add_option_group(filesystem)
2466
2467                 (opts, args) = parser.parse_args()  # no explicit argument list, so optparse reads sys.argv[1:]; args holds the positional URLs
2468
2469                 # Open appropriate CookieJar: in-memory jar by default, Mozilla-format file jar when --cookies is given
2470                 if opts.cookiefile is None:
2471                         jar = cookielib.CookieJar()
2472                 else:
2473                         try:
2474                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2475                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):  # load only an existing, readable file; otherwise the jar starts empty
2476                                         jar.load()
2477                         except (IOError, OSError), err:  # err is bound but not used
2478                                 sys.exit(u'ERROR: unable to open cookie file')
2479
2480                 # Dump user agent: print the User-Agent entry of std_headers and stop
2481                 if opts.dump_user_agent:
2482                         print std_headers['User-Agent']
2483                         sys.exit(0)  # exit status 0; nothing below this point runs
2484
2485                 # General configuration: install the default urllib2 opener and the default socket timeout
2486                 cookie_processor = urllib2.HTTPCookieProcessor(jar)  # HTTP cookies are handled through the jar selected above
2487                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))  # becomes the opener used by urllib2.urlopen()
2488                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2489
2490                 # Batch file verification: read extra URLs from the --batch-file, if one was given
2491                 batchurls = []
2492                 if opts.batchfile is not None:
2493                         try:
2494                                 if opts.batchfile == '-':  # '-' selects standard input instead of a file on disk
2495                                         batchfd = sys.stdin
2496                                 else:
2497                                         batchfd = open(opts.batchfile, 'r')  # NOTE(review): this handle is not closed anywhere in the visible code
2498                                 batchurls = batchfd.readlines()
2499                                 batchurls = [x.strip() for x in batchurls]  # trim surrounding whitespace, including the newline
2500                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]  # drop blank lines and lines starting with one of the three characters in the pattern
2501                         except IOError:
2502                                 sys.exit(u'ERROR: batch file could not be read')
2503                 all_urls = batchurls + args  # batch-file URLs first, then the URLs given on the command line
2504
2505                 # Conflicting, missing and erroneous options (parser.error prints the usage message plus the error and exits)
2506                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2507                         parser.error(u'using .netrc conflicts with giving username/password')
2508                 if opts.password is not None and opts.username is None:
2509                         parser.error(u'account username missing')
2510                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2511                         parser.error(u'using output template conflicts with using title, literal title or auto number')
2512                 if opts.usetitle and opts.useliteral:
2513                         parser.error(u'using title conflicts with using literal title')
2514                 if opts.username is not None and opts.password is None:  # username without password: ask for the password interactively
2515                         opts.password = getpass.getpass(u'Type account password and press return:')
2516                 if opts.ratelimit is not None:
2517                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)  # a None result is treated as an invalid limit just below
2518                         if numeric_limit is None:
2519                                 parser.error(u'invalid rate limit specified')
2520                         opts.ratelimit = numeric_limit  # replace the textual limit with the parsed value
2521                 if opts.retries is not None:
2522                         try:
2523                                 opts.retries = long(opts.retries)  # NOTE(review): no lower bound is checked, so negative counts pass
2524                         except (TypeError, ValueError), err:
2525                                 parser.error(u'invalid retry count specified')
2526                 try:
2527                         opts.playliststart = long(opts.playliststart)
2528                         if opts.playliststart <= 0:  # the start index must be at least 1
2529                                 raise ValueError
2530                 except (TypeError, ValueError), err:
2531                         parser.error(u'invalid playlist start number specified')
2532                 try:
2533                         opts.playlistend = long(opts.playlistend)
2534                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):  # -1 (the default) is always accepted; otherwise it must be positive and not before the start
2535                                 raise ValueError
2536                 except (TypeError, ValueError), err:
2537                         parser.error(u'invalid playlist end number specified')
2538
2539                 # Information extractors: one instance of each; several are constructed around another extractor instance
2540                 youtube_ie = YoutubeIE()
2541                 metacafe_ie = MetacafeIE(youtube_ie)  # receives the YouTube extractor
2542                 dailymotion_ie = DailymotionIE()
2543                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)  # receives the YouTube extractor
2544                 youtube_user_ie = YoutubeUserIE(youtube_ie)  # receives the YouTube extractor
2545                 youtube_search_ie = YoutubeSearchIE(youtube_ie)  # receives the YouTube extractor
2546                 google_ie = GoogleIE()
2547                 google_search_ie = GoogleSearchIE(google_ie)  # receives the Google extractor
2548                 photobucket_ie = PhotobucketIE()
2549                 yahoo_ie = YahooIE()
2550                 yahoo_search_ie = YahooSearchIE(yahoo_ie)  # receives the Yahoo extractor
2551                 deposit_files_ie = DepositFilesIE()
2552                 generic_ie = GenericIE()
2553
2554                 # File downloader: built from the validated options, then given every extractor created above
2555                 fd = FileDownloader({
2556                         'usenetrc': opts.usenetrc,
2557                         'username': opts.username,
2558                         'password': opts.password,
2559                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),  # every --get-* option implies quiet
2560                         'forceurl': opts.geturl,
2561                         'forcetitle': opts.gettitle,
2562                         'forcethumbnail': opts.getthumbnail,
2563                         'forcedescription': opts.getdescription,
2564                         'forcefilename': opts.getfilename,
2565                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),  # every --get-* option implies simulate
2566                         'format': opts.format,
2567                         'format_limit': opts.format_limit,
2568                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))  # explicit -o template, decoded to unicode; NOTE(review): an empty template is falsy and falls through to the built-ins
2569                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')  # format '-1' (--all-formats): the file name also carries the format
2570                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2571                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2572                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')  # single-format templates follow, most specific combination first
2573                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2574                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2575                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2576                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2577                                 or u'%(id)s.%(ext)s'),  # fallback when no naming option was given
2578                         'ignoreerrors': opts.ignoreerrors,
2579                         'ratelimit': opts.ratelimit,
2580                         'nooverwrites': opts.nooverwrites,
2581                         'retries': opts.retries,
2582                         'continuedl': opts.continue_dl,
2583                         'noprogress': opts.noprogress,
2584                         'playliststart': opts.playliststart,
2585                         'playlistend': opts.playlistend,
2586                         'logtostderr': opts.outtmpl == '-',  # presumably '-' means the video goes to stdout, so messages must not; confirm in FileDownloader
2587                         'consoletitle': opts.consoletitle,
2588                         'nopart': opts.nopart,
2589                         'updatetime': opts.updatetime,
2590                         })
2591                 fd.add_info_extractor(youtube_search_ie)  # presumably extractors are tried in registration order; confirm in FileDownloader
2592                 fd.add_info_extractor(youtube_pl_ie)
2593                 fd.add_info_extractor(youtube_user_ie)
2594                 fd.add_info_extractor(metacafe_ie)
2595                 fd.add_info_extractor(dailymotion_ie)
2596                 fd.add_info_extractor(youtube_ie)
2597                 fd.add_info_extractor(google_ie)
2598                 fd.add_info_extractor(google_search_ie)
2599                 fd.add_info_extractor(photobucket_ie)
2600                 fd.add_info_extractor(yahoo_ie)
2601                 fd.add_info_extractor(yahoo_search_ie)
2602                 fd.add_info_extractor(deposit_files_ie)
2603
2604                 # This must come last since it's the
2605                 # fallback if none of the others work
2606                 fd.add_info_extractor(generic_ie)
2607
2608                 # Update version: run the self-update when -U/--update was given
2609                 if opts.update_self:
2610                         update_self(fd, sys.argv[0])  # passes the downloader and the path this script was started as
2611
2612                 # Maybe do nothing: without URLs this is a usage error, unless an update was requested
2613                 if len(all_urls) < 1:
2614                         if not opts.update_self:
2615                                 parser.error(u'you must provide at least one URL')
2616                         else:
2617                                 sys.exit()  # no argument: exit status 0
2618                 retcode = fd.download(all_urls)  # the return value becomes the process exit status below
2619
2620                 # Dump cookie jar if requested; NOTE(review): not reached when fd.download() raises
2621                 if opts.cookiefile is not None:
2622                         try:
2623                                 jar.save()
2624                         except (IOError, OSError), err:  # err is bound but not used
2625                                 sys.exit(u'ERROR: unable to save cookie jar')
2626
2627                 sys.exit(retcode)
2628
2629         except DownloadError:  # exit with status 1; no message is printed by this handler
2630                 sys.exit(1)
2631         except SameFileError:
2632                 sys.exit(u'ERROR: fixed output name but more than one file to download')  # a string argument is printed to stderr and the exit status is 1
2633         except KeyboardInterrupt:  # Ctrl-C: exit with the message below instead of a traceback
2634                 sys.exit(u'\nERROR: Interrupted by user')