youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # Author: Vasyl' Vavrychuk
   7 # Author: Witold Baryluk
   8 # Author: Paweł Paprota
   9 # Author: Gergely Imreh
  10 # License: Public domain code
  11 import cookielib
  12 import ctypes
  13 import datetime
  14 import email.utils
  15 import gzip
  16 import htmlentitydefs
  17 import httplib
  18 import locale
  19 import math
  20 import netrc
  21 import os
  22 import os.path
  23 import re
  24 import socket
  25 import string
  26 import subprocess
  27 import sys
  28 import time
  29 import urllib
  30 import urllib2
  31 import warnings
  32 import zlib
  33
  34 try:
  35         import json
  36 except ImportError:
  37         warnings.warn('No JSON support (TODO: insert trivialjson here)')
  38
  39 try:
  40         import cStringIO as StringIO
  41 except ImportError:
  42         import StringIO
  43
  44 # parse_qs was moved from the cgi module to the urlparse module recently.
  45 try:
  46         from urlparse import parse_qs
  47 except ImportError:
  48         from cgi import parse_qs
  49
  50 try:
  51         import lxml.etree
  52 except ImportError: # Python < 2.6
  53         pass # Handled below
  54
# Default HTTP headers. YoutubeDLHandler.http_request() copies every entry of
# this dictionary onto each outgoing request, replacing any header of the
# same name already present on the request.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string made of the ASCII letters followed by the ASCII digits.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  64
  65 def preferredencoding():
  66         """Get preferred encoding.
  67
  68         Returns the best encoding scheme for the system, based on
  69         locale.getpreferredencoding() and some further tweaks.
  70         """
  71         def yield_preferredencoding():
  72                 try:
  73                         pref = locale.getpreferredencoding()
  74                         u'TEST'.encode(pref)
  75                 except:
  76                         pref = 'UTF-8'
  77                 while True:
  78                         yield pref
  79         return yield_preferredencoding().next()
  80
  81 def htmlentity_transform(matchobj):
  82         """Transforms an HTML entity to a Unicode character.
  83
  84         This function receives a match object and is intended to be used with
  85         the re.sub() function.
  86         """
  87         entity = matchobj.group(1)
  88
  89         # Known non-numeric HTML entity
  90         if entity in htmlentitydefs.name2codepoint:
  91                 return unichr(htmlentitydefs.name2codepoint[entity])
  92
  93         # Unicode character
  94         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  95         if mobj is not None:
  96                 numstr = mobj.group(1)
  97                 if numstr.startswith(u'x'):
  98                         base = 16
  99                         numstr = u'0%s' % numstr
 100                 else:
 101                         base = 10
 102                 return unichr(long(numstr, base))
 103
 104         # Unknown entity in name, return its literal representation
 105         return (u'&%s;' % entity)
 106
 107 def sanitize_title(utitle):
 108         """Sanitizes a video title so it could be used as part of a filename."""
 109         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 110         return utitle.replace(unicode(os.sep), u'%')
 111
 112 def sanitize_open(filename, open_mode):
 113         """Try to open the given filename, and slightly tweak it if this fails.
 114
 115         Attempts to open the given filename. If this fails, it tries to change
 116         the filename slightly, step by step, until it's either able to open it
 117         or it fails and raises a final exception, like the standard open()
 118         function.
 119
 120         It returns the tuple (stream, definitive_file_name).
 121         """
 122         try:
 123                 if filename == u'-':
 124                         if sys.platform == 'win32':
 125                                 import msvcrt
 126                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 127                         return (sys.stdout, filename)
 128                 stream = open(filename, open_mode)
 129                 return (stream, filename)
 130         except (IOError, OSError), err:
 131                 # In case of error, try to remove win32 forbidden chars
 132                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 133
 134                 # An exception here should be caught in the caller
 135                 stream = open(filename, open_mode)
 136                 return (stream, filename)
 137
 138 def timeconvert(timestr):
 139     """Convert RFC 2822 defined time string into system timestamp"""
 140     timestamp = None
 141     timetuple = email.utils.parsedate_tz(timestr)
 142     if timetuple is not None:
 143         timestamp = email.utils.mktime_tz(timetuple)
 144     return timestamp
 145
 146 class DownloadError(Exception):
 147         """Download Error exception.
 148
 149         This exception may be thrown by FileDownloader objects if they are not
 150         configured to continue on errors. They will contain the appropriate
 151         error message.
 152         """
 153         pass
 154
 155 class SameFileError(Exception):
 156         """Same File exception.
 157
 158         This exception will be thrown by FileDownloader objects if they detect
 159         multiple files would have to be downloaded to the same file on disk.
 160         """
 161         pass
 162
 163 class PostProcessingError(Exception):
 164         """Post Processing exception.
 165
 166         This exception may be raised by PostProcessor's .run() method to
 167         indicate an error in the postprocessing task.
 168         """
 169         pass
 170
 171 class UnavailableVideoError(Exception):
 172         """Unavailable Format exception.
 173
 174         This exception will be thrown when a video is requested
 175         in a format that is not available for that video.
 176         """
 177         pass
 178
 179 class ContentTooShortError(Exception):
 180         """Content Too Short exception.
 181
 182         This exception may be raised by FileDownloader objects when a file they
 183         download is too small for what the server announced first, indicating
 184         the connection was probably interrupted.
 185         """
 186         # Both in bytes
 187         downloaded = None
 188         expected = None
 189
 190         def __init__(self, downloaded, expected):
 191                 self.downloaded = downloaded
 192                 self.expected = expected
 193
 194 class YoutubeDLHandler(urllib2.HTTPHandler):
 195         """Handler for HTTP requests and responses.
 196
 197         This class, when installed with an OpenerDirector, automatically adds
 198         the standard headers to every HTTP request and handles gzipped and
 199         deflated responses from web servers. If compression is to be avoided in
 200         a particular request, the original request in the program code only has
 201         to include the HTTP header "Youtubedl-No-Compression", which will be
 202         removed before making the real request.
 203
 204         Part of this code was copied from:
 205
 206           http://techknack.net/python-urllib2-handlers/
 207
 208         Andrew Rowls, the author of that code, agreed to release it to the
 209         public domain.
 210         """
 211
 212         @staticmethod
 213         def deflate(data):
 214                 try:
 215                         return zlib.decompress(data, -zlib.MAX_WBITS)
 216                 except zlib.error:
 217                         return zlib.decompress(data)
 218
 219         @staticmethod
 220         def addinfourl_wrapper(stream, headers, url, code):
 221                 if hasattr(urllib2.addinfourl, 'getcode'):
 222                         return urllib2.addinfourl(stream, headers, url, code)
 223                 ret = urllib2.addinfourl(stream, headers, url)
 224                 ret.code = code
 225                 return ret
 226
 227         def http_request(self, req):
 228                 for h in std_headers:
 229                         if h in req.headers:
 230                                 del req.headers[h]
 231                         req.add_header(h, std_headers[h])
 232                 if 'Youtubedl-no-compression' in req.headers:
 233                         if 'Accept-encoding' in req.headers:
 234                                 del req.headers['Accept-encoding']
 235                         del req.headers['Youtubedl-no-compression']
 236                 return req
 237
 238         def http_response(self, req, resp):
 239                 old_resp = resp
 240                 # gzip
 241                 if resp.headers.get('Content-encoding', '') == 'gzip':
 242                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 243                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 244                         resp.msg = old_resp.msg
 245                 # deflate
 246                 if resp.headers.get('Content-encoding', '') == 'deflate':
 247                         gz = StringIO.StringIO(self.deflate(resp.read()))
 248                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 249                         resp.msg = old_resp.msg
 250                 return resp
 251
 252 class FileDownloader(object):
 253         """File Downloader class.
 254
        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information (that is the task of the
        InfoExtractors), so it has to pass the URL to one of them.
 261
        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.
 269
 270         File downloaders accept a lot of parameters. In order not to saturate
 271         the object constructor with arguments, it receives a dictionary of
 272         options instead. These options are available through the params
 273         attribute for the InfoExtractors to use. The FileDownloader also
 274         registers itself as the downloader in charge for the InfoExtractors
 275         that are added to it, so this is a "mutual registration".
 276
 277         Available options:
 278
 279         username:         Username for authentication purposes.
 280         password:         Password for authentication purposes.
 281         usenetrc:         Use netrc for authentication instead.
 282         quiet:            Do not print messages to stdout.
 283         forceurl:         Force printing final URL.
 284         forcetitle:       Force printing title.
 285         forcethumbnail:   Force printing thumbnail URL.
 286         forcedescription: Force printing description.
 287         forcefilename:    Force printing final filename.
 288         simulate:         Do not download the video files.
 289         format:           Video format code.
 290         format_limit:     Highest quality format to try.
 291         outtmpl:          Template for output names.
 292         ignoreerrors:     Do not stop on download errors.
 293         ratelimit:        Download speed limit, in bytes/sec.
 294         nooverwrites:     Prevent overwriting files.
 295         retries:          Number of times to retry for HTTP error 5xx
 296         continuedl:       Try to continue downloads if possible.
 297         noprogress:       Do not print the progress bar.
 298         playliststart:    Playlist item to start at.
 299         playlistend:      Playlist item to end at.
 300         logtostderr:      Log messages to stderr instead of stdout.
 301         consoletitle:     Display progress in console window's titlebar.
 302         nopart:           Do not use temporary .part files.
 303         updatetime:       Use the Last-modified header to set output file timestamps.
 304         """
 305
 306         params = None
 307         _ies = []
 308         _pps = []
 309         _download_retcode = None
 310         _num_downloads = None
 311         _screen_file = None
 312
 313         def __init__(self, params):
 314                 """Create a FileDownloader object with the given options."""
 315                 self._ies = []
 316                 self._pps = []
 317                 self._download_retcode = 0
 318                 self._num_downloads = 0
 319                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 320                 self.params = params
 321
 322         @staticmethod
 323         def pmkdir(filename):
 324                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 325                 components = filename.split(os.sep)
 326                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 327                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 328                 for dir in aggregate:
 329                         if not os.path.exists(dir):
 330                                 os.mkdir(dir)
 331
 332         @staticmethod
 333         def format_bytes(bytes):
 334                 if bytes is None:
 335                         return 'N/A'
 336                 if type(bytes) is str:
 337                         bytes = float(bytes)
 338                 if bytes == 0.0:
 339                         exponent = 0
 340                 else:
 341                         exponent = long(math.log(bytes, 1024.0))
 342                 suffix = 'bkMGTPEZY'[exponent]
 343                 converted = float(bytes) / float(1024**exponent)
 344                 return '%.2f%s' % (converted, suffix)
 345
 346         @staticmethod
 347         def calc_percent(byte_counter, data_len):
 348                 if data_len is None:
 349                         return '---.-%'
 350                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 351
 352         @staticmethod
 353         def calc_eta(start, now, total, current):
 354                 if total is None:
 355                         return '--:--'
 356                 dif = now - start
 357                 if current == 0 or dif < 0.001: # One millisecond
 358                         return '--:--'
 359                 rate = float(current) / dif
 360                 eta = long((float(total) - float(current)) / rate)
 361                 (eta_mins, eta_secs) = divmod(eta, 60)
 362                 if eta_mins > 99:
 363                         return '--:--'
 364                 return '%02d:%02d' % (eta_mins, eta_secs)
 365
 366         @staticmethod
 367         def calc_speed(start, now, bytes):
 368                 dif = now - start
 369                 if bytes == 0 or dif < 0.001: # One millisecond
 370                         return '%10s' % '---b/s'
 371                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 372
 373         @staticmethod
 374         def best_block_size(elapsed_time, bytes):
 375                 new_min = max(bytes / 2.0, 1.0)
 376                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 377                 if elapsed_time < 0.001:
 378                         return long(new_max)
 379                 rate = bytes / elapsed_time
 380                 if rate > new_max:
 381                         return long(new_max)
 382                 if rate < new_min:
 383                         return long(new_min)
 384                 return long(rate)
 385
 386         @staticmethod
 387         def parse_bytes(bytestr):
 388                 """Parse a string indicating a byte quantity into a long integer."""
 389                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 390                 if matchobj is None:
 391                         return None
 392                 number = float(matchobj.group(1))
 393                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 394                 return long(round(number * multiplier))
 395
 396         def add_info_extractor(self, ie):
 397                 """Add an InfoExtractor object to the end of the list."""
 398                 self._ies.append(ie)
 399                 ie.set_downloader(self)
 400
 401         def add_post_processor(self, pp):
 402                 """Add a PostProcessor object to the end of the chain."""
 403                 self._pps.append(pp)
 404                 pp.set_downloader(self)
 405
 406         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 407                 """Print message to stdout if not in quiet mode."""
 408                 try:
 409                         if not self.params.get('quiet', False):
 410                                 terminator = [u'\n', u''][skip_eol]
 411                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 412                         self._screen_file.flush()
 413                 except (UnicodeEncodeError), err:
 414                         if not ignore_encoding_errors:
 415                                 raise
 416
        def to_stderr(self, message):
                """Print message to stderr.

                message is encoded with preferredencoding() and printed to
                sys.stderr; the 'quiet' option is not consulted here.
                """
                print >>sys.stderr, message.encode(preferredencoding())
 420
 421         def to_cons_title(self, message):
 422                 """Set console/terminal window title to message."""
 423                 if not self.params.get('consoletitle', False):
 424                         return
 425                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 426                         # c_wchar_p() might not be necessary if `message` is
 427                         # already of type unicode()
 428                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 429                 elif 'TERM' in os.environ:
 430                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 431
 432         def fixed_template(self):
 433                 """Checks if the output template is fixed."""
 434                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 435
 436         def trouble(self, message=None):
 437                 """Determine action to take when a download problem appears.
 438
 439                 Depending on if the downloader has been configured to ignore
 440                 download errors or not, this method may throw an exception or
 441                 not when errors are found, after printing the message.
 442                 """
 443                 if message is not None:
 444                         self.to_stderr(message)
 445                 if not self.params.get('ignoreerrors', False):
 446                         raise DownloadError(message)
 447                 self._download_retcode = 1
 448
 449         def slow_down(self, start_time, byte_counter):
 450                 """Sleep if the download speed is over the rate limit."""
 451                 rate_limit = self.params.get('ratelimit', None)
 452                 if rate_limit is None or byte_counter == 0:
 453                         return
 454                 now = time.time()
 455                 elapsed = now - start_time
 456                 if elapsed <= 0.0:
 457                         return
 458                 speed = float(byte_counter) / elapsed
 459                 if speed > rate_limit:
 460                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 461
 462         def temp_name(self, filename):
 463                 """Returns a temporary filename for the given filename."""
 464                 if self.params.get('nopart', False) or filename == u'-' or \
 465                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 466                         return filename
 467                 return filename + u'.part'
 468
 469         def undo_temp_name(self, filename):
 470                 if filename.endswith(u'.part'):
 471                         return filename[:-len(u'.part')]
 472                 return filename
 473
 474         def try_rename(self, old_filename, new_filename):
 475                 try:
 476                         if old_filename == new_filename:
 477                                 return
 478                         os.rename(old_filename, new_filename)
 479                 except (IOError, OSError), err:
 480                         self.trouble(u'ERROR: unable to rename file')
 481
 482         def try_utime(self, filename, last_modified_hdr):
 483                 """Try to set the last-modified time of the given file."""
 484                 if last_modified_hdr is None:
 485                         return
 486                 if not os.path.isfile(filename):
 487                         return
 488                 timestr = last_modified_hdr
 489                 if timestr is None:
 490                         return
 491                 filetime = timeconvert(timestr)
 492                 if filetime is None:
 493                         return
 494                 try:
 495                         os.utime(filename,(time.time(), filetime))
 496                 except:
 497                         pass
 498
 499         def report_destination(self, filename):
 500                 """Report destination filename."""
 501                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 502
 503         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 504                 """Report download progress."""
 505                 if self.params.get('noprogress', False):
 506                         return
 507                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 508                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 509                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 510                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 511
 512         def report_resuming_byte(self, resume_len):
 513                 """Report attempt to resume at given byte."""
 514                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 515
 516         def report_retry(self, count, retries):
 517                 """Report retry in case of HTTP error 5xx"""
 518                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 519
 520         def report_file_already_downloaded(self, file_name):
 521                 """Report file has already been fully downloaded."""
 522                 try:
 523                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 524                 except (UnicodeEncodeError), err:
 525                         self.to_screen(u'[download] The file has already been downloaded')
 526
 527         def report_unable_to_resume(self):
 528                 """Report it was impossible to resume download."""
 529                 self.to_screen(u'[download] Unable to resume')
 530
 531         def report_finish(self):
 532                 """Report download finished."""
 533                 if self.params.get('noprogress', False):
 534                         self.to_screen(u'[download] Download completed')
 535                 else:
 536                         self.to_screen(u'')
 537
 538         def increment_downloads(self):
 539                 """Increment the ordinal that assigns a number to each file."""
 540                 self._num_downloads += 1
 541
 542         def prepare_filename(self, info_dict):
 543                 """Generate the output filename."""
 544                 try:
 545                         template_dict = dict(info_dict)
 546                         template_dict['epoch'] = unicode(long(time.time()))
 547                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 548                         filename = self.params['outtmpl'] % template_dict
 549                         return filename
 550                 except (ValueError, KeyError), err:
 551                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 552                         return None
 553
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor.

                In simulate mode only the requested "force" fields are printed.
                Otherwise the output file name is prepared, its directories
                are created, the video is downloaded and, if the download
                succeeded, the postprocessing chain is run. Problems are
                reported through trouble(), except for an OSError or IOError
                during the download, which is turned into an
                UnavailableVideoError.
                """
                filename = self.prepare_filename(info_dict)
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcefilename', False) and filename is not None:
                                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                # prepare_filename() has already reported the problem
                if filename is None:
                        return
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        raise UnavailableVideoError
                # NOTE(review): urllib2.URLError is documented as a subclass of
                # IOError, so it looks like URLError is caught by the handler
                # above rather than by this one -- confirm this is intended.
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                return
 602
 603         def download(self, url_list):
 604                 """Download a given list of URLs."""
 605                 if len(url_list) > 1 and self.fixed_template():
 606                         raise SameFileError(self.params['outtmpl'])
 607
 608                 for url in url_list:
 609                         suitable_found = False
 610                         for ie in self._ies:
 611                                 # Go to next InfoExtractor if not suitable
 612                                 if not ie.suitable(url):
 613                                         continue
 614
 615                                 # Suitable InfoExtractor found
 616                                 suitable_found = True
 617
 618                                 # Extract information from URL and process it
 619                                 ie.extract(url)
 620
 621                                 # Suitable InfoExtractor had been found; go to next URL
 622                                 break
 623
 624                         if not suitable_found:
 625                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 626
 627                 return self._download_retcode
 628
 629         def post_process(self, filename, ie_info):
 630                 """Run the postprocessing chain on the given file."""
 631                 info = dict(ie_info)
 632                 info['filepath'] = filename
 633                 for pp in self._pps:
 634                         info = pp.run(info)
 635                         if info is None:
 636                                 break
 637
 638         def _download_with_rtmpdump(self, filename, url, player_url):
 639                 self.report_destination(filename)
 640                 tmpfilename = self.temp_name(filename)
 641
 642                 # Check for rtmpdump first
 643                 try:
 644                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 645                 except (OSError, IOError):
 646                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 647                         return False
 648
 649                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 650                 # the connection was interrumpted and resuming appears to be
 651                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 652                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 653                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 654                 while retval == 2 or retval == 1:
 655                         prevsize = os.path.getsize(tmpfilename)
 656                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 657                         time.sleep(5.0) # This seems to be needed
 658                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 659                         cursize = os.path.getsize(tmpfilename)
 660                         if prevsize == cursize and retval == 1:
 661                                 break
 662                 if retval == 0:
 663                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 664                         self.try_rename(tmpfilename, filename)
 665                         return True
 666                 else:
 667                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 668                         return False
 669
 670         def _do_download(self, filename, url, player_url):
 671                 # Check file already present
 672                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 673                         self.report_file_already_downloaded(filename)
 674                         return True
 675
 676                 # Attempt to download using rtmpdump
 677                 if url.startswith('rtmp'):
 678                         return self._download_with_rtmpdump(filename, url, player_url)
 679
 680                 tmpfilename = self.temp_name(filename)
 681                 stream = None
 682                 open_mode = 'wb'
 683
 684                 # Do not include the Accept-Encoding header
 685                 headers = {'Youtubedl-no-compression': 'True'}
 686                 basic_request = urllib2.Request(url, None, headers)
 687                 request = urllib2.Request(url, None, headers)
 688
 689                 # Establish possible resume length
 690                 if os.path.isfile(tmpfilename):
 691                         resume_len = os.path.getsize(tmpfilename)
 692                 else:
 693                         resume_len = 0
 694
 695                 # Request parameters in case of being able to resume
 696                 if self.params.get('continuedl', False) and resume_len != 0:
 697                         self.report_resuming_byte(resume_len)
 698                         request.add_header('Range','bytes=%d-' % resume_len)
 699                         open_mode = 'ab'
 700
 701                 count = 0
 702                 retries = self.params.get('retries', 0)
 703                 while count <= retries:
 704                         # Establish connection
 705                         try:
 706                                 data = urllib2.urlopen(request)
 707                                 break
 708                         except (urllib2.HTTPError, ), err:
 709                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 710                                         # Unexpected HTTP error
 711                                         raise
 712                                 elif err.code == 416:
 713                                         # Unable to resume (requested range not satisfiable)
 714                                         try:
 715                                                 # Open the connection again without the range header
 716                                                 data = urllib2.urlopen(basic_request)
 717                                                 content_length = data.info()['Content-Length']
 718                                         except (urllib2.HTTPError, ), err:
 719                                                 if err.code < 500 or err.code >= 600:
 720                                                         raise
 721                                         else:
 722                                                 # Examine the reported length
 723                                                 if (content_length is not None and
 724                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
 725                                                         # The file had already been fully downloaded.
 726                                                         # Explanation to the above condition: in issue #175 it was revealed that
 727                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 728                                                         # changing the file size slightly and causing problems for some users. So
 729                                                         # I decided to implement a suggested change and consider the file
 730                                                         # completely downloaded if the file size differs less than 100 bytes from
 731                                                         # the one in the hard drive.
 732                                                         self.report_file_already_downloaded(filename)
 733                                                         self.try_rename(tmpfilename, filename)
 734                                                         return True
 735                                                 else:
 736                                                         # The length does not match, we start the download over
 737                                                         self.report_unable_to_resume()
 738                                                         open_mode = 'wb'
 739                                                         break
 740                         # Retry
 741                         count += 1
 742                         if count <= retries:
 743                                 self.report_retry(count, retries)
 744
 745                 if count > retries:
 746                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 747                         return False
 748
 749                 data_len = data.info().get('Content-length', None)
 750                 if data_len is not None:
 751                         data_len = long(data_len) + resume_len
 752                 data_len_str = self.format_bytes(data_len)
 753                 byte_counter = 0 + resume_len
 754                 block_size = 1024
 755                 start = time.time()
 756                 while True:
 757                         # Download and write
 758                         before = time.time()
 759                         data_block = data.read(block_size)
 760                         after = time.time()
 761                         if len(data_block) == 0:
 762                                 break
 763                         byte_counter += len(data_block)
 764
 765                         # Open file just in time
 766                         if stream is None:
 767                                 try:
 768                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 769                                         filename = self.undo_temp_name(tmpfilename)
 770                                         self.report_destination(filename)
 771                                 except (OSError, IOError), err:
 772                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 773                                         return False
 774                         try:
 775                                 stream.write(data_block)
 776                         except (IOError, OSError), err:
 777                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 778                                 return False
 779                         block_size = self.best_block_size(after - before, len(data_block))
 780
 781                         # Progress message
 782                         percent_str = self.calc_percent(byte_counter, data_len)
 783                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 784                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 785                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 786
 787                         # Apply rate limit
 788                         self.slow_down(start, byte_counter - resume_len)
 789
 790                 stream.close()
 791                 self.report_finish()
 792                 if data_len is not None and byte_counter != data_len:
 793                         raise ContentTooShortError(byte_counter, long(data_len))
 794                 self.try_rename(tmpfilename, filename)
 795
 796                 # Update file modification time
 797                 if self.params.get('updatetime', True):
 798                         self.try_utime(filename, data.info().get('last-modified', None))
 799
 800                 return True
 801
 802 class InfoExtractor(object):
 803         """Information Extractor class.
 804
 805         Information extractors are the classes that, given a URL, extract
 806         information from the video (or videos) the URL refers to. This
 807         information includes the real video URL, the video title and simplified
 808         title, author and others. The information is stored in a dictionary
 809         which is then passed to the FileDownloader. The FileDownloader
 810         processes this information possibly downloading the video to the file
 811         system, among other possible outcomes. The dictionaries must include
 812         the following fields:
 813
 814         id:             Video identifier.
 815         url:            Final video URL.
 816         uploader:       Nickname of the video uploader.
 817         title:          Literal title.
 818         stitle:         Simplified title.
 819         ext:            Video filename extension.
 820         format:         Video format.
 821         player_url:     SWF Player URL (may be None).
 822
 823         The following fields are optional. Their primary purpose is to allow
 824         youtube-dl to serve as the backend for a video search function, such
 825         as the one in youtube2mp3.  They are only used when their respective
 826         forced printing functions are called:
 827
 828         thumbnail:      Full URL to a video thumbnail image.
 829         description:    One-line video description.
 830
 831         Subclasses of this one should re-define the _real_initialize() and
 832         _real_extract() methods, as well as the suitable() static method.
 833         Probably, they should also be instantiated and added to the main
 834         downloader.
 835         """
 836
 837         _ready = False
 838         _downloader = None
 839
 840         def __init__(self, downloader=None):
 841                 """Constructor. Receives an optional downloader."""
 842                 self._ready = False
 843                 self.set_downloader(downloader)
 844
 845         @staticmethod
 846         def suitable(url):
 847                 """Receives a URL and returns True if suitable for this IE."""
 848                 return False
 849
 850         def initialize(self):
 851                 """Initializes an instance (authentication, etc)."""
 852                 if not self._ready:
 853                         self._real_initialize()
 854                         self._ready = True
 855
 856         def extract(self, url):
 857                 """Extracts URL information and returns it in list of dicts."""
 858                 self.initialize()
 859                 return self._real_extract(url)
 860
 861         def set_downloader(self, downloader):
 862                 """Sets the downloader for this IE."""
 863                 self._downloader = downloader
 864
 865         def _real_initialize(self):
 866                 """Real initialization process. Redefine in subclasses."""
 867                 pass
 868
 869         def _real_extract(self, url):
 870                 """Real extraction process. Redefine in subclasses."""
 871                 pass
 872
 873 class YoutubeIE(InfoExtractor):
 874         """Information extractor for youtube.com."""
 875
 876         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 877         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 878         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 879         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 880         _NETRC_MACHINE = 'youtube'
 881         # Listed in order of quality
 882         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
 883         _video_extensions = {
 884                 '13': '3gp',
 885                 '17': 'mp4',
 886                 '18': 'mp4',
 887                 '22': 'mp4',
 888                 '37': 'mp4',
 889                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 890                 '43': 'webm',
 891                 '45': 'webm',
 892         }
 893
 894         @staticmethod
 895         def suitable(url):
 896                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 897
 898         def report_lang(self):
 899                 """Report attempt to set language."""
 900                 self._downloader.to_screen(u'[youtube] Setting language')
 901
 902         def report_login(self):
 903                 """Report attempt to log in."""
 904                 self._downloader.to_screen(u'[youtube] Logging in')
 905
 906         def report_age_confirmation(self):
 907                 """Report attempt to confirm age."""
 908                 self._downloader.to_screen(u'[youtube] Confirming age')
 909
 910         def report_video_webpage_download(self, video_id):
 911                 """Report attempt to download video webpage."""
 912                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 913
 914         def report_video_info_webpage_download(self, video_id):
 915                 """Report attempt to download video info webpage."""
 916                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 917
 918         def report_information_extraction(self, video_id):
 919                 """Report attempt to extract video information."""
 920                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 921
 922         def report_unavailable_format(self, video_id, format):
 923                 """Report extracted video URL."""
 924                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 925
 926         def report_rtmp_download(self):
 927                 """Indicate the download will use the RTMP protocol."""
 928                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 929
 930         def _real_initialize(self):
 931                 if self._downloader is None:
 932                         return
 933
 934                 username = None
 935                 password = None
 936                 downloader_params = self._downloader.params
 937
 938                 # Attempt to use provided username and password or .netrc data
 939                 if downloader_params.get('username', None) is not None:
 940                         username = downloader_params['username']
 941                         password = downloader_params['password']
 942                 elif downloader_params.get('usenetrc', False):
 943                         try:
 944                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 945                                 if info is not None:
 946                                         username = info[0]
 947                                         password = info[2]
 948                                 else:
 949                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 950                         except (IOError, netrc.NetrcParseError), err:
 951                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 952                                 return
 953
 954                 # Set language
 955                 request = urllib2.Request(self._LANG_URL)
 956                 try:
 957                         self.report_lang()
 958                         urllib2.urlopen(request).read()
 959                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 960                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 961                         return
 962
 963                 # No authentication to be performed
 964                 if username is None:
 965                         return
 966
 967                 # Log in
 968                 login_form = {
 969                                 'current_form': 'loginForm',
 970                                 'next':         '/',
 971                                 'action_login': 'Log In',
 972                                 'username':     username,
 973                                 'password':     password,
 974                                 }
 975                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 976                 try:
 977                         self.report_login()
 978                         login_results = urllib2.urlopen(request).read()
 979                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 980                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 981                                 return
 982                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 983                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 984                         return
 985
 986                 # Confirm age
 987                 age_form = {
 988                                 'next_url':             '/',
 989                                 'action_confirm':       'Confirm',
 990                                 }
 991                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 992                 try:
 993                         self.report_age_confirmation()
 994                         age_results = urllib2.urlopen(request).read()
 995                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 996                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 997                         return
 998
 999         def _real_extract(self, url):
1000                 # Extract video id from URL
1001                 mobj = re.match(self._VALID_URL, url)
1002                 if mobj is None:
1003                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1004                         return
1005                 video_id = mobj.group(2)
1006
1007                 # Get video webpage
1008                 self.report_video_webpage_download(video_id)
1009                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1010                 try:
1011                         video_webpage = urllib2.urlopen(request).read()
1012                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1013                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1014                         return
1015
1016                 # Attempt to extract SWF player URL
1017                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1018                 if mobj is not None:
1019                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1020                 else:
1021                         player_url = None
1022
1023                 # Get video info
1024                 self.report_video_info_webpage_download(video_id)
1025                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1026                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1027                                            % (video_id, el_type))
1028                         request = urllib2.Request(video_info_url)
1029                         try:
1030                                 video_info_webpage = urllib2.urlopen(request).read()
1031                                 video_info = parse_qs(video_info_webpage)
1032                                 if 'token' in video_info:
1033                                         break
1034                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1035                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1036                                 return
1037                 if 'token' not in video_info:
1038                         if 'reason' in video_info:
1039                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1040                         else:
1041                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1042                         return
1043
1044                 # Start extracting information
1045                 self.report_information_extraction(video_id)
1046
1047                 # uploader
1048                 if 'author' not in video_info:
1049                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1050                         return
1051                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1052
1053                 # title
1054                 if 'title' not in video_info:
1055                         self._downloader.trouble(u'ERROR: unable to extract video title')
1056                         return
1057                 video_title = urllib.unquote_plus(video_info['title'][0])
1058                 video_title = video_title.decode('utf-8')
1059                 video_title = sanitize_title(video_title)
1060
1061                 # simplified title
1062                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1063                 simple_title = simple_title.strip(ur'_')
1064
1065                 # thumbnail image
1066                 if 'thumbnail_url' not in video_info:
1067                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1068                         video_thumbnail = ''
1069                 else:   # don't panic if we can't find it
1070                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1071
1072                 # upload date
1073                 upload_date = u'NA'
1074                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1075                 if mobj is not None:
1076                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1077                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1078                         for expression in format_expressions:
1079                                 try:
1080                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1081                                 except:
1082                                         pass
1083
1084                 # description
1085                 try:
1086                         lxml.etree
1087                 except NameError:
1088                         video_description = u'No description available.'
1089                         if self._downloader.params.get('forcedescription', False):
1090                                 warnings.warn(u'You are using an old Python version, install Python 2.6+ or lxml. Falling back to old video description extractor.')
1091                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1092                                 if mobj is not None:
1093                                         video_description = mobj.group(1).decode('utf-8')
1094                 else:
1095                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1096                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1097                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1098
1099                 # token
1100                 video_token = urllib.unquote_plus(video_info['token'][0])
1101
1102                 # Decide which formats to download
1103                 req_format = self._downloader.params.get('format', None)
1104
1105                 if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1 and ',' in video_info['fmt_url_map'][0]:
1106                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1107                         format_limit = self._downloader.params.get('format_limit', None)
1108                         if format_limit is not None and format_limit in self._available_formats:
1109                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1110                         else:
1111                                 format_list = self._available_formats
1112                         existing_formats = [x for x in format_list if x in url_map]
1113                         if len(existing_formats) == 0:
1114                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1115                                 return
1116                         if req_format is None:
1117                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1118                         elif req_format == '-1':
1119                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1120                         else:
1121                                 # Specific format
1122                                 if req_format not in url_map:
1123                                         self._downloader.trouble(u'ERROR: requested format not available')
1124                                         return
1125                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1126
1127                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1128                         self.report_rtmp_download()
1129                         video_url_list = [(None, video_info['conn'][0])]
1130
1131                 else:
1132                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1133                         return
1134
1135                 for format_param, video_real_url in video_url_list:
1136                         # At this point we have a new video
1137                         self._downloader.increment_downloads()
1138
1139                         # Extension
1140                         video_extension = self._video_extensions.get(format_param, 'flv')
1141
1142                         # Find the video URL in fmt_url_map or conn paramters
1143                         try:
1144                                 # Process video information
1145                                 self._downloader.process_info({
1146                                         'id':           video_id.decode('utf-8'),
1147                                         'url':          video_real_url.decode('utf-8'),
1148                                         'uploader':     video_uploader.decode('utf-8'),
1149                                         'upload_date':  upload_date,
1150                                         'title':        video_title,
1151                                         'stitle':       simple_title,
1152                                         'ext':          video_extension.decode('utf-8'),
1153                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1154                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1155                                         'description':  video_description,
1156                                         'player_url':   player_url,
1157                                 })
1158                         except UnavailableVideoError, err:
1159                                 self._downloader.trouble(u'\nERROR: unable to download video')
1160
1161
1162 class MetacafeIE(InfoExtractor):
1163         """Information Extractor for metacafe.com."""
1164
1165         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1166         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1167         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1168         _youtube_ie = None
1169
1170         def __init__(self, youtube_ie, downloader=None):
1171                 InfoExtractor.__init__(self, downloader)
1172                 self._youtube_ie = youtube_ie
1173
1174         @staticmethod
1175         def suitable(url):
1176                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1177
1178         def report_disclaimer(self):
1179                 """Report disclaimer retrieval."""
1180                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1181
1182         def report_age_confirmation(self):
1183                 """Report attempt to confirm age."""
1184                 self._downloader.to_screen(u'[metacafe] Confirming age')
1185
1186         def report_download_webpage(self, video_id):
1187                 """Report webpage download."""
1188                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1189
1190         def report_extraction(self, video_id):
1191                 """Report information extraction."""
1192                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1193
1194         def _real_initialize(self):
1195                 # Retrieve disclaimer
1196                 request = urllib2.Request(self._DISCLAIMER)
1197                 try:
1198                         self.report_disclaimer()
1199                         disclaimer = urllib2.urlopen(request).read()
1200                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1201                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1202                         return
1203
1204                 # Confirm age
1205                 disclaimer_form = {
1206                         'filters': '0',
1207                         'submit': "Continue - I'm over 18",
1208                         }
1209                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1210                 try:
1211                         self.report_age_confirmation()
1212                         disclaimer = urllib2.urlopen(request).read()
1213                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1214                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1215                         return
1216
1217         def _real_extract(self, url):
1218                 # Extract id and simplified title from URL
1219                 mobj = re.match(self._VALID_URL, url)
1220                 if mobj is None:
1221                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1222                         return
1223
1224                 video_id = mobj.group(1)
1225
1226                 # Check if video comes from YouTube
1227                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1228                 if mobj2 is not None:
1229                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1230                         return
1231
1232                 # At this point we have a new video
1233                 self._downloader.increment_downloads()
1234
1235                 simple_title = mobj.group(2).decode('utf-8')
1236
1237                 # Retrieve video webpage to extract further information
1238                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1239                 try:
1240                         self.report_download_webpage(video_id)
1241                         webpage = urllib2.urlopen(request).read()
1242                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1243                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1244                         return
1245
1246                 # Extract URL, uploader and title from webpage
1247                 self.report_extraction(video_id)
1248                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1249                 if mobj is not None:
1250                         mediaURL = urllib.unquote(mobj.group(1))
1251                         video_extension = mediaURL[-3:]
1252
1253                         # Extract gdaKey if available
1254                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1255                         if mobj is None:
1256                                 video_url = mediaURL
1257                         else:
1258                                 gdaKey = mobj.group(1)
1259                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1260                 else:
1261                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1262                         if mobj is None:
1263                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1264                                 return
1265                         vardict = parse_qs(mobj.group(1))
1266                         if 'mediaData' not in vardict:
1267                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1268                                 return
1269                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1270                         if mobj is None:
1271                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1272                                 return
1273                         mediaURL = mobj.group(1).replace('\\/', '/')
1274                         video_extension = mediaURL[-3:]
1275                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1276
1277                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1278                 if mobj is None:
1279                         self._downloader.trouble(u'ERROR: unable to extract title')
1280                         return
1281                 video_title = mobj.group(1).decode('utf-8')
1282                 video_title = sanitize_title(video_title)
1283
1284                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1285                 if mobj is None:
1286                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1287                         return
1288                 video_uploader = mobj.group(1)
1289
1290                 try:
1291                         # Process video information
1292                         self._downloader.process_info({
1293                                 'id':           video_id.decode('utf-8'),
1294                                 'url':          video_url.decode('utf-8'),
1295                                 'uploader':     video_uploader.decode('utf-8'),
1296                                 'upload_date':  u'NA',
1297                                 'title':        video_title,
1298                                 'stitle':       simple_title,
1299                                 'ext':          video_extension.decode('utf-8'),
1300                                 'format':       u'NA',
1301                                 'player_url':   None,
1302                         })
1303                 except UnavailableVideoError:
1304                         self._downloader.trouble(u'\nERROR: unable to download video')
1305
1306
1307 class DailymotionIE(InfoExtractor):
1308         """Information Extractor for Dailymotion"""
1309
1310         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1311
1312         def __init__(self, downloader=None):
1313                 InfoExtractor.__init__(self, downloader)
1314
1315         @staticmethod
1316         def suitable(url):
1317                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1318
1319         def report_download_webpage(self, video_id):
1320                 """Report webpage download."""
1321                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1322
1323         def report_extraction(self, video_id):
1324                 """Report information extraction."""
1325                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1326
1327         def _real_initialize(self):
1328                 return
1329
1330         def _real_extract(self, url):
1331                 # Extract id and simplified title from URL
1332                 mobj = re.match(self._VALID_URL, url)
1333                 if mobj is None:
1334                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1335                         return
1336
1337                 # At this point we have a new video
1338                 self._downloader.increment_downloads()
1339                 video_id = mobj.group(1)
1340
1341                 simple_title = mobj.group(2).decode('utf-8')
1342                 video_extension = 'flv'
1343
1344                 # Retrieve video webpage to extract further information
1345                 request = urllib2.Request(url)
1346                 try:
1347                         self.report_download_webpage(video_id)
1348                         webpage = urllib2.urlopen(request).read()
1349                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1350                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1351                         return
1352
1353                 # Extract URL, uploader and title from webpage
1354                 self.report_extraction(video_id)
1355                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1356                 if mobj is None:
1357                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1358                         return
1359                 mediaURL = urllib.unquote(mobj.group(1))
1360
1361                 # if needed add http://www.dailymotion.com/ if relative URL
1362
1363                 video_url = mediaURL
1364
1365                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1366                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1367                 if mobj is None:
1368                         self._downloader.trouble(u'ERROR: unable to extract title')
1369                         return
1370                 video_title = mobj.group(1).decode('utf-8')
1371                 video_title = sanitize_title(video_title)
1372
1373                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1374                 if mobj is None:
1375                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1376                         return
1377                 video_uploader = mobj.group(1)
1378
1379                 try:
1380                         # Process video information
1381                         self._downloader.process_info({
1382                                 'id':           video_id.decode('utf-8'),
1383                                 'url':          video_url.decode('utf-8'),
1384                                 'uploader':     video_uploader.decode('utf-8'),
1385                                 'upload_date':  u'NA',
1386                                 'title':        video_title,
1387                                 'stitle':       simple_title,
1388                                 'ext':          video_extension.decode('utf-8'),
1389                                 'format':       u'NA',
1390                                 'player_url':   None,
1391                         })
1392                 except UnavailableVideoError:
1393                         self._downloader.trouble(u'\nERROR: unable to download video')
1394
1395 class GoogleIE(InfoExtractor):
1396         """Information extractor for video.google.com."""
1397
1398         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1399
1400         def __init__(self, downloader=None):
1401                 InfoExtractor.__init__(self, downloader)
1402
1403         @staticmethod
1404         def suitable(url):
1405                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1406
1407         def report_download_webpage(self, video_id):
1408                 """Report webpage download."""
1409                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1410
1411         def report_extraction(self, video_id):
1412                 """Report information extraction."""
1413                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1414
1415         def _real_initialize(self):
1416                 return
1417
1418         def _real_extract(self, url):
1419                 # Extract id from URL
1420                 mobj = re.match(self._VALID_URL, url)
1421                 if mobj is None:
1422                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1423                         return
1424
1425                 # At this point we have a new video
1426                 self._downloader.increment_downloads()
1427                 video_id = mobj.group(1)
1428
1429                 video_extension = 'mp4'
1430
1431                 # Retrieve video webpage to extract further information
1432                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1433                 try:
1434                         self.report_download_webpage(video_id)
1435                         webpage = urllib2.urlopen(request).read()
1436                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1437                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1438                         return
1439
1440                 # Extract URL, uploader, and title from webpage
1441                 self.report_extraction(video_id)
1442                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1443                 if mobj is None:
1444                         video_extension = 'flv'
1445                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1446                 if mobj is None:
1447                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1448                         return
1449                 mediaURL = urllib.unquote(mobj.group(1))
1450                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1451                 mediaURL = mediaURL.replace('\\x26', '\x26')
1452
1453                 video_url = mediaURL
1454
1455                 mobj = re.search(r'<title>(.*)</title>', webpage)
1456                 if mobj is None:
1457                         self._downloader.trouble(u'ERROR: unable to extract title')
1458                         return
1459                 video_title = mobj.group(1).decode('utf-8')
1460                 video_title = sanitize_title(video_title)
1461                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1462
1463                 # Extract video description
1464                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1465                 if mobj is None:
1466                         self._downloader.trouble(u'ERROR: unable to extract video description')
1467                         return
1468                 video_description = mobj.group(1).decode('utf-8')
1469                 if not video_description:
1470                         video_description = 'No description available.'
1471
1472                 # Extract video thumbnail
1473                 if self._downloader.params.get('forcethumbnail', False):
1474                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1475                         try:
1476                                 webpage = urllib2.urlopen(request).read()
1477                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1478                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1479                                 return
1480                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1481                         if mobj is None:
1482                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1483                                 return
1484                         video_thumbnail = mobj.group(1)
1485                 else:   # we need something to pass to process_info
1486                         video_thumbnail = ''
1487
1488
1489                 try:
1490                         # Process video information
1491                         self._downloader.process_info({
1492                                 'id':           video_id.decode('utf-8'),
1493                                 'url':          video_url.decode('utf-8'),
1494                                 'uploader':     u'NA',
1495                                 'upload_date':  u'NA',
1496                                 'title':        video_title,
1497                                 'stitle':       simple_title,
1498                                 'ext':          video_extension.decode('utf-8'),
1499                                 'format':       u'NA',
1500                                 'player_url':   None,
1501                         })
1502                 except UnavailableVideoError:
1503                         self._downloader.trouble(u'\nERROR: unable to download video')
1504
1505
1506 class PhotobucketIE(InfoExtractor):
1507         """Information extractor for photobucket.com."""
1508
1509         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1510
1511         def __init__(self, downloader=None):
1512                 InfoExtractor.__init__(self, downloader)
1513
1514         @staticmethod
1515         def suitable(url):
1516                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1517
1518         def report_download_webpage(self, video_id):
1519                 """Report webpage download."""
1520                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1521
1522         def report_extraction(self, video_id):
1523                 """Report information extraction."""
1524                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1525
1526         def _real_initialize(self):
1527                 return
1528
1529         def _real_extract(self, url):
1530                 # Extract id from URL
1531                 mobj = re.match(self._VALID_URL, url)
1532                 if mobj is None:
1533                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1534                         return
1535
1536                 # At this point we have a new video
1537                 self._downloader.increment_downloads()
1538                 video_id = mobj.group(1)
1539
1540                 video_extension = 'flv'
1541
1542                 # Retrieve video webpage to extract further information
1543                 request = urllib2.Request(url)
1544                 try:
1545                         self.report_download_webpage(video_id)
1546                         webpage = urllib2.urlopen(request).read()
1547                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1548                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1549                         return
1550
1551                 # Extract URL, uploader, and title from webpage
1552                 self.report_extraction(video_id)
1553                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1554                 if mobj is None:
1555                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1556                         return
1557                 mediaURL = urllib.unquote(mobj.group(1))
1558
1559                 video_url = mediaURL
1560
1561                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1562                 if mobj is None:
1563                         self._downloader.trouble(u'ERROR: unable to extract title')
1564                         return
1565                 video_title = mobj.group(1).decode('utf-8')
1566                 video_title = sanitize_title(video_title)
1567                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1568
1569                 video_uploader = mobj.group(2).decode('utf-8')
1570
1571                 try:
1572                         # Process video information
1573                         self._downloader.process_info({
1574                                 'id':           video_id.decode('utf-8'),
1575                                 'url':          video_url.decode('utf-8'),
1576                                 'uploader':     video_uploader,
1577                                 'upload_date':  u'NA',
1578                                 'title':        video_title,
1579                                 'stitle':       simple_title,
1580                                 'ext':          video_extension.decode('utf-8'),
1581                                 'format':       u'NA',
1582                                 'player_url':   None,
1583                         })
1584                 except UnavailableVideoError:
1585                         self._downloader.trouble(u'\nERROR: unable to download video')
1586
1587
1588 class YahooIE(InfoExtractor):
1589         """Information extractor for video.yahoo.com."""
1590
1591         # _VALID_URL matches all Yahoo! Video URLs
1592         # _VPAGE_URL matches only the extractable '/watch/' URLs
1593         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1594         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1595
1596         def __init__(self, downloader=None):
1597                 InfoExtractor.__init__(self, downloader)
1598
1599         @staticmethod
1600         def suitable(url):
1601                 return (re.match(YahooIE._VALID_URL, url) is not None)
1602
1603         def report_download_webpage(self, video_id):
1604                 """Report webpage download."""
1605                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1606
1607         def report_extraction(self, video_id):
1608                 """Report information extraction."""
1609                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1610
1611         def _real_initialize(self):
1612                 return
1613
1614         def _real_extract(self, url, new_video=True):
1615                 # Extract ID from URL
1616                 mobj = re.match(self._VALID_URL, url)
1617                 if mobj is None:
1618                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1619                         return
1620
1621                 # At this point we have a new video
1622                 self._downloader.increment_downloads()
1623                 video_id = mobj.group(2)
1624                 video_extension = 'flv'
1625
1626                 # Rewrite valid but non-extractable URLs as
1627                 # extractable English language /watch/ URLs
1628                 if re.match(self._VPAGE_URL, url) is None:
1629                         request = urllib2.Request(url)
1630                         try:
1631                                 webpage = urllib2.urlopen(request).read()
1632                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1633                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1634                                 return
1635
1636                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1637                         if mobj is None:
1638                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1639                                 return
1640                         yahoo_id = mobj.group(1)
1641
1642                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1643                         if mobj is None:
1644                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1645                                 return
1646                         yahoo_vid = mobj.group(1)
1647
1648                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1649                         return self._real_extract(url, new_video=False)
1650
1651                 # Retrieve video webpage to extract further information
1652                 request = urllib2.Request(url)
1653                 try:
1654                         self.report_download_webpage(video_id)
1655                         webpage = urllib2.urlopen(request).read()
1656                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1657                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1658                         return
1659
1660                 # Extract uploader and title from webpage
1661                 self.report_extraction(video_id)
1662                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1663                 if mobj is None:
1664                         self._downloader.trouble(u'ERROR: unable to extract video title')
1665                         return
1666                 video_title = mobj.group(1).decode('utf-8')
1667                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1668
1669                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1670                 if mobj is None:
1671                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1672                         return
1673                 video_uploader = mobj.group(1).decode('utf-8')
1674
1675                 # Extract video thumbnail
1676                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1677                 if mobj is None:
1678                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1679                         return
1680                 video_thumbnail = mobj.group(1).decode('utf-8')
1681
1682                 # Extract video description
1683                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1684                 if mobj is None:
1685                         self._downloader.trouble(u'ERROR: unable to extract video description')
1686                         return
1687                 video_description = mobj.group(1).decode('utf-8')
1688                 if not video_description: video_description = 'No description available.'
1689
1690                 # Extract video height and width
1691                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1692                 if mobj is None:
1693                         self._downloader.trouble(u'ERROR: unable to extract video height')
1694                         return
1695                 yv_video_height = mobj.group(1)
1696
1697                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1698                 if mobj is None:
1699                         self._downloader.trouble(u'ERROR: unable to extract video width')
1700                         return
1701                 yv_video_width = mobj.group(1)
1702
1703                 # Retrieve video playlist to extract media URL
1704                 # I'm not completely sure what all these options are, but we
1705                 # seem to need most of them, otherwise the server sends a 401.
1706                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1707                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1708                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1709                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1710                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1711                 try:
1712                         self.report_download_webpage(video_id)
1713                         webpage = urllib2.urlopen(request).read()
1714                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1715                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1716                         return
1717
1718                 # Extract media URL from playlist XML
1719                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1720                 if mobj is None:
1721                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1722                         return
1723                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1724                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1725
1726                 try:
1727                         # Process video information
1728                         self._downloader.process_info({
1729                                 'id':           video_id.decode('utf-8'),
1730                                 'url':          video_url,
1731                                 'uploader':     video_uploader,
1732                                 'upload_date':  u'NA',
1733                                 'title':        video_title,
1734                                 'stitle':       simple_title,
1735                                 'ext':          video_extension.decode('utf-8'),
1736                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1737                                 'description':  video_description,
1738                                 'thumbnail':    video_thumbnail,
1739                                 'description':  video_description,
1740                                 'player_url':   None,
1741                         })
1742                 except UnavailableVideoError:
1743                         self._downloader.trouble(u'\nERROR: unable to download video')
1744
1745
1746 class GenericIE(InfoExtractor):
1747         """Generic last-resort information extractor."""
1748
1749         def __init__(self, downloader=None):
1750                 InfoExtractor.__init__(self, downloader)
1751
1752         @staticmethod
1753         def suitable(url):
1754                 return True
1755
1756         def report_download_webpage(self, video_id):
1757                 """Report webpage download."""
1758                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1759                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1760
1761         def report_extraction(self, video_id):
1762                 """Report information extraction."""
1763                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1764
1765         def _real_initialize(self):
1766                 return
1767
1768         def _real_extract(self, url):
1769                 # At this point we have a new video
1770                 self._downloader.increment_downloads()
1771
1772                 video_id = url.split('/')[-1]
1773                 request = urllib2.Request(url)
1774                 try:
1775                         self.report_download_webpage(video_id)
1776                         webpage = urllib2.urlopen(request).read()
1777                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1778                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1779                         return
1780                 except ValueError, err:
1781                         # since this is the last-resort InfoExtractor, if
1782                         # this error is thrown, it'll be thrown here
1783                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1784                         return
1785
1786                 self.report_extraction(video_id)
1787                 # Start with something easy: JW Player in SWFObject
1788                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1789                 if mobj is None:
1790                         # Broaden the search a little bit
1791                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1792                 if mobj is None:
1793                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1794                         return
1795
1796                 # It's possible that one of the regexes
1797                 # matched, but returned an empty group:
1798                 if mobj.group(1) is None:
1799                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1800                         return
1801
1802                 video_url = urllib.unquote(mobj.group(1))
1803                 video_id  = os.path.basename(video_url)
1804
1805                 # here's a fun little line of code for you:
1806                 video_extension = os.path.splitext(video_id)[1][1:]
1807                 video_id        = os.path.splitext(video_id)[0]
1808
1809                 # it's tempting to parse this further, but you would
1810                 # have to take into account all the variations like
1811                 #   Video Title - Site Name
1812                 #   Site Name | Video Title
1813                 #   Video Title - Tagline | Site Name
1814                 # and so on and so forth; it's just not practical
1815                 mobj = re.search(r'<title>(.*)</title>', webpage)
1816                 if mobj is None:
1817                         self._downloader.trouble(u'ERROR: unable to extract title')
1818                         return
1819                 video_title = mobj.group(1).decode('utf-8')
1820                 video_title = sanitize_title(video_title)
1821                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1822
1823                 # video uploader is domain name
1824                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1825                 if mobj is None:
1826                         self._downloader.trouble(u'ERROR: unable to extract title')
1827                         return
1828                 video_uploader = mobj.group(1).decode('utf-8')
1829
1830                 try:
1831                         # Process video information
1832                         self._downloader.process_info({
1833                                 'id':           video_id.decode('utf-8'),
1834                                 'url':          video_url.decode('utf-8'),
1835                                 'uploader':     video_uploader,
1836                                 'upload_date':  u'NA',
1837                                 'title':        video_title,
1838                                 'stitle':       simple_title,
1839                                 'ext':          video_extension.decode('utf-8'),
1840                                 'format':       u'NA',
1841                                 'player_url':   None,
1842                         })
1843                 except UnavailableVideoError, err:
1844                         self._downloader.trouble(u'\nERROR: unable to download video')
1845
1846
1847 class YoutubeSearchIE(InfoExtractor):
1848         """Information Extractor for YouTube search queries."""
1849         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1850         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1851         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1852         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1853         _youtube_ie = None
1854         _max_youtube_results = 1000
1855
1856         def __init__(self, youtube_ie, downloader=None):
1857                 InfoExtractor.__init__(self, downloader)
1858                 self._youtube_ie = youtube_ie
1859
1860         @staticmethod
1861         def suitable(url):
1862                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1863
1864         def report_download_page(self, query, pagenum):
1865                 """Report attempt to download playlist page with given number."""
1866                 query = query.decode(preferredencoding())
1867                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1868
1869         def _real_initialize(self):
1870                 self._youtube_ie.initialize()
1871
1872         def _real_extract(self, query):
1873                 mobj = re.match(self._VALID_QUERY, query)
1874                 if mobj is None:
1875                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1876                         return
1877
1878                 prefix, query = query.split(':')
1879                 prefix = prefix[8:]
1880                 query  = query.encode('utf-8')
1881                 if prefix == '':
1882                         self._download_n_results(query, 1)
1883                         return
1884                 elif prefix == 'all':
1885                         self._download_n_results(query, self._max_youtube_results)
1886                         return
1887                 else:
1888                         try:
1889                                 n = long(prefix)
1890                                 if n <= 0:
1891                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1892                                         return
1893                                 elif n > self._max_youtube_results:
1894                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1895                                         n = self._max_youtube_results
1896                                 self._download_n_results(query, n)
1897                                 return
1898                         except ValueError: # parsing prefix as integer fails
1899                                 self._download_n_results(query, 1)
1900                                 return
1901
1902         def _download_n_results(self, query, n):
1903                 """Downloads a specified number of results for a query"""
1904
1905                 video_ids = []
1906                 already_seen = set()
1907                 pagenum = 1
1908
1909                 while True:
1910                         self.report_download_page(query, pagenum)
1911                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1912                         request = urllib2.Request(result_url)
1913                         try:
1914                                 page = urllib2.urlopen(request).read()
1915                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1916                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1917                                 return
1918
1919                         # Extract video identifiers
1920                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1921                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1922                                 if video_id not in already_seen:
1923                                         video_ids.append(video_id)
1924                                         already_seen.add(video_id)
1925                                         if len(video_ids) == n:
1926                                                 # Specified n videos reached
1927                                                 for id in video_ids:
1928                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1929                                                 return
1930
1931                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1932                                 for id in video_ids:
1933                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1934                                 return
1935
1936                         pagenum = pagenum + 1
1937
1938 class GoogleSearchIE(InfoExtractor):
1939         """Information Extractor for Google Video search queries."""
1940         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1941         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1942         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1943         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1944         _google_ie = None
1945         _max_google_results = 1000
1946
1947         def __init__(self, google_ie, downloader=None):
1948                 InfoExtractor.__init__(self, downloader)
1949                 self._google_ie = google_ie
1950
1951         @staticmethod
1952         def suitable(url):
1953                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1954
1955         def report_download_page(self, query, pagenum):
1956                 """Report attempt to download playlist page with given number."""
1957                 query = query.decode(preferredencoding())
1958                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1959
1960         def _real_initialize(self):
1961                 self._google_ie.initialize()
1962
1963         def _real_extract(self, query):
1964                 mobj = re.match(self._VALID_QUERY, query)
1965                 if mobj is None:
1966                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1967                         return
1968
1969                 prefix, query = query.split(':')
1970                 prefix = prefix[8:]
1971                 query  = query.encode('utf-8')
1972                 if prefix == '':
1973                         self._download_n_results(query, 1)
1974                         return
1975                 elif prefix == 'all':
1976                         self._download_n_results(query, self._max_google_results)
1977                         return
1978                 else:
1979                         try:
1980                                 n = long(prefix)
1981                                 if n <= 0:
1982                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1983                                         return
1984                                 elif n > self._max_google_results:
1985                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1986                                         n = self._max_google_results
1987                                 self._download_n_results(query, n)
1988                                 return
1989                         except ValueError: # parsing prefix as integer fails
1990                                 self._download_n_results(query, 1)
1991                                 return
1992
1993         def _download_n_results(self, query, n):
1994                 """Downloads a specified number of results for a query"""
1995
1996                 video_ids = []
1997                 already_seen = set()
1998                 pagenum = 1
1999
2000                 while True:
2001                         self.report_download_page(query, pagenum)
2002                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2003                         request = urllib2.Request(result_url)
2004                         try:
2005                                 page = urllib2.urlopen(request).read()
2006                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2007                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2008                                 return
2009
2010                         # Extract video identifiers
2011                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2012                                 video_id = mobj.group(1)
2013                                 if video_id not in already_seen:
2014                                         video_ids.append(video_id)
2015                                         already_seen.add(video_id)
2016                                         if len(video_ids) == n:
2017                                                 # Specified n videos reached
2018                                                 for id in video_ids:
2019                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2020                                                 return
2021
2022                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2023                                 for id in video_ids:
2024                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2025                                 return
2026
2027                         pagenum = pagenum + 1
2028
2029 class YahooSearchIE(InfoExtractor):
2030         """Information Extractor for Yahoo! Video search queries."""
2031         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2032         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2033         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2034         _MORE_PAGES_INDICATOR = r'\s*Next'
2035         _yahoo_ie = None
2036         _max_yahoo_results = 1000
2037
2038         def __init__(self, yahoo_ie, downloader=None):
2039                 InfoExtractor.__init__(self, downloader)
2040                 self._yahoo_ie = yahoo_ie
2041
2042         @staticmethod
2043         def suitable(url):
2044                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2045
2046         def report_download_page(self, query, pagenum):
2047                 """Report attempt to download playlist page with given number."""
2048                 query = query.decode(preferredencoding())
2049                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2050
2051         def _real_initialize(self):
2052                 self._yahoo_ie.initialize()
2053
2054         def _real_extract(self, query):
2055                 mobj = re.match(self._VALID_QUERY, query)
2056                 if mobj is None:
2057                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2058                         return
2059
2060                 prefix, query = query.split(':')
2061                 prefix = prefix[8:]
2062                 query  = query.encode('utf-8')
2063                 if prefix == '':
2064                         self._download_n_results(query, 1)
2065                         return
2066                 elif prefix == 'all':
2067                         self._download_n_results(query, self._max_yahoo_results)
2068                         return
2069                 else:
2070                         try:
2071                                 n = long(prefix)
2072                                 if n <= 0:
2073                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2074                                         return
2075                                 elif n > self._max_yahoo_results:
2076                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2077                                         n = self._max_yahoo_results
2078                                 self._download_n_results(query, n)
2079                                 return
2080                         except ValueError: # parsing prefix as integer fails
2081                                 self._download_n_results(query, 1)
2082                                 return
2083
2084         def _download_n_results(self, query, n):
2085                 """Downloads a specified number of results for a query"""
2086
2087                 video_ids = []
2088                 already_seen = set()
2089                 pagenum = 1
2090
2091                 while True:
2092                         self.report_download_page(query, pagenum)
2093                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2094                         request = urllib2.Request(result_url)
2095                         try:
2096                                 page = urllib2.urlopen(request).read()
2097                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2098                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2099                                 return
2100
2101                         # Extract video identifiers
2102                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2103                                 video_id = mobj.group(1)
2104                                 if video_id not in already_seen:
2105                                         video_ids.append(video_id)
2106                                         already_seen.add(video_id)
2107                                         if len(video_ids) == n:
2108                                                 # Specified n videos reached
2109                                                 for id in video_ids:
2110                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2111                                                 return
2112
2113                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2114                                 for id in video_ids:
2115                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2116                                 return
2117
2118                         pagenum = pagenum + 1
2119
2120 class YoutubePlaylistIE(InfoExtractor):
2121         """Information Extractor for YouTube playlists."""
2122
2123         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2124         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2125         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2126         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2127         _youtube_ie = None
2128
2129         def __init__(self, youtube_ie, downloader=None):
2130                 InfoExtractor.__init__(self, downloader)
2131                 self._youtube_ie = youtube_ie
2132
2133         @staticmethod
2134         def suitable(url):
2135                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2136
2137         def report_download_page(self, playlist_id, pagenum):
2138                 """Report attempt to download playlist page with given number."""
2139                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2140
2141         def _real_initialize(self):
2142                 self._youtube_ie.initialize()
2143
2144         def _real_extract(self, url):
2145                 # Extract playlist id
2146                 mobj = re.match(self._VALID_URL, url)
2147                 if mobj is None:
2148                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2149                         return
2150
2151                 # Single video case
2152                 if mobj.group(3) is not None:
2153                         self._youtube_ie.extract(mobj.group(3))
2154                         return
2155
2156                 # Download playlist pages
2157                 # prefix is 'p' as default for playlists but there are other types that need extra care
2158                 playlist_prefix = mobj.group(1)
2159                 if playlist_prefix == 'a':
2160                         playlist_access = 'artist'
2161                 else:
2162                         playlist_prefix = 'p'
2163                         playlist_access = 'view_play_list'
2164                 playlist_id = mobj.group(2)
2165                 video_ids = []
2166                 pagenum = 1
2167
2168                 while True:
2169                         self.report_download_page(playlist_id, pagenum)
2170                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2171                         try:
2172                                 page = urllib2.urlopen(request).read()
2173                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2174                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2175                                 return
2176
2177                         # Extract video identifiers
2178                         ids_in_page = []
2179                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2180                                 if mobj.group(1) not in ids_in_page:
2181                                         ids_in_page.append(mobj.group(1))
2182                         video_ids.extend(ids_in_page)
2183
2184                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2185                                 break
2186                         pagenum = pagenum + 1
2187
2188                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2189                 playlistend = self._downloader.params.get('playlistend', -1)
2190                 video_ids = video_ids[playliststart:playlistend]
2191
2192                 for id in video_ids:
2193                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2194                 return
2195
2196 class YoutubeUserIE(InfoExtractor):
2197         """Information Extractor for YouTube users."""
2198
2199         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2200         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2201         _GDATA_PAGE_SIZE = 50
2202         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2203         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2204         _youtube_ie = None
2205
2206         def __init__(self, youtube_ie, downloader=None):
2207                 InfoExtractor.__init__(self, downloader)
2208                 self._youtube_ie = youtube_ie
2209
2210         @staticmethod
2211         def suitable(url):
2212                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2213
2214         def report_download_page(self, username, start_index):
2215                 """Report attempt to download user page."""
2216                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2217                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2218
2219         def _real_initialize(self):
2220                 self._youtube_ie.initialize()
2221
2222         def _real_extract(self, url):
2223                 # Extract username
2224                 mobj = re.match(self._VALID_URL, url)
2225                 if mobj is None:
2226                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2227                         return
2228
2229                 username = mobj.group(1)
2230
2231                 # Download video ids using YouTube Data API. Result size per
2232                 # query is limited (currently to 50 videos) so we need to query
2233                 # page by page until there are no video ids - it means we got
2234                 # all of them.
2235
2236                 video_ids = []
2237                 pagenum = 0
2238
2239                 while True:
2240                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2241                         self.report_download_page(username, start_index)
2242
2243                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2244
2245                         try:
2246                                 page = urllib2.urlopen(request).read()
2247                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2248                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2249                                 return
2250
2251                         # Extract video identifiers
2252                         ids_in_page = []
2253
2254                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2255                                 if mobj.group(1) not in ids_in_page:
2256                                         ids_in_page.append(mobj.group(1))
2257
2258                         video_ids.extend(ids_in_page)
2259
2260                         # A little optimization - if current page is not
2261                         # "full", ie. does not contain PAGE_SIZE video ids then
2262                         # we can assume that this page is the last one - there
2263                         # are no more ids on further pages - no need to query
2264                         # again.
2265
2266                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2267                                 break
2268
2269                         pagenum += 1
2270
2271                 all_ids_count = len(video_ids)
2272                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2273                 playlistend = self._downloader.params.get('playlistend', -1)
2274
2275                 if playlistend == -1:
2276                         video_ids = video_ids[playliststart:]
2277                 else:
2278                         video_ids = video_ids[playliststart:playlistend]
2279
2280                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2281                                            (username, all_ids_count, len(video_ids)))
2282
2283                 for video_id in video_ids:
2284                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2285
2286
2287 class DepositFilesIE(InfoExtractor):
2288         """Information extractor for depositfiles.com"""
2289
2290         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2291
2292         def __init__(self, downloader=None):
2293                 InfoExtractor.__init__(self, downloader)
2294
2295         @staticmethod
2296         def suitable(url):
2297                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2298
2299         def report_download_webpage(self, file_id):
2300                 """Report webpage download."""
2301                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2302
2303         def report_extraction(self, file_id):
2304                 """Report information extraction."""
2305                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2306
2307         def _real_initialize(self):
2308                 return
2309
2310         def _real_extract(self, url):
2311                 # At this point we have a new file
2312                 self._downloader.increment_downloads()
2313
2314                 file_id = url.split('/')[-1]
2315                 # Rebuild url in english locale
2316                 url = 'http://depositfiles.com/en/files/' + file_id
2317
2318                 # Retrieve file webpage with 'Free download' button pressed
2319                 free_download_indication = { 'gateway_result' : '1' }
2320                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2321                 try:
2322                         self.report_download_webpage(file_id)
2323                         webpage = urllib2.urlopen(request).read()
2324                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2325                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2326                         return
2327
2328                 # Search for the real file URL
2329                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2330                 if (mobj is None) or (mobj.group(1) is None):
2331                         # Try to figure out reason of the error.
2332                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2333                         if (mobj is not None) and (mobj.group(1) is not None):
2334                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2335                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2336                         else:
2337                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2338                         return
2339
2340                 file_url = mobj.group(1)
2341                 file_extension = os.path.splitext(file_url)[1][1:]
2342
2343                 # Search for file title
2344                 mobj = re.search(r'<b title="(.*?)">', webpage)
2345                 if mobj is None:
2346                         self._downloader.trouble(u'ERROR: unable to extract title')
2347                         return
2348                 file_title = mobj.group(1).decode('utf-8')
2349
2350                 try:
2351                         # Process file information
2352                         self._downloader.process_info({
2353                                 'id':           file_id.decode('utf-8'),
2354                                 'url':          file_url.decode('utf-8'),
2355                                 'uploader':     u'NA',
2356                                 'upload_date':  u'NA',
2357                                 'title':        file_title,
2358                                 'stitle':       file_title,
2359                                 'ext':          file_extension.decode('utf-8'),
2360                                 'format':       u'NA',
2361                                 'player_url':   None,
2362                         })
2363                 except UnavailableVideoError, err:
2364                         self._downloader.trouble(u'ERROR: unable to download file')
2365
2366 class FacebookIE(InfoExtractor):
2367         """Information Extractor for Facebook"""
2368
        # Matches Facebook video.php URLs; the digits of the "v" query
        # parameter are captured as the named group ID.
        _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
        # Login page URL. NOTE(review): presumably requested when logging in;
        # the code using it is not visible here - confirm.
        _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
        # Machine name looked up in the user's .netrc file for credentials.
        _NETRC_MACHINE = 'facebook'
        # Format names searched for in the video page as "<format>_src" entries.
        _available_formats = ['highqual', 'lowqual']
        # Maps each format name to a file extension.
        _video_extensions = {
                'highqual': 'mp4',
                'lowqual': 'mp4',
        }
2377
        def __init__(self, downloader=None):
                """Set up the extractor, passing the optional downloader to InfoExtractor."""
                InfoExtractor.__init__(self, downloader)
2380
2381         @staticmethod
2382         def suitable(url):
2383                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2384
2385         def _reporter(self, message):
2386                 """Add header and report message."""
2387                 self._downloader.to_screen(u'[facebook] %s' % message)
2388
2389         def report_login(self):
2390                 """Report attempt to log in."""
2391                 self._reporter(u'Logging in')
2392
2393         def report_video_webpage_download(self, video_id):
2394                 """Report attempt to download video webpage."""
2395                 self._reporter(u'%s: Downloading video webpage' % video_id)
2396
2397         def report_information_extraction(self, video_id):
2398                 """Report attempt to extract video information."""
2399                 self._reporter(u'%s: Extracting video information' % video_id)
2400
2401         def _parse_page(self, video_webpage):
2402                 """Extract video information from page"""
2403                 # General data
2404                 data = {'title': r'class="video_title datawrap">(.*?)</',
2405                         'description': r'<div class="datawrap">(.*?)</div>',
2406                         'owner': r'\("video_owner_name", "(.*?)"\)',
2407                         'upload_date': r'data-date="(.*?)"',
2408                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2409                         }
2410                 video_info = {}
2411                 for piece in data.keys():
2412                         mobj = re.search(data[piece], video_webpage)
2413                         if mobj is not None:
2414                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2415
2416                 # Video urls
2417                 video_urls = {}
2418                 for fmt in self._available_formats:
2419                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2420                         if mobj is not None:
2421                                 # URL is in a Javascript segment inside an escaped Unicode format within
2422                                 # the generally utf-8 page
2423                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2424                 video_info['video_urls'] = video_urls
2425
2426                 return video_info
2427
2428         def _real_initialize(self):
2429                 if self._downloader is None:
2430                         return
2431
2432                 useremail = None
2433                 password = None
2434                 downloader_params = self._downloader.params
2435
2436                 # Attempt to use provided username and password or .netrc data
2437                 if downloader_params.get('username', None) is not None:
2438                         useremail = downloader_params['username']
2439                         password = downloader_params['password']
2440                 elif downloader_params.get('usenetrc', False):
2441                         try:
2442                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2443                                 if info is not None:
2444                                         useremail = info[0]
2445                                         password = info[2]
2446                                 else:
2447                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2448                         except (IOError, netrc.NetrcParseError), err:
2449                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2450                                 return
2451
2452                 if useremail is None:
2453                         return
2454
2455                 # Log in
2456                 login_form = {
2457                         'email': useremail,
2458                         'pass': password,
2459                         'login': 'Log+In'
2460                         }
2461                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2462                 try:
2463                         self.report_login()
2464                         login_results = urllib2.urlopen(request).read()
2465                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2466                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2467                                 return
2468                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2470                         return
2471
2472         def _real_extract(self, url):
2473                 mobj = re.match(self._VALID_URL, url)
2474                 if mobj is None:
2475                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2476                         return
2477                 video_id = mobj.group('ID')
2478
2479                 # Get video webpage
2480                 self.report_video_webpage_download(video_id)
2481                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2482                 try:
2483                         page = urllib2.urlopen(request)
2484                         video_webpage = page.read()
2485                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2486                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2487                         return
2488
2489                 # Start extracting information
2490                 self.report_information_extraction(video_id)
2491
2492                 # Extract information
2493                 video_info = self._parse_page(video_webpage)
2494
2495                 # uploader
2496                 if 'owner' not in video_info:
2497                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2498                         return
2499                 video_uploader = video_info['owner']
2500
2501                 # title
2502                 if 'title' not in video_info:
2503                         self._downloader.trouble(u'ERROR: unable to extract video title')
2504                         return
2505                 video_title = video_info['title']
2506                 video_title = video_title.decode('utf-8')
2507                 video_title = sanitize_title(video_title)
2508
2509                 # simplified title
2510                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2511                 simple_title = simple_title.strip(ur'_')
2512
2513                 # thumbnail image
2514                 if 'thumbnail' not in video_info:
2515                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2516                         video_thumbnail = ''
2517                 else:
2518                         video_thumbnail = video_info['thumbnail']
2519
2520                 # upload date
2521                 upload_date = u'NA'
2522                 if 'upload_date' in video_info:
2523                         upload_time = video_info['upload_date']
2524                         timetuple = email.utils.parsedate_tz(upload_time)
2525                         if timetuple is not None:
2526                                 try:
2527                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2528                                 except:
2529                                         pass
2530
2531                 # description
2532                 video_description = 'No description available.'
2533                 if (self._downloader.params.get('forcedescription', False) and
2534                     'description' in video_info):
2535                         video_description = video_info['description']
2536
2537                 url_map = video_info['video_urls']
2538                 if len(url_map.keys()) > 0:
2539                         # Decide which formats to download
2540                         req_format = self._downloader.params.get('format', None)
2541                         format_limit = self._downloader.params.get('format_limit', None)
2542
2543                         if format_limit is not None and format_limit in self._available_formats:
2544                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2545                         else:
2546                                 format_list = self._available_formats
2547                         existing_formats = [x for x in format_list if x in url_map]
2548                         if len(existing_formats) == 0:
2549                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2550                                 return
2551                         if req_format is None:
2552                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2553                         elif req_format == '-1':
2554                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2555                         else:
2556                                 # Specific format
2557                                 if req_format not in url_map:
2558                                         self._downloader.trouble(u'ERROR: requested format not available')
2559                                         return
2560                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2561
2562                 for format_param, video_real_url in video_url_list:
2563
2564                         # At this point we have a new video
2565                         self._downloader.increment_downloads()
2566
2567                         # Extension
2568                         video_extension = self._video_extensions.get(format_param, 'mp4')
2569
2570                         # Find the video URL in fmt_url_map or conn paramters
2571                         try:
2572                                 # Process video information
2573                                 self._downloader.process_info({
2574                                         'id':           video_id.decode('utf-8'),
2575                                         'url':          video_real_url.decode('utf-8'),
2576                                         'uploader':     video_uploader.decode('utf-8'),
2577                                         'upload_date':  upload_date,
2578                                         'title':        video_title,
2579                                         'stitle':       simple_title,
2580                                         'ext':          video_extension.decode('utf-8'),
2581                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2582                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2583                                         'description':  video_description.decode('utf-8'),
2584                                         'player_url':   None,
2585                                 })
2586                         except UnavailableVideoError, err:
2587                                 self._downloader.trouble(u'\nERROR: unable to download video')
2588
2589 class BlipTVIE(InfoExtractor):
2590         """Information extractor for blip.tv"""
2591
2592         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2593         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2594
2595         @staticmethod
2596         def suitable(url):
2597                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2598
2599         def report_extraction(self, file_id):
2600                 """Report information extraction."""
2601                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2602
2603         def _simplify_title(self, title):
2604                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2605                 res = res.strip(ur'_')
2606                 return res
2607
2608         def _real_extract(self, url):
2609                 mobj = re.match(self._VALID_URL, url)
2610                 if mobj is None:
2611                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2612                         return
2613
2614                 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2615                 request = urllib2.Request(json_url)
2616                 self.report_extraction(mobj.group(1))
2617                 try:
2618                         json_code = urllib2.urlopen(request).read()
2619                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2620                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2621                         return
2622                 try:
2623                         json_data = json.loads(json_code)
2624                         data = json_data['Post'] if 'Post' in json_data else json_data
2625
2626                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2627                         video_url = data['media']['url']
2628                         umobj = re.match(self._URL_EXT, video_url)
2629                         if umobj is None:
2630                                 raise ValueError('Can not determine filename extension')
2631                         ext = umobj.group(1)
2632
2633                         info = {
2634                                 'id': data['item_id'],
2635                                 'url': video_url,
2636                                 'uploader': data['display_name'],
2637                                 'upload_date': upload_date,
2638                                 'title': data['title'],
2639                                 'stitle': self._simplify_title(data['title']),
2640                                 'ext': ext,
2641                                 'format': data['media']['mimeType'],
2642                                 'thumbnail': data['thumbnailUrl'],
2643                                 'description': data['description'],
2644                                 'player_url': data['embedUrl']
2645                         }
2646                 except (ValueError,KeyError), err:
2647                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2648                         return
2649
2650                 try:
2651                         self._downloader.process_info(info)
2652                 except UnavailableVideoError, err:
2653                         self._downloader.trouble(u'\nERROR: unable to download video')
2654
2655
class PostProcessor(object):
        """Base class of the post-processing steps.

        A downloader keeps a chain of PostProcessor objects, which are
        registered through its add_post_processor() method. Once a download
        has finished successfully, the downloader calls run() on every
        element of the chain in turn: the first one receives an initial
        argument and each of the following ones receives whatever the
        previous one returned.

        The chain ends as soon as a PostProcessor returns None, or when
        there are no elements left.

        Like InfoExtractor objects, PostProcessor objects follow a "mutual
        registration" process with the downloader.
        """

        # Downloader this PostProcessor is attached to, if any
        _downloader = None

        def __init__(self, downloader=None):
                """Create the PostProcessor, optionally attached to a downloader."""
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this PostProcessor to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Process a finished download.

                "information" is a dictionary like the ones InfoExtractors
                compose, with one extra field, "filepath", that points to the
                downloaded file.

                Returning None stops the postprocessing chain. Returning an
                information dictionary, possibly the received one with some
                fields changed, passes it to the next PostProcessor in the
                chain.

                A PostProcessingError exception may be raised as well; the
                downloader that called this method takes it into account.
                """
                # The base class leaves the information untouched
                return information
2701
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that turns the downloaded file into an audio-only file.

        The audio codec of the file is obtained with ffprobe and the audio
        track is extracted or converted with ffmpeg.
        """

        def __init__(self, downloader=None, preferredcodec=None):
                """Create the post processor.

                preferredcodec is the wanted audio codec; None means 'best'.
                """
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec

        @staticmethod
        def get_audio_codec(path):
                """Return the audio codec ffprobe reports for path, or None.

                None is returned when ffprobe cannot be run, when it exits with
                a non-zero status or when its output has no audio stream.
                """
                try:
                        cmd = ['ffprobe', '-show_streams', '--', path]
                        devnull = open(os.path.devnull, 'w')
                        try:
                                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                                output = handle.communicate()[0]
                        finally:
                                # Do not leak the handle used to discard stderr
                                devnull.close()
                        if handle.wait() != 0:
                                return None
                except (IOError, OSError):
                        return None
                # Remember the latest codec name and return it once the stream
                # it was seen in turns out to be an audio stream
                audio_codec = None
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=')[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                """Run ffmpeg to write the audio of path to out_path.

                codec is the value given to -acodec and more_opts is a list of
                additional ffmpeg options. Returns True if ffmpeg could be run
                and exited with status 0, False otherwise.
                """
                try:
                        cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                        devnull = open(os.path.devnull, 'w')
                        try:
                                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
                        finally:
                                # Do not leak the handle used to discard the output
                                devnull.close()
                        return (ret == 0)
                except (IOError, OSError):
                        return False

        def run(self, information):
                """Replace the downloaded file with an audio-only file.

                Returns the information dictionary with 'filepath' pointing to
                the audio file, or None (stopping the chain) when the codec
                cannot be determined, ffmpeg fails or the original file cannot
                be removed. A file that already has the name of the audio file
                is left untouched.
                """
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                more_opts = []
                if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
                        if filecodec == 'aac' or filecodec == 'mp3':
                                # Lossless if possible
                                acodec = 'copy'
                                extension = filecodec
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                        else:
                                # MP3 otherwise.
                                acodec = 'libmp3lame'
                                extension = 'mp3'
                                more_opts = ['-ab', '128k']
                else:
                        # We convert the audio (lossy)
                        acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
                        extension = self._preferredcodec
                        more_opts = ['-ab', '128k']
                        if self._preferredcodec == 'aac':
                                more_opts += ['-f', 'adts']

                (prefix, ext) = os.path.splitext(path)
                new_path = prefix + '.' + extension
                if new_path == path:
                        # ffmpeg would be asked to overwrite its own input, and
                        # removing the original below would delete the result
                        self._downloader.to_screen(u'[ffmpeg] %s already has the audio file name, skipping' % path)
                        return information

                self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
                status = self.run_ffmpeg(path, new_path, acodec, more_opts)

                if not status:
                        self._downloader.to_stderr(u'WARNING: error running ffmpeg')
                        return None

                try:
                        os.remove(path)
                except (IOError, OSError):
                        self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                        return None

                information['filepath'] = new_path
                return information
2783
2784 ### MAIN PROGRAM ###
2785 if __name__ == '__main__':
2786         try:
2787                 # Modules needed only when running the main program
2788                 import getpass
2789                 import optparse
2790
2791                 # Function to update the program file with the latest version from the repository.
2792                 def update_self(downloader, filename):
2793                         # Note: downloader only used for options
2794                         if not os.access(filename, os.W_OK):
2795                                 sys.exit('ERROR: no write permissions on %s' % filename)
2796
2797                         downloader.to_screen('Updating to latest stable version...')
2798                         try:
2799                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2800                                 latest_version = urllib.urlopen(latest_url).read().strip()
2801                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2802                                 newcontent = urllib.urlopen(prog_url).read()
2803                         except (IOError, OSError), err:
2804                                 sys.exit('ERROR: unable to download latest version')
2805                         try:
2806                                 stream = open(filename, 'w')
2807                                 stream.write(newcontent)
2808                                 stream.close()
2809                         except (IOError, OSError), err:
2810                                 sys.exit('ERROR: unable to overwrite current version')
2811                         downloader.to_screen('Updated to version %s' % latest_version)
2812
2813                 # Parse command line
2814                 parser = optparse.OptionParser(
2815                         usage='Usage: %prog [options] url...',
2816                         version='2011.03.29',
2817                         conflict_handler='resolve',
2818                 )
2819
2820                 parser.add_option('-h', '--help',
2821                                 action='help', help='print this help text and exit')
2822                 parser.add_option('-v', '--version',
2823                                 action='version', help='print program version and exit')
2824                 parser.add_option('-U', '--update',
2825                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2826                 parser.add_option('-i', '--ignore-errors',
2827                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2828                 parser.add_option('-r', '--rate-limit',
2829                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2830                 parser.add_option('-R', '--retries',
2831                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2832                 parser.add_option('--playlist-start',
2833                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2834                 parser.add_option('--playlist-end',
2835                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2836                 parser.add_option('--dump-user-agent',
2837                                 action='store_true', dest='dump_user_agent',
2838                                 help='display the current browser identification', default=False)
2839
2840                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2841                 authentication.add_option('-u', '--username',
2842                                 dest='username', metavar='USERNAME', help='account username')
2843                 authentication.add_option('-p', '--password',
2844                                 dest='password', metavar='PASSWORD', help='account password')
2845                 authentication.add_option('-n', '--netrc',
2846                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2847                 parser.add_option_group(authentication)
2848
2849                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2850                 video_format.add_option('-f', '--format',
2851                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2852                 video_format.add_option('--all-formats',
2853                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2854                 video_format.add_option('--max-quality',
2855                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2856                 parser.add_option_group(video_format)
2857
2858                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2859                 verbosity.add_option('-q', '--quiet',
2860                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2861                 verbosity.add_option('-s', '--simulate',
2862                                 action='store_true', dest='simulate', help='do not download video', default=False)
2863                 verbosity.add_option('-g', '--get-url',
2864                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2865                 verbosity.add_option('-e', '--get-title',
2866                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2867                 verbosity.add_option('--get-thumbnail',
2868                                 action='store_true', dest='getthumbnail',
2869                                 help='simulate, quiet but print thumbnail URL', default=False)
2870                 verbosity.add_option('--get-description',
2871                                 action='store_true', dest='getdescription',
2872                                 help='simulate, quiet but print video description', default=False)
2873                 verbosity.add_option('--get-filename',
2874                                 action='store_true', dest='getfilename',
2875                                 help='simulate, quiet but print output filename', default=False)
2876                 verbosity.add_option('--no-progress',
2877                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2878                 verbosity.add_option('--console-title',
2879                                 action='store_true', dest='consoletitle',
2880                                 help='display progress in console titlebar', default=False)
2881                 parser.add_option_group(verbosity)
2882
2883                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2884                 filesystem.add_option('-t', '--title',
2885                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2886                 filesystem.add_option('-l', '--literal',
2887                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2888                 filesystem.add_option('-A', '--auto-number',
2889                                 action='store_true', dest='autonumber',
2890                                 help='number downloaded files starting from 00000', default=False)
2891                 filesystem.add_option('-o', '--output',
2892                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2893                 filesystem.add_option('-a', '--batch-file',
2894                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2895                 filesystem.add_option('-w', '--no-overwrites',
2896                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2897                 filesystem.add_option('-c', '--continue',
2898                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2899                 filesystem.add_option('--cookies',
2900                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2901                 filesystem.add_option('--no-part',
2902                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
2903                 filesystem.add_option('--no-mtime',
2904                                 action='store_false', dest='updatetime',
2905                                 help='do not use the Last-modified header to set the file modification time', default=True)
2906                 parser.add_option_group(filesystem)
2907
2908                 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2909                 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2910                                 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2911                 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2912                                 help='"best", "aac" or "mp3"; best by default')
2913                 parser.add_option_group(postproc)
2914
2915                 (opts, args) = parser.parse_args()
2916
2917                 # Open appropriate CookieJar
2918                 if opts.cookiefile is None:
2919                         jar = cookielib.CookieJar()
2920                 else:
2921                         try:
2922                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2923                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2924                                         jar.load()
2925                         except (IOError, OSError), err:
2926                                 sys.exit(u'ERROR: unable to open cookie file')
2927
2928                 # Dump user agent
2929                 if opts.dump_user_agent:
2930                         print std_headers['User-Agent']
2931                         sys.exit(0)
2932
2933                 # General configuration
2934                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2935                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2936                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2937
2938                 # Batch file verification
2939                 batchurls = []
2940                 if opts.batchfile is not None:
2941                         try:
2942                                 if opts.batchfile == '-':
2943                                         batchfd = sys.stdin
2944                                 else:
2945                                         batchfd = open(opts.batchfile, 'r')
2946                                 batchurls = batchfd.readlines()
2947                                 batchurls = [x.strip() for x in batchurls]
2948                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2949                         except IOError:
2950                                 sys.exit(u'ERROR: batch file could not be read')
2951                 all_urls = batchurls + args
2952
2953                 # Conflicting, missing and erroneous options
2954                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2955                         parser.error(u'using .netrc conflicts with giving username/password')
2956                 if opts.password is not None and opts.username is None:
2957                         parser.error(u'account username missing')
2958                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2959                         parser.error(u'using output template conflicts with using title, literal title or auto number')
2960                 if opts.usetitle and opts.useliteral:
2961                         parser.error(u'using title conflicts with using literal title')
2962                 if opts.username is not None and opts.password is None:
2963                         opts.password = getpass.getpass(u'Type account password and press return:')
2964                 if opts.ratelimit is not None:
2965                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2966                         if numeric_limit is None:
2967                                 parser.error(u'invalid rate limit specified')
2968                         opts.ratelimit = numeric_limit
2969                 if opts.retries is not None:
2970                         try:
2971                                 opts.retries = long(opts.retries)
2972                         except (TypeError, ValueError), err:
2973                                 parser.error(u'invalid retry count specified')
2974                 try:
2975                         opts.playliststart = long(opts.playliststart)
2976                         if opts.playliststart <= 0:
2977                                 raise ValueError
2978                 except (TypeError, ValueError), err:
2979                         parser.error(u'invalid playlist start number specified')
2980                 try:
2981                         opts.playlistend = long(opts.playlistend)
2982                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2983                                 raise ValueError
2984                 except (TypeError, ValueError), err:
2985                         parser.error(u'invalid playlist end number specified')
2986                 if opts.extractaudio:
2987                         if opts.audioformat not in ['best', 'aac', 'mp3']:
2988                                 parser.error(u'invalid audio format specified')
2989
2990                 # Information extractors
2991                 youtube_ie = YoutubeIE()
2992                 metacafe_ie = MetacafeIE(youtube_ie)
2993                 dailymotion_ie = DailymotionIE()
2994                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2995                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2996                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2997                 google_ie = GoogleIE()
2998                 google_search_ie = GoogleSearchIE(google_ie)
2999                 photobucket_ie = PhotobucketIE()
3000                 yahoo_ie = YahooIE()
3001                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3002                 deposit_files_ie = DepositFilesIE()
3003                 facebook_ie = FacebookIE()
3004                 bliptv_ie = BlipTVIE()
3005                 generic_ie = GenericIE()
3006
3007                 # File downloader
3008                 fd = FileDownloader({
3009                         'usenetrc': opts.usenetrc,
3010                         'username': opts.username,
3011                         'password': opts.password,
3012                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3013                         'forceurl': opts.geturl,
3014                         'forcetitle': opts.gettitle,
3015                         'forcethumbnail': opts.getthumbnail,
3016                         'forcedescription': opts.getdescription,
3017                         'forcefilename': opts.getfilename,
3018                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3019                         'format': opts.format,
3020                         'format_limit': opts.format_limit,
3021                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3022                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3023                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3024                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3025                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3026                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3027                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3028                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3029                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3030                                 or u'%(id)s.%(ext)s'),
3031                         'ignoreerrors': opts.ignoreerrors,
3032                         'ratelimit': opts.ratelimit,
3033                         'nooverwrites': opts.nooverwrites,
3034                         'retries': opts.retries,
3035                         'continuedl': opts.continue_dl,
3036                         'noprogress': opts.noprogress,
3037                         'playliststart': opts.playliststart,
3038                         'playlistend': opts.playlistend,
3039                         'logtostderr': opts.outtmpl == '-',
3040                         'consoletitle': opts.consoletitle,
3041                         'nopart': opts.nopart,
3042                         'updatetime': opts.updatetime,
3043                         })
3044                 fd.add_info_extractor(youtube_search_ie)
3045                 fd.add_info_extractor(youtube_pl_ie)
3046                 fd.add_info_extractor(youtube_user_ie)
3047                 fd.add_info_extractor(metacafe_ie)
3048                 fd.add_info_extractor(dailymotion_ie)
3049                 fd.add_info_extractor(youtube_ie)
3050                 fd.add_info_extractor(google_ie)
3051                 fd.add_info_extractor(google_search_ie)
3052                 fd.add_info_extractor(photobucket_ie)
3053                 fd.add_info_extractor(yahoo_ie)
3054                 fd.add_info_extractor(yahoo_search_ie)
3055                 fd.add_info_extractor(deposit_files_ie)
3056                 fd.add_info_extractor(facebook_ie)
3057                 fd.add_info_extractor(bliptv_ie)
3058
3059                 # This must come last since it's the
3060                 # fallback if none of the others work
3061                 fd.add_info_extractor(generic_ie)
3062
3063                 # PostProcessors
3064                 if opts.extractaudio:
3065                         fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3066
3067                 # Update version
3068                 if opts.update_self:
3069                         update_self(fd, sys.argv[0])
3070
3071                 # Maybe do nothing
3072                 if len(all_urls) < 1:
3073                         if not opts.update_self:
3074                                 parser.error(u'you must provide at least one URL')
3075                         else:
3076                                 sys.exit()
3077                 retcode = fd.download(all_urls)
3078
3079                 # Dump cookie jar if requested
3080                 if opts.cookiefile is not None:
3081                         try:
3082                                 jar.save()
3083                         except (IOError, OSError), err:
3084                                 sys.exit(u'ERROR: unable to save cookie jar')
3085
3086                 sys.exit(retcode)
3087
3088         except DownloadError:
3089                 sys.exit(1)
3090         except SameFileError:
3091                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3092         except KeyboardInterrupt:
3093                 sys.exit(u'\nERROR: Interrupted by user')