2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
15 __license__ = 'Public Domain'
16 __version__ = '2011.08.28-phihag'
44 except ImportError: # Python 2.4
47 import cStringIO as StringIO
51 # parse_qs was moved from the cgi module to the urlparse module recently.
53 from urlparse import parse_qs
55 from cgi import parse_qs
63 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
64 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
65 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
66 'Accept-Encoding': 'gzip, deflate',
67 'Accept-Language': 'en-us,en;q=0.5',
70 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
74 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
80 def raiseError(msg, i):
81 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
82 def skipSpace(i, expectMore=True):
83 while i < len(s) and s[i] in ' \t\r\n':
87 raiseError('Premature end', i)
89 def decodeEscape(match):
105 return unichr(int(esc[1:5], 16))
106 if len(esc) == 5+6 and esc[5:7] == '\\u':
107 hi = int(esc[1:5], 16)
108 low = int(esc[7:11], 16)
109 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
110 raise ValueError('Unknown escape ' + str(esc))
117 while s[e-bslashes-1] == '\\':
119 if bslashes % 2 == 1:
123 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
124 stri = rexp.sub(decodeEscape, s[i:e])
130 if s[i] == '}': # Empty dictionary
134 raiseError('Expected a string object key', i)
135 i,key = parseString(i)
137 if i >= len(s) or s[i] != ':':
138 raiseError('Expected a colon', i)
145 raiseError('Expected comma or closing curly brace', i)
150 if s[i] == ']': # Empty array
155 i = skipSpace(i) # Raise exception if premature end
159 raiseError('Expected a comma or closing bracket', i)
161 def parseDiscrete(i):
162 for k,v in {'true': True, 'false': False, 'null': None}.items():
163 if s.startswith(k, i):
165 raiseError('Not a boolean (or null)', i)
167 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
169 raiseError('Not a number', i)
171 if '.' in nums or 'e' in nums or 'E' in nums:
172 return (i+len(nums), float(nums))
173 return (i+len(nums), int(nums))
174 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
177 i,res = CHARMAP.get(s[i], parseNumber)(i)
178 i = skipSpace(i, False)
182 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
185 def preferredencoding():
186 """Get preferred encoding.
188 Returns the best encoding scheme for the system, based on
189 locale.getpreferredencoding() and some further tweaks.
191 def yield_preferredencoding():
193 pref = locale.getpreferredencoding()
199 return yield_preferredencoding().next()
201 def htmlentity_transform(matchobj):
202 """Transforms an HTML entity to a Unicode character.
204 This function receives a match object and is intended to be used with
205 the re.sub() function.
207 entity = matchobj.group(1)
209 # Known non-numeric HTML entity
210 if entity in htmlentitydefs.name2codepoint:
211 return unichr(htmlentitydefs.name2codepoint[entity])
214 mobj = re.match(ur'(?u)#(x?\d+)', entity)
216 numstr = mobj.group(1)
217 if numstr.startswith(u'x'):
219 numstr = u'0%s' % numstr
222 return unichr(long(numstr, base))
224 # Unknown entity in name, return its literal representation
225 return (u'&%s;' % entity)
227 def sanitize_title(utitle):
228 """Sanitizes a video title so it could be used as part of a filename."""
229 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
230 return utitle.replace(unicode(os.sep), u'%')
232 def sanitize_open(filename, open_mode):
233 """Try to open the given filename, and slightly tweak it if this fails.
235 Attempts to open the given filename. If this fails, it tries to change
236 the filename slightly, step by step, until it's either able to open it
237 or it fails and raises a final exception, like the standard open()
240 It returns the tuple (stream, definitive_file_name).
244 if sys.platform == 'win32':
246 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
247 return (sys.stdout, filename)
248 stream = open(filename, open_mode)
249 return (stream, filename)
250 except (IOError, OSError), err:
251 # In case of error, try to remove win32 forbidden chars
252 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
254 # An exception here should be caught in the caller
255 stream = open(filename, open_mode)
256 return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns the POSIX timestamp as an integer-like number, or None when
	the string cannot be parsed as an RFC 2822 date.
	"""
	timestamp = None
	# parsedate_tz returns None on unparseable input instead of raising.
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""

	def __init__(self, downloaded, expected):
		# Both attributes are byte counts kept for the caller's error report.
		self.downloaded = downloaded
		self.expected = expected
314 class YoutubeDLHandler(urllib2.HTTPHandler):
315 """Handler for HTTP requests and responses.
317 This class, when installed with an OpenerDirector, automatically adds
318 the standard headers to every HTTP request and handles gzipped and
319 deflated responses from web servers. If compression is to be avoided in
320 a particular request, the original request in the program code only has
321 to include the HTTP header "Youtubedl-No-Compression", which will be
322 removed before making the real request.
324 Part of this code was copied from:
326 http://techknack.net/python-urllib2-handlers/
328 Andrew Rowls, the author of that code, agreed to release it to the
335 return zlib.decompress(data, -zlib.MAX_WBITS)
337 return zlib.decompress(data)
340 def addinfourl_wrapper(stream, headers, url, code):
341 if hasattr(urllib2.addinfourl, 'getcode'):
342 return urllib2.addinfourl(stream, headers, url, code)
343 ret = urllib2.addinfourl(stream, headers, url)
347 def http_request(self, req):
348 for h in std_headers:
351 req.add_header(h, std_headers[h])
352 if 'Youtubedl-no-compression' in req.headers:
353 if 'Accept-encoding' in req.headers:
354 del req.headers['Accept-encoding']
355 del req.headers['Youtubedl-no-compression']
358 def http_response(self, req, resp):
361 if resp.headers.get('Content-encoding', '') == 'gzip':
362 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
363 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
364 resp.msg = old_resp.msg
366 if resp.headers.get('Content-encoding', '') == 'deflate':
367 gz = StringIO.StringIO(self.deflate(resp.read()))
368 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
369 resp.msg = old_resp.msg
372 class FileDownloader(object):
373 """File Downloader class.
375 File downloader objects are the ones responsible of downloading the
376 actual video file and writing it to disk if the user has requested
377 it, among some other tasks. In most cases there should be one per
378 program. As, given a video URL, the downloader doesn't know how to
379 extract all the needed information, task that InfoExtractors do, it
380 has to pass the URL to one of them.
382 For this, file downloader objects have a method that allows
383 InfoExtractors to be registered in a given order. When it is passed
384 a URL, the file downloader handles it to the first InfoExtractor it
385 finds that reports being able to handle it. The InfoExtractor extracts
386 all the information about the video or videos the URL refers to, and
387 asks the FileDownloader to process the video information, possibly
388 downloading the video.
390 File downloaders accept a lot of parameters. In order not to saturate
391 the object constructor with arguments, it receives a dictionary of
392 options instead. These options are available through the params
393 attribute for the InfoExtractors to use. The FileDownloader also
394 registers itself as the downloader in charge for the InfoExtractors
395 that are added to it, so this is a "mutual registration".
399 username: Username for authentication purposes.
400 password: Password for authentication purposes.
401 usenetrc: Use netrc for authentication instead.
402 quiet: Do not print messages to stdout.
403 forceurl: Force printing final URL.
404 forcetitle: Force printing title.
405 forcethumbnail: Force printing thumbnail URL.
406 forcedescription: Force printing description.
407 forcefilename: Force printing final filename.
408 simulate: Do not download the video files.
409 format: Video format code.
410 format_limit: Highest quality format to try.
411 outtmpl: Template for output names.
412 ignoreerrors: Do not stop on download errors.
413 ratelimit: Download speed limit, in bytes/sec.
414 nooverwrites: Prevent overwriting files.
415 retries: Number of times to retry for HTTP error 5xx
416 continuedl: Try to continue downloads if possible.
417 noprogress: Do not print the progress bar.
418 playliststart: Playlist item to start at.
419 playlistend: Playlist item to end at.
420 logtostderr: Log messages to stderr instead of stdout.
421 consoletitle: Display progress in console window's titlebar.
422 nopart: Do not use temporary .part files.
423 updatetime: Use the Last-modified header to set output file timestamps.
424 writedescription: Write the video description to a .description file
425 writeinfojson: Write the video description to a .info.json file
431 _download_retcode = None
432 _num_downloads = None
435 def __init__(self, params):
436 """Create a FileDownloader object with the given options."""
439 self._download_retcode = 0
440 self._num_downloads = 0
441 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
445 def pmkdir(filename):
446 """Create directory components in filename. Similar to Unix "mkdir -p"."""
447 components = filename.split(os.sep)
448 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
449 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
450 for dir in aggregate:
451 if not os.path.exists(dir):
455 def format_bytes(bytes):
458 if type(bytes) is str:
463 exponent = long(math.log(bytes, 1024.0))
464 suffix = 'bkMGTPEZY'[exponent]
465 converted = float(bytes) / float(1024**exponent)
466 return '%.2f%s' % (converted, suffix)
469 def calc_percent(byte_counter, data_len):
472 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
475 def calc_eta(start, now, total, current):
479 if current == 0 or dif < 0.001: # One millisecond
481 rate = float(current) / dif
482 eta = long((float(total) - float(current)) / rate)
483 (eta_mins, eta_secs) = divmod(eta, 60)
486 return '%02d:%02d' % (eta_mins, eta_secs)
489 def calc_speed(start, now, bytes):
491 if bytes == 0 or dif < 0.001: # One millisecond
492 return '%10s' % '---b/s'
493 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
496 def best_block_size(elapsed_time, bytes):
497 new_min = max(bytes / 2.0, 1.0)
498 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
499 if elapsed_time < 0.001:
501 rate = bytes / elapsed_time
509 def parse_bytes(bytestr):
510 """Parse a string indicating a byte quantity into a long integer."""
511 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
514 number = float(matchobj.group(1))
515 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
516 return long(round(number * multiplier))
518 def add_info_extractor(self, ie):
519 """Add an InfoExtractor object to the end of the list."""
521 ie.set_downloader(self)
523 def add_post_processor(self, pp):
524 """Add a PostProcessor object to the end of the chain."""
526 pp.set_downloader(self)
528 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
529 """Print message to stdout if not in quiet mode."""
531 if not self.params.get('quiet', False):
532 terminator = [u'\n', u''][skip_eol]
533 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
534 self._screen_file.flush()
535 except (UnicodeEncodeError), err:
536 if not ignore_encoding_errors:
539 def to_stderr(self, message):
540 """Print message to stderr."""
541 print >>sys.stderr, message.encode(preferredencoding())
543 def to_cons_title(self, message):
544 """Set console/terminal window title to message."""
545 if not self.params.get('consoletitle', False):
547 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
548 # c_wchar_p() might not be necessary if `message` is
549 # already of type unicode()
550 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
551 elif 'TERM' in os.environ:
552 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
554 def fixed_template(self):
555 """Checks if the output template is fixed."""
556 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
558 def trouble(self, message=None):
559 """Determine action to take when a download problem appears.
561 Depending on if the downloader has been configured to ignore
562 download errors or not, this method may throw an exception or
563 not when errors are found, after printing the message.
565 if message is not None:
566 self.to_stderr(message)
567 if not self.params.get('ignoreerrors', False):
568 raise DownloadError(message)
569 self._download_retcode = 1
571 def slow_down(self, start_time, byte_counter):
572 """Sleep if the download speed is over the rate limit."""
573 rate_limit = self.params.get('ratelimit', None)
574 if rate_limit is None or byte_counter == 0:
577 elapsed = now - start_time
580 speed = float(byte_counter) / elapsed
581 if speed > rate_limit:
582 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
584 def temp_name(self, filename):
585 """Returns a temporary filename for the given filename."""
586 if self.params.get('nopart', False) or filename == u'-' or \
587 (os.path.exists(filename) and not os.path.isfile(filename)):
589 return filename + u'.part'
591 def undo_temp_name(self, filename):
592 if filename.endswith(u'.part'):
593 return filename[:-len(u'.part')]
596 def try_rename(self, old_filename, new_filename):
598 if old_filename == new_filename:
600 os.rename(old_filename, new_filename)
601 except (IOError, OSError), err:
602 self.trouble(u'ERROR: unable to rename file')
604 def try_utime(self, filename, last_modified_hdr):
605 """Try to set the last-modified time of the given file."""
606 if last_modified_hdr is None:
608 if not os.path.isfile(filename):
610 timestr = last_modified_hdr
613 filetime = timeconvert(timestr)
617 os.utime(filename,(time.time(), filetime))
621 def report_writedescription(self, descfn):
622 """ Report that the description file is being written """
623 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
625 def report_writeinfojson(self, infofn):
626 """ Report that the metadata file has been written """
627 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
629 def report_destination(self, filename):
630 """Report destination filename."""
631 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
633 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
634 """Report download progress."""
635 if self.params.get('noprogress', False):
637 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
638 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
639 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
640 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
642 def report_resuming_byte(self, resume_len):
643 """Report attempt to resume at given byte."""
644 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
646 def report_retry(self, count, retries):
647 """Report retry in case of HTTP error 5xx"""
648 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
650 def report_file_already_downloaded(self, file_name):
651 """Report file has already been fully downloaded."""
653 self.to_screen(u'[download] %s has already been downloaded' % file_name)
654 except (UnicodeEncodeError), err:
655 self.to_screen(u'[download] The file has already been downloaded')
657 def report_unable_to_resume(self):
658 """Report it was impossible to resume download."""
659 self.to_screen(u'[download] Unable to resume')
661 def report_finish(self):
662 """Report download finished."""
663 if self.params.get('noprogress', False):
664 self.to_screen(u'[download] Download completed')
668 def increment_downloads(self):
669 """Increment the ordinal that assigns a number to each file."""
670 self._num_downloads += 1
672 def prepare_filename(self, info_dict):
673 """Generate the output filename."""
675 template_dict = dict(info_dict)
676 template_dict['epoch'] = unicode(long(time.time()))
677 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
678 filename = self.params['outtmpl'] % template_dict
680 except (ValueError, KeyError), err:
681 self.trouble(u'ERROR: invalid system charset or erroneous output template')
684 def process_info(self, info_dict):
685 """Process a single dictionary returned by an InfoExtractor."""
686 filename = self.prepare_filename(info_dict)
687 # Do nothing else if in simulate mode
688 if self.params.get('simulate', False):
690 if self.params.get('forcetitle', False):
691 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
692 if self.params.get('forceurl', False):
693 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
694 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
695 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
696 if self.params.get('forcedescription', False) and 'description' in info_dict:
697 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
698 if self.params.get('forcefilename', False) and filename is not None:
699 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('nooverwrites', False) and os.path.exists(filename):
706 self.to_stderr(u'WARNING: file exists and will be skipped')
710 self.pmkdir(filename)
711 except (OSError, IOError), err:
712 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
715 if self.params.get('writedescription', False):
717 descfn = filename + '.description'
718 self.report_writedescription(descfn)
719 descfile = open(descfn, 'wb')
721 descfile.write(info_dict['description'].encode('utf-8'))
724 except (OSError, IOError):
725 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
728 if self.params.get('writeinfojson', False):
729 infofn = filename + '.info.json'
730 self.report_writeinfojson(infofn)
733 except (NameError,AttributeError):
734 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
737 infof = open(infofn, 'wb')
739 json.dump(info_dict, infof)
742 except (OSError, IOError):
743 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
747 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
748 except (OSError, IOError), err:
749 raise UnavailableVideoError
750 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
751 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
753 except (ContentTooShortError, ), err:
754 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
759 self.post_process(filename, info_dict)
760 except (PostProcessingError), err:
761 self.trouble(u'ERROR: postprocessing: %s' % str(err))
764 def download(self, url_list):
765 """Download a given list of URLs."""
766 if len(url_list) > 1 and self.fixed_template():
767 raise SameFileError(self.params['outtmpl'])
770 suitable_found = False
772 # Go to next InfoExtractor if not suitable
773 if not ie.suitable(url):
776 # Suitable InfoExtractor found
777 suitable_found = True
779 # Extract information from URL and process it
782 # Suitable InfoExtractor had been found; go to next URL
785 if not suitable_found:
786 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
788 return self._download_retcode
790 def post_process(self, filename, ie_info):
791 """Run the postprocessing chain on the given file."""
793 info['filepath'] = filename
799 def _download_with_rtmpdump(self, filename, url, player_url):
800 self.report_destination(filename)
801 tmpfilename = self.temp_name(filename)
803 # Check for rtmpdump first
805 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
806 except (OSError, IOError):
807 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
810 # Download using rtmpdump. rtmpdump returns exit code 2 when
811 # the connection was interrumpted and resuming appears to be
812 # possible. This is part of rtmpdump's normal usage, AFAIK.
813 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
814 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
815 while retval == 2 or retval == 1:
816 prevsize = os.path.getsize(tmpfilename)
817 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
818 time.sleep(5.0) # This seems to be needed
819 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
820 cursize = os.path.getsize(tmpfilename)
821 if prevsize == cursize and retval == 1:
824 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
825 self.try_rename(tmpfilename, filename)
828 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
831 def _do_download(self, filename, url, player_url):
832 # Check file already present
833 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
834 self.report_file_already_downloaded(filename)
837 # Attempt to download using rtmpdump
838 if url.startswith('rtmp'):
839 return self._download_with_rtmpdump(filename, url, player_url)
841 tmpfilename = self.temp_name(filename)
845 # Do not include the Accept-Encoding header
846 headers = {'Youtubedl-no-compression': 'True'}
847 basic_request = urllib2.Request(url, None, headers)
848 request = urllib2.Request(url, None, headers)
850 # Establish possible resume length
851 if os.path.isfile(tmpfilename):
852 resume_len = os.path.getsize(tmpfilename)
856 # Request parameters in case of being able to resume
857 if self.params.get('continuedl', False) and resume_len != 0:
858 self.report_resuming_byte(resume_len)
859 request.add_header('Range','bytes=%d-' % resume_len)
863 retries = self.params.get('retries', 0)
864 while count <= retries:
865 # Establish connection
867 data = urllib2.urlopen(request)
869 except (urllib2.HTTPError, ), err:
870 if (err.code < 500 or err.code >= 600) and err.code != 416:
871 # Unexpected HTTP error
873 elif err.code == 416:
874 # Unable to resume (requested range not satisfiable)
876 # Open the connection again without the range header
877 data = urllib2.urlopen(basic_request)
878 content_length = data.info()['Content-Length']
879 except (urllib2.HTTPError, ), err:
880 if err.code < 500 or err.code >= 600:
883 # Examine the reported length
884 if (content_length is not None and
885 (resume_len - 100 < long(content_length) < resume_len + 100)):
886 # The file had already been fully downloaded.
887 # Explanation to the above condition: in issue #175 it was revealed that
888 # YouTube sometimes adds or removes a few bytes from the end of the file,
889 # changing the file size slightly and causing problems for some users. So
890 # I decided to implement a suggested change and consider the file
891 # completely downloaded if the file size differs less than 100 bytes from
892 # the one in the hard drive.
893 self.report_file_already_downloaded(filename)
894 self.try_rename(tmpfilename, filename)
897 # The length does not match, we start the download over
898 self.report_unable_to_resume()
904 self.report_retry(count, retries)
907 self.trouble(u'ERROR: giving up after %s retries' % retries)
910 data_len = data.info().get('Content-length', None)
911 if data_len is not None:
912 data_len = long(data_len) + resume_len
913 data_len_str = self.format_bytes(data_len)
914 byte_counter = 0 + resume_len
920 data_block = data.read(block_size)
922 if len(data_block) == 0:
924 byte_counter += len(data_block)
926 # Open file just in time
929 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
930 filename = self.undo_temp_name(tmpfilename)
931 self.report_destination(filename)
932 except (OSError, IOError), err:
933 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
936 stream.write(data_block)
937 except (IOError, OSError), err:
938 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
940 block_size = self.best_block_size(after - before, len(data_block))
943 percent_str = self.calc_percent(byte_counter, data_len)
944 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
945 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
946 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
949 self.slow_down(start, byte_counter - resume_len)
953 if data_len is not None and byte_counter != data_len:
954 raise ContentTooShortError(byte_counter, long(data_len))
955 self.try_rename(tmpfilename, filename)
957 # Update file modification time
958 if self.params.get('updatetime', True):
959 self.try_utime(filename, data.info().get('last-modified', None))
963 class InfoExtractor(object):
964 """Information Extractor class.
966 Information extractors are the classes that, given a URL, extract
967 information from the video (or videos) the URL refers to. This
968 information includes the real video URL, the video title and simplified
969 title, author and others. The information is stored in a dictionary
970 which is then passed to the FileDownloader. The FileDownloader
971 processes this information possibly downloading the video to the file
972 system, among other possible outcomes. The dictionaries must include
973 the following fields:
975 id: Video identifier.
976 url: Final video URL.
977 uploader: Nickname of the video uploader.
978 title: Literal title.
979 stitle: Simplified title.
980 ext: Video filename extension.
981 format: Video format.
982 player_url: SWF Player URL (may be None).
984 The following fields are optional. Their primary purpose is to allow
985 youtube-dl to serve as the backend for a video search function, such
986 as the one in youtube2mp3. They are only used when their respective
987 forced printing functions are called:
989 thumbnail: Full URL to a video thumbnail image.
990 description: One-line video description.
992 Subclasses of this one should re-define the _real_initialize() and
993 _real_extract() methods, as well as the suitable() static method.
994 Probably, they should also be instantiated and added to the main
1001 def __init__(self, downloader=None):
1002 """Constructor. Receives an optional downloader."""
1004 self.set_downloader(downloader)
1008 """Receives a URL and returns True if suitable for this IE."""
1011 def initialize(self):
1012 """Initializes an instance (authentication, etc)."""
1014 self._real_initialize()
1017 def extract(self, url):
1018 """Extracts URL information and returns it in list of dicts."""
1020 return self._real_extract(url)
1022 def set_downloader(self, downloader):
1023 """Sets the downloader for this IE."""
1024 self._downloader = downloader
1026 def _real_initialize(self):
1027 """Real initialization process. Redefine in subclasses."""
1030 def _real_extract(self, url):
1031 """Real extraction process. Redefine in subclasses."""
1034 class YoutubeIE(InfoExtractor):
1035 """Information extractor for youtube.com."""
1037 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1038 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1039 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1040 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1041 _NETRC_MACHINE = 'youtube'
1042 # Listed in order of quality
1043 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
1044 _video_extensions = {
1050 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1057 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1059 def report_lang(self):
1060 """Report attempt to set language."""
1061 self._downloader.to_screen(u'[youtube] Setting language')
1063 def report_login(self):
1064 """Report attempt to log in."""
1065 self._downloader.to_screen(u'[youtube] Logging in')
1067 def report_age_confirmation(self):
1068 """Report attempt to confirm age."""
1069 self._downloader.to_screen(u'[youtube] Confirming age')
1071 def report_video_webpage_download(self, video_id):
1072 """Report attempt to download video webpage."""
1073 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1075 def report_video_info_webpage_download(self, video_id):
1076 """Report attempt to download video info webpage."""
1077 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1079 def report_information_extraction(self, video_id):
1080 """Report attempt to extract video information."""
1081 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1083 def report_unavailable_format(self, video_id, format):
1084 """Report extracted video URL."""
1085 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1087 def report_rtmp_download(self):
1088 """Indicate the download will use the RTMP protocol."""
1089 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _real_initialize(self):
        """Set YouTube's interface language and, if credentials are available
        (explicit params or ~/.netrc), log in and confirm age.

        NOTE(review): several structural lines (try:/else:/return and the
        initial username/password setup) appear elided from this chunk; the
        statements below are reproduced verbatim.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # Fall back to credentials stored for _NETRC_MACHINE in ~/.netrc
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): this raise presumably belongs to an elided
                # else-branch (no .netrc entry found) -- confirm against the
                # full source before relying on this structure.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # A broken .netrc is only a warning; continue unauthenticated
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language so pages come back in English for the regexes below;
        # failure is non-fatal.
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        if username is None:

        # POST the login form with the collected credentials
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                # Being served the login form again means the login failed
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age by POSTing the age-gate form; unlike the steps above,
        # failure here is treated as an error.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract metadata and download URL(s) for one YouTube video and
        hand each selected format to the downloader via process_info().

        NOTE(review): several structural lines (try:/else:/return/break and
        `if mobj is None:` guards) appear elided from this chunk; the
        statements below are reproduced verbatim.
        """
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (gl/hl pin US-English; has_verified skips
        # some age-interstitial pages)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo JavaScript backslash-escaping (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several get_video_info variants until one yields a 'token'
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname (get_video_info 'author' field)
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: collapse runs of non-simple chars into '_'
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Thumbnail image (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the watch page and normalized to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description (only fetched when the user asked for it)
        video_description = u'No description available.'
        if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1).decode('utf-8')
                # NOTE(review): the lxml path below looks like an alternative
                # branch (writedescription) whose if/else lines are elided.
                html_parser = lxml.etree.HTMLParser(encoding='utf-8')
                vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
                video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
                # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: a single (format=None, url) pair
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> url map from the comma-separated stream map
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            # Optionally cap quality at format_limit
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension: mapped from the itag, defaulting to flv
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): structural lines (try:/else:/return/`if mobj is None:`
    # guards and the `def suitable` header) appear elided from this chunk;
    # all statements are reproduced verbatim.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        # youtube_ie: delegate extractor used for metacafe 'yt-…' ids that
        # are really YouTube videos
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so
        age-restricted videos become accessible."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age by submitting the filter form
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract download info for one metacafe video (or delegate to the
        YouTube extractor for embedded YouTube ids)."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Direct media URL present in the page
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                video_url = mediaURL
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: parse the flashvars blob for mediaData
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Undo JSON escaping of slashes
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): structural lines (try:/return/`if mobj is None:` guards
    # and the `def suitable` header) appear elided from this chunk; all
    # statements are reproduced verbatim.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed for Dailymotion (elided pass/return body)

    def _real_extract(self, url):
        """Extract download info for one Dailymotion video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): structural lines (try:/`if mobj is None:` guards and the
    # `def suitable` header) appear elided from this chunk; all statements
    # are reproduced verbatim.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url):
        """Extract download info for one Google Video entry."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # mp4 link missing: fall back to the flv stream URL
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Undo the JS hex-escapes for '=' and '&'
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (requires a second request, so only done
        # when the user asked for it)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): structural lines (try:/`if mobj is None:` guards and the
    # `def suitable` header) appear elided from this chunk; all statements
    # are reproduced verbatim.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url):
        """Extract download info for one Photobucket flv."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> pattern
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): structural lines (try:/return/`if mobj is None:` guards
    # and the `def suitable` header) appear elided from this chunk; all
    # statements are reproduced verbatim.
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(YahooIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url, new_video=True):
        """Extract download info for one Yahoo! video; non-/watch/ URLs are
        rewritten to the canonical /watch/ form and re-extracted once
        (new_video=False on the recursive call)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) here is the literal 'people'/'profile' path
        # segment; the display name looks like group(2) -- confirm before use.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description: video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # Decode HTML entities (&amp; etc.) left in the URL
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

            # Process video information
            # NOTE(review): 'thumbnail' and 'description' appear twice in
            # this dict literal; the later (un-decoded) entries win.
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'thumbnail': video_thumbnail,
                'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # NOTE(review): structural lines (try:/`if mobj is None:` guards and the
    # `def suitable` header) appear elided from this chunk; all statements
    # are reproduced verbatim.
    # _VALID_URL matches Vimeo URLs
    # NOTE(review): the '.' in '(?:(?:www|player).)?' is unescaped and
    # matches any character -- probably intended as '\.'.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(VimeoIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url, new_video=True):
        """Extract download info for one Vimeo clip via the moogaloop XML."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Retrieve video webpage (moogaloop clip XML) to extract further info
        request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        mobj = re.search(r'<caption>(.*?)</caption>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # # Extract video description
        # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
        #     self._downloader.trouble(u'ERROR: unable to extract video description')
        # video_description = mobj.group(1).decode('utf-8')
        # if not video_description: video_description = 'No description available.'
        video_description = 'Foo.'

        # Vimeo specific: extract request signature
        mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract request signature')
        sig = mobj.group(1).decode('utf-8')

        # Vimeo specific: Extract request signature expiration
        mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
        sig_exp = mobj.group(1).decode('utf-8')

        video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)

            # Process video information
            # NOTE(review): 'thumbnail' and 'description' appear twice in
            # this dict literal; the later (un-decoded) entries win.
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'thumbnail': video_thumbnail,
                'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): structural lines (try:/return/`if mobj is None:` guards)
    # appear elided from this chunk; all statements are reproduced verbatim.
    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url):
        """Best-effort extraction for arbitrary pages: look for a JW Player /
        SWFObject 'file=' flashvar and fall back to a broader search."""
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Provisional id: last path component of the URL
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            # NOTE(review): this error message ('unable to extract title')
            # looks copy-pasted; the branch actually extracts the uploader.
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries.

	Handles "ytsearchN:term" / "ytsearchall:term" pseudo-URLs by scraping
	the HTML result pages and delegating each found video to the
	wrapped YoutubeIE instance.
	"""
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	# Hard cap on results; "ytsearchall" and oversized N are clamped to this.
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		"""Parse the ytsearch prefix and dispatch to _download_n_results."""
		mobj = re.match(self._VALID_QUERY, query)
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
			# No numeric prefix: download first result only.
			self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
				elif n > self._max_youtube_results:
					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
					n = self._max_youtube_results
				self._download_n_results(query, n)
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		# De-duplicate ids across result pages.
		already_seen = set()

			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# Slice the matched href and pull the v= parameter value.
				video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No "Next" link: exhausted all results before reaching n.
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

			pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries.

	Handles "gvsearchN:term" / "gvsearchall:term" pseudo-URLs; mirrors
	YoutubeSearchIE but delegates hits to the wrapped GoogleIE.
	"""
	_VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
	# Hard cap on results; "gvsearchall" and oversized N are clamped to this.
	_max_google_results = 1000

	def __init__(self, google_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._google_ie = google_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._google_ie.initialize()

	def _real_extract(self, query):
		"""Parse the gvsearch prefix and dispatch to _download_n_results."""
		mobj = re.match(self._VALID_QUERY, query)
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
			# No numeric prefix: download first result only.
			self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_google_results)
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
				elif n > self._max_google_results:
					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
					n = self._max_google_results
				self._download_n_results(query, n)
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		# De-duplicate ids across result pages.
		already_seen = set()

			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No "Next" link: exhausted all results before reaching n.
				for id in video_ids:
					self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

			pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries.

	Handles "yvsearchN:term" / "yvsearchall:term" pseudo-URLs; mirrors
	YoutubeSearchIE but delegates hits to the wrapped YahooIE.
	"""
	_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	_MORE_PAGES_INDICATOR = r'\s*Next'
	# Hard cap on results; "yvsearchall" and oversized N are clamped to this.
	_max_yahoo_results = 1000

	def __init__(self, yahoo_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._yahoo_ie.initialize()

	def _real_extract(self, query):
		"""Parse the yvsearch prefix and dispatch to _download_n_results."""
		mobj = re.match(self._VALID_QUERY, query)
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
			# No numeric prefix: download first result only.
			self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
				elif n > self._max_yahoo_results:
					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
					n = self._max_yahoo_results
				self._download_n_results(query, n)
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		# De-duplicate ids across result pages.
		already_seen = set()

			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No "Next" link: exhausted all results before reaching n.
				for id in video_ids:
					self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

			pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists.

	Walks every page of a playlist / artist / user-channel listing,
	collects video ids, applies --playlist-start/--playlist-end, and
	hands each video to the wrapped YoutubeIE.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		# Group 3 is a single video id within the playlist URL:
		# extract just that video instead of the whole list.
		if mobj.group(3) is not None:
			self._youtube_ie.extract(mobj.group(3))

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)

			self.report_download_page(playlist_id, pagenum)
			request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# Stop paging when no "Next" link is present.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:

			pagenum = pagenum + 1

		# Apply the user's --playlist-start/--playlist-end window
		# (playliststart is 1-based on the command line, 0-based here).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Pages through the GData uploads feed of a user (50 ids per request),
	applies --playlist-start/--playlist-end, and delegates each video to
	the wrapped YoutubeIE.
	"""

	_VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# GData caps results per request; we page with start-index.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got

			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:

		all_ids_count = len(video_ids)
		# playliststart is 1-based on the command line, 0-based here.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com

	Simulates pressing the 'Free download' button and scrapes the real
	fileshare URL (or the site's restriction message) from the response.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(DepositFilesIE._VALID_URL, url) is not None)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_initialize(self):

	def _real_extract(self, url):
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
			self.report_download_webpage(file_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				# Collapse whitespace in the site's restriction message.
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

		file_url = mobj.group(1)
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		file_title = mobj.group(1).decode('utf-8')

		# Process file information
		self._downloader.process_info({
			'id':		file_id.decode('utf-8'),
			'url':		file_url.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	file_title,
			'stitle':	file_title,
			'ext':		file_extension.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook.

	Requires login (command-line credentials or .netrc under the
	'facebook' machine). Video metadata and format URLs are scraped from
	Javascript segments embedded in the video page.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Ordered best-first; format selection below relies on this order.
	_available_formats = ['highqual', 'lowqual']
	_video_extensions = {

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(FacebookIE._VALID_URL, url) is not None)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# Regexes for each metadata field we try to scrape.
		data = {'title': r'class="video_title datawrap">(.*?)</',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'upload_date': r'data-date="(.*?)"',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

	def _real_initialize(self):
		"""Log in to Facebook using --username/--password or .netrc."""
		if self._downloader is None:

		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		if useremail is None:

		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
			login_results = urllib2.urlopen(request).read()
			# A login form in the response means the login failed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group('ID')

		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# Collapse any run of non-"simple" chars into a single underscore.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image: missing thumbnail is only a warning, not fatal
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
			video_thumbnail = video_info['thumbnail']

		# upload date: RFC 2822 date string converted to YYYYMMDD
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])

		# description
		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		for format_param, video_real_url in video_url_list:
				# At this point we have a new video
				self._downloader.increment_downloads()

				# Extension is inferred from the format name; default mp4.
				video_extension = self._video_extensions.get(format_param, 'mp4')

				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv

	Fetches the video's JSON description (skin=json API) instead of
	scraping HTML, then builds the info dict from the 'Post' record.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	_URL_EXT = r'^.*\.([a-z0-9]+)$'

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(BlipTVIE._VALID_URL, url) is not None)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

	def _simplify_title(self, title):
		"""Collapse non-"simple" characters into underscores."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# Ask blip.tv for a JSON description of the post.
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
			json_code = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

			json_data = json.loads(json_code)
			if 'Post' in json_data:
				data = json_data['Post']

			# Convert blip.tv's "MM-DD-YY HH:MM(am|pm)" stamp to YYYYMMDD.
			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			umobj = re.match(self._URL_EXT, video_url)
				raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)

			self._downloader.increment_downloads()

				'id': data['item_id'],
				'uploader': data['display_name'],
				'upload_date': upload_date,
				'title': data['title'],
				'stitle': self._simplify_title(data['title']),
				'format': data['media']['mimeType'],
				'thumbnail': data['thumbnailUrl'],
				'description': data['description'],
				'player_url': data['embedUrl']
		except (ValueError,KeyError), err:
			self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		"""
		return information # by default, do nothing
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into a standalone file using ffmpeg/ffprobe.

	preferredcodec is 'best' (keep aac/mp3 losslessly when possible,
	otherwise re-encode to mp3) or an explicit target codec name.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	def get_audio_codec(path):
		"""Return the audio codec name of the file at path via ffprobe,
		or None if ffprobe is unavailable or fails."""
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
		except (IOError, OSError):
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:

	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to transcode path -> out_path with the given audio
		codec and extra options; audio only (-vn)."""
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
		except (IOError, OSError):

	def run(self, information):
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')

		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable.
					more_opts = ['-f', 'adts']
				acodec = 'libmp3lame'
				more_opts = ['-ab', '128k']
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		# Write the audio next to the video, swapping the extension.
		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

			self._downloader.to_stderr(u'WARNING: error running ffmpeg')

		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

		# Point the chain at the extracted audio file.
		information['filepath'] = new_path
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest stable version...')

		# LATEST_VERSION names the tag whose script we should fetch.
		latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
		latest_version = urllib.urlopen(latest_url).read().strip()
		prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
		newcontent = urllib.urlopen(prog_url).read()
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to download latest version')

		# Overwrite this very script in place.
		stream = open(filename, 'wb')
		stream.write(newcontent)
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated to version %s' % latest_version)
3098 def _format_option_string(option):
3099 ''' ('-o', '--option') -> -o, --format METAVAR'''
3103 if option._short_opts: opts.append(option._short_opts[0])
3104 if option._long_opts: opts.append(option._long_opts[0])
3105 if len(opts) > 1: opts.insert(1, ', ')
3107 if option.takes_value(): opts.append(' %s' % option.metavar)
3109 return "".join(opts)
3111 def _find_term_columns():
3112 columns = os.environ.get('COLUMNS', None)
3117 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3118 out,err = sp.communicate()
3119 return int(out.split()[1])
# Help-formatter geometry; max_width is assigned in lines elided from
# this excerpt (default 80).
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Replace optparse's default option rendering with the compact
# "-o, --option METAVAR" form.
fmt.format_option_strings = _format_option_string

# OptionParser keyword arguments; the surrounding "kw = {" / "}" lines
# of this dict literal are elided from this excerpt.
'version' : __version__,
'usage' : '%prog [options] url...',
'conflict_handler' : 'resolve',

parser = optparse.OptionParser(**kw)
# One OptionGroup per help section; they are attached to the parser
# further down, and that attachment order controls --help output order.
general = optparse.OptionGroup(parser, 'General Options')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# General options. Note: ratelimit, retries, playliststart and
# playlistend arrive as strings/raw values and are validated and
# converted after parse_args(), not here.
general.add_option('-h', '--help',
action='help', help='print this help text and exit')
general.add_option('-v', '--version',
action='version', help='print program version and exit')
general.add_option('-U', '--update',
action='store_true', dest='update_self', help='update this program to latest stable version')
general.add_option('-i', '--ignore-errors',
action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--playlist-start',
dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
general.add_option('--playlist-end',
dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
general.add_option('--dump-user-agent',
action='store_true', dest='dump_user_agent',
help='display the current browser identification', default=False)
# Authentication options; username/password/netrc conflicts are
# rejected after parsing (see the validation section of main).
authentication.add_option('-u', '--username',
dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# Video format options. --all-formats stores the sentinel '-1' into the
# same dest as -f; downstream code treats '-1' as "every format".
video_format.add_option('-f', '--format',
action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
action='store_const', dest='format', help='download all available video formats', const='-1')
video_format.add_option('--max-quality',
action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# Verbosity / simulation options. Every --get-* flag implies both
# simulate and quiet when the FileDownloader config is built later.
verbosity.add_option('-q', '--quiet',
action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
action='store_true', dest='simulate', help='do not download video', default=False)
verbosity.add_option('-g', '--get-url',
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
action='store_true', dest='getthumbnail',
help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
action='store_true', dest='getdescription',
help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
action='store_true', dest='getfilename',
help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--no-progress',
action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
action='store_true', dest='consoletitle',
help='display progress in console titlebar', default=False)
# Filesystem options. -t/-l/-A and -o are mutually constrained; those
# conflicts are reported via parser.error() after parsing.
filesystem.add_option('-t', '--title',
action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
action='store_true', dest='autonumber',
help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
filesystem.add_option('--cookies',
dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
filesystem.add_option('--no-part',
action='store_true', dest='nopart', help='do not use .part files', default=False)
# Note: store_false with default=True -- updatetime stays True unless
# --no-mtime is given.
filesystem.add_option('--no-mtime',
action='store_false', dest='updatetime',
help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
action='store_true', dest='writedescription',
help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
action='store_true', dest='writeinfojson',
help='write video metadata to a .info.json file', default=False)
# Post-processing options; audioformat is validated against
# ['best', 'aac', 'mp3'] after parsing.
postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
help='"best", "aac" or "mp3"; best by default')
# Attach the groups; this order is the order of sections in --help.
parser.add_option_group(general)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

opts, args = parser.parse_args()

# The parser itself is returned too so callers can use parser.error().
return parser, opts, args
# Parse the command line first; everything below configures globals
# from the resulting opts.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
jar = cookielib.CookieJar()
# Persistent Mozilla-format jar when --cookies FILE was given; the
# try:/else: framing lines are elided from this excerpt.
jar = cookielib.MozillaCookieJar(opts.cookiefile)
# Only load when the file already exists and is readable; a missing
# file is fine -- it will be created when the jar is dumped at exit.
if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
except (IOError, OSError), err:
sys.exit(u'ERROR: unable to open cookie file')

# --dump-user-agent: print the UA string (presumably exits right
# after, in lines elided here -- verify against the full source).
if opts.dump_user_agent:
print std_headers['User-Agent']

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
# Install a global opener so every urllib2 request in the program gets
# proxy handling, the cookie jar, and the custom YoutubeDLHandler.
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# Batch file verification
# (the surrounding try:/else: lines are elided from this excerpt)
if opts.batchfile is not None:
if opts.batchfile == '-':
batchfd = open(opts.batchfile, 'r')
batchurls = batchfd.readlines()
batchurls = [x.strip() for x in batchurls]
# Drop empty lines and lines starting with '#', '/' or ';' (comments).
batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
sys.exit(u'ERROR: batch file could not be read')
# Batch URLs are processed before the ones given on the command line.
all_urls = batchurls + args

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
parser.error(u'using title conflicts with using literal title')
# Prompt interactively rather than requiring the password on the
# command line (where it would be visible in the process list).
if opts.username is not None and opts.password is None:
opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
# parse_bytes understands suffixed values like '50k' or '44.6m'.
numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
if numeric_limit is None:
parser.error(u'invalid rate limit specified')
opts.ratelimit = numeric_limit
if opts.retries is not None:
opts.retries = long(opts.retries)
except (TypeError, ValueError), err:
parser.error(u'invalid retry count specified')
opts.playliststart = int(opts.playliststart)
if opts.playliststart <= 0:
raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError), err:
parser.error(u'invalid playlist start number specified')
# -1 is the sentinel for "until the end of the playlist".
opts.playlistend = int(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError), err:
parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
if opts.audioformat not in ['best', 'aac', 'mp3']:
parser.error(u'invalid audio format specified')
# Information extractors
# youtube_ie is shared: the metacafe/playlist/user/search extractors
# delegate actual video extraction to it.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
bliptv_ie = BlipTVIE()
vimeo_ie = VimeoIE()
# Catch-all extractor, registered last (see below).
generic_ie = GenericIE()
# File downloader
# (the closing "})" of this call is elided from this excerpt)
fd = FileDownloader({
'usenetrc': opts.usenetrc,
'username': opts.username,
'password': opts.password,
# Any --get-* flag forces quiet mode so only the requested field is
# printed...
'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forcethumbnail': opts.getthumbnail,
'forcedescription': opts.getdescription,
'forcefilename': opts.getfilename,
# ...and also implies simulation (nothing is downloaded).
'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
'format': opts.format,
'format_limit': opts.format_limit,
# Output template: an explicit -o wins; otherwise the first truthy
# branch of this or-chain picks a default matching the combination of
# --all-formats / --title / --literal / --auto-number.
'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
or u'%(id)s.%(ext)s'),
'ignoreerrors': opts.ignoreerrors,
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
'retries': opts.retries,
'continuedl': opts.continue_dl,
'noprogress': opts.noprogress,
'playliststart': opts.playliststart,
'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') means progress/status must go
# to stderr instead.
'logtostderr': opts.outtmpl == '-',
'consoletitle': opts.consoletitle,
'nopart': opts.nopart,
'updatetime': opts.updatetime,
'writedescription': opts.writedescription,
'writeinfojson': opts.writeinfojson,
# Register extractors; registration order is matching priority, so the
# more specific youtube search/playlist/user extractors come before the
# plain YoutubeIE.
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)
fd.add_info_extractor(bliptv_ie)
fd.add_info_extractor(vimeo_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# PostProcessors
if opts.extractaudio:
fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

# -U: replace the running script file itself with the latest release.
if opts.update_self:
updateSelf(fd, sys.argv[0])

# No URLs is only an error when we were not asked just to self-update.
if len(all_urls) < 1:
if not opts.update_self:
parser.error(u'you must provide at least one URL')

retcode = fd.download(all_urls)

# Dump cookie jar if requested
# (the try:/jar.save() lines are elided from this excerpt)
if opts.cookiefile is not None:
except (IOError, OSError), err:
sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run main() (the try:/main() lines are elided from
# this excerpt) and translate known exceptions into exit messages.
if __name__ == '__main__':
except DownloadError:
except SameFileError:
sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
sys.exit(u'\nERROR: Interrupted by user')
3447 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: