2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
16 __license__ = 'Public Domain'
17 __version__ = '2011.09.06-phihag'
19 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
47 except ImportError: # Python 2.4
50 import cStringIO as StringIO
54 # parse_qs was moved from the cgi module to the urlparse module recently.
56 from urlparse import parse_qs
58 from cgi import parse_qs
66 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
67 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
68 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
69 'Accept-Encoding': 'gzip, deflate',
70 'Accept-Language': 'en-us,en;q=0.5',
73 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
77 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
83 def raiseError(msg, i):
84 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
85 def skipSpace(i, expectMore=True):
86 while i < len(s) and s[i] in ' \t\r\n':
90 raiseError('Premature end', i)
92 def decodeEscape(match):
108 return unichr(int(esc[1:5], 16))
109 if len(esc) == 5+6 and esc[5:7] == '\\u':
110 hi = int(esc[1:5], 16)
111 low = int(esc[7:11], 16)
112 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
113 raise ValueError('Unknown escape ' + str(esc))
120 while s[e-bslashes-1] == '\\':
122 if bslashes % 2 == 1:
126 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
127 stri = rexp.sub(decodeEscape, s[i:e])
133 if s[i] == '}': # Empty dictionary
137 raiseError('Expected a string object key', i)
138 i,key = parseString(i)
140 if i >= len(s) or s[i] != ':':
141 raiseError('Expected a colon', i)
148 raiseError('Expected comma or closing curly brace', i)
153 if s[i] == ']': # Empty array
158 i = skipSpace(i) # Raise exception if premature end
162 raiseError('Expected a comma or closing bracket', i)
164 def parseDiscrete(i):
165 for k,v in {'true': True, 'false': False, 'null': None}.items():
166 if s.startswith(k, i):
168 raiseError('Not a boolean (or null)', i)
170 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
172 raiseError('Not a number', i)
174 if '.' in nums or 'e' in nums or 'E' in nums:
175 return (i+len(nums), float(nums))
176 return (i+len(nums), int(nums))
177 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
180 i,res = CHARMAP.get(s[i], parseNumber)(i)
181 i = skipSpace(i, False)
185 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
188 def preferredencoding():
189 """Get preferred encoding.
191 Returns the best encoding scheme for the system, based on
192 locale.getpreferredencoding() and some further tweaks.
194 def yield_preferredencoding():
196 pref = locale.getpreferredencoding()
202 return yield_preferredencoding().next()
205 def htmlentity_transform(matchobj):
206 """Transforms an HTML entity to a Unicode character.
208 This function receives a match object and is intended to be used with
209 the re.sub() function.
211 entity = matchobj.group(1)
213 # Known non-numeric HTML entity
214 if entity in htmlentitydefs.name2codepoint:
215 return unichr(htmlentitydefs.name2codepoint[entity])
218 mobj = re.match(ur'(?u)#(x?\d+)', entity)
220 numstr = mobj.group(1)
221 if numstr.startswith(u'x'):
223 numstr = u'0%s' % numstr
226 return unichr(long(numstr, base))
228 # Unknown entity in name, return its literal representation
229 return (u'&%s;' % entity)
232 def sanitize_title(utitle):
233 """Sanitizes a video title so it could be used as part of a filename."""
234 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
235 return utitle.replace(unicode(os.sep), u'%')
238 def sanitize_open(filename, open_mode):
239 """Try to open the given filename, and slightly tweak it if this fails.
241 Attempts to open the given filename. If this fails, it tries to change
242 the filename slightly, step by step, until it's either able to open it
243 or it fails and raises a final exception, like the standard open()
246 It returns the tuple (stream, definitive_file_name).
250 if sys.platform == 'win32':
252 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
253 return (sys.stdout, filename)
254 stream = open(filename, open_mode)
255 return (stream, filename)
256 except (IOError, OSError), err:
257 # In case of error, try to remove win32 forbidden chars
258 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
260 # An exception here should be caught in the caller
261 stream = open(filename, open_mode)
262 return (stream, filename)
def timeconvert(timestr):
	"""Convert an RFC 2822 date string into a Unix timestamp.

	Returns the timestamp as a number, or None when *timestr* cannot be
	parsed as an RFC 2822 date.
	"""
	# parsedate_tz() returns None for unparseable input, so default the
	# result to None instead of leaving it unbound on that path.
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
274 class DownloadError(Exception):
275 """Download Error exception.
277 This exception may be thrown by FileDownloader objects if they are not
278 configured to continue on errors. They will contain the appropriate
284 class SameFileError(Exception):
285 """Same File exception.
287 This exception will be thrown by FileDownloader objects if they detect
288 multiple files would have to be downloaded to the same file on disk.
293 class PostProcessingError(Exception):
294 """Post Processing exception.
296 This exception may be raised by PostProcessor's .run() method to
297 indicate an error in the postprocessing task.
302 class UnavailableVideoError(Exception):
303 """Unavailable Format exception.
305 This exception will be thrown when a video is requested
306 in a format that is not available for that video.
311 class ContentTooShortError(Exception):
312 """Content Too Short exception.
314 This exception may be raised by FileDownloader objects when a file they
315 download is too small for what the server announced first, indicating
316 the connection was probably interrupted.
def __init__(self, downloaded, expected):
	"""Record the byte counts: *expected* announced vs *downloaded* received."""
	self.expected = expected
	self.downloaded = downloaded
327 class YoutubeDLHandler(urllib2.HTTPHandler):
328 """Handler for HTTP requests and responses.
330 This class, when installed with an OpenerDirector, automatically adds
331 the standard headers to every HTTP request and handles gzipped and
332 deflated responses from web servers. If compression is to be avoided in
333 a particular request, the original request in the program code only has
334 to include the HTTP header "Youtubedl-No-Compression", which will be
335 removed before making the real request.
337 Part of this code was copied from:
339 http://techknack.net/python-urllib2-handlers/
341 Andrew Rowls, the author of that code, agreed to release it to the
348 return zlib.decompress(data, -zlib.MAX_WBITS)
350 return zlib.decompress(data)
353 def addinfourl_wrapper(stream, headers, url, code):
354 if hasattr(urllib2.addinfourl, 'getcode'):
355 return urllib2.addinfourl(stream, headers, url, code)
356 ret = urllib2.addinfourl(stream, headers, url)
360 def http_request(self, req):
361 for h in std_headers:
364 req.add_header(h, std_headers[h])
365 if 'Youtubedl-no-compression' in req.headers:
366 if 'Accept-encoding' in req.headers:
367 del req.headers['Accept-encoding']
368 del req.headers['Youtubedl-no-compression']
371 def http_response(self, req, resp):
374 if resp.headers.get('Content-encoding', '') == 'gzip':
375 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
376 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
377 resp.msg = old_resp.msg
379 if resp.headers.get('Content-encoding', '') == 'deflate':
380 gz = StringIO.StringIO(self.deflate(resp.read()))
381 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
382 resp.msg = old_resp.msg
386 class FileDownloader(object):
387 """File Downloader class.
389 File downloader objects are the ones responsible of downloading the
390 actual video file and writing it to disk if the user has requested
391 it, among some other tasks. In most cases there should be one per
392 program. As, given a video URL, the downloader doesn't know how to
393 extract all the needed information, task that InfoExtractors do, it
394 has to pass the URL to one of them.
396 For this, file downloader objects have a method that allows
397 InfoExtractors to be registered in a given order. When it is passed
398 a URL, the file downloader handles it to the first InfoExtractor it
399 finds that reports being able to handle it. The InfoExtractor extracts
400 all the information about the video or videos the URL refers to, and
401 asks the FileDownloader to process the video information, possibly
402 downloading the video.
404 File downloaders accept a lot of parameters. In order not to saturate
405 the object constructor with arguments, it receives a dictionary of
406 options instead. These options are available through the params
407 attribute for the InfoExtractors to use. The FileDownloader also
408 registers itself as the downloader in charge for the InfoExtractors
409 that are added to it, so this is a "mutual registration".
413 username: Username for authentication purposes.
414 password: Password for authentication purposes.
415 usenetrc: Use netrc for authentication instead.
416 quiet: Do not print messages to stdout.
417 forceurl: Force printing final URL.
418 forcetitle: Force printing title.
419 forcethumbnail: Force printing thumbnail URL.
420 forcedescription: Force printing description.
421 forcefilename: Force printing final filename.
422 simulate: Do not download the video files.
423 format: Video format code.
424 format_limit: Highest quality format to try.
425 outtmpl: Template for output names.
426 ignoreerrors: Do not stop on download errors.
427 ratelimit: Download speed limit, in bytes/sec.
428 nooverwrites: Prevent overwriting files.
429 retries: Number of times to retry for HTTP error 5xx
430 continuedl: Try to continue downloads if possible.
431 noprogress: Do not print the progress bar.
432 playliststart: Playlist item to start at.
433 playlistend: Playlist item to end at.
434 logtostderr: Log messages to stderr instead of stdout.
435 consoletitle: Display progress in console window's titlebar.
436 nopart: Do not use temporary .part files.
437 updatetime: Use the Last-modified header to set output file timestamps.
438 writedescription: Write the video description to a .description file
439 writeinfojson: Write the video description to a .info.json file
445 _download_retcode = None
446 _num_downloads = None
449 def __init__(self, params):
450 """Create a FileDownloader object with the given options."""
453 self._download_retcode = 0
454 self._num_downloads = 0
455 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
459 def format_bytes(bytes):
462 if type(bytes) is str:
467 exponent = long(math.log(bytes, 1024.0))
468 suffix = 'bkMGTPEZY'[exponent]
469 converted = float(bytes) / float(1024 ** exponent)
470 return '%.2f%s' % (converted, suffix)
473 def calc_percent(byte_counter, data_len):
476 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
479 def calc_eta(start, now, total, current):
483 if current == 0 or dif < 0.001: # One millisecond
485 rate = float(current) / dif
486 eta = long((float(total) - float(current)) / rate)
487 (eta_mins, eta_secs) = divmod(eta, 60)
490 return '%02d:%02d' % (eta_mins, eta_secs)
493 def calc_speed(start, now, bytes):
495 if bytes == 0 or dif < 0.001: # One millisecond
496 return '%10s' % '---b/s'
497 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
500 def best_block_size(elapsed_time, bytes):
501 new_min = max(bytes / 2.0, 1.0)
502 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
503 if elapsed_time < 0.001:
505 rate = bytes / elapsed_time
513 def parse_bytes(bytestr):
514 """Parse a string indicating a byte quantity into a long integer."""
515 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
518 number = float(matchobj.group(1))
519 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
520 return long(round(number * multiplier))
522 def add_info_extractor(self, ie):
523 """Add an InfoExtractor object to the end of the list."""
525 ie.set_downloader(self)
527 def add_post_processor(self, pp):
528 """Add a PostProcessor object to the end of the chain."""
530 pp.set_downloader(self)
532 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
533 """Print message to stdout if not in quiet mode."""
535 if not self.params.get('quiet', False):
536 terminator = [u'\n', u''][skip_eol]
537 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
538 self._screen_file.flush()
539 except (UnicodeEncodeError), err:
540 if not ignore_encoding_errors:
def to_stderr(self, message):
	"""Write *message* (plus a trailing newline) to stderr, encoded in the
	preferred locale encoding."""
	encoded = message.encode(preferredencoding())
	print >>sys.stderr, encoded
547 def to_cons_title(self, message):
548 """Set console/terminal window title to message."""
549 if not self.params.get('consoletitle', False):
551 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
552 # c_wchar_p() might not be necessary if `message` is
553 # already of type unicode()
554 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
555 elif 'TERM' in os.environ:
556 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
558 def fixed_template(self):
559 """Checks if the output template is fixed."""
560 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
562 def trouble(self, message=None):
563 """Determine action to take when a download problem appears.
565 Depending on if the downloader has been configured to ignore
566 download errors or not, this method may throw an exception or
567 not when errors are found, after printing the message.
569 if message is not None:
570 self.to_stderr(message)
571 if not self.params.get('ignoreerrors', False):
572 raise DownloadError(message)
573 self._download_retcode = 1
575 def slow_down(self, start_time, byte_counter):
576 """Sleep if the download speed is over the rate limit."""
577 rate_limit = self.params.get('ratelimit', None)
578 if rate_limit is None or byte_counter == 0:
581 elapsed = now - start_time
584 speed = float(byte_counter) / elapsed
585 if speed > rate_limit:
586 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
588 def temp_name(self, filename):
589 """Returns a temporary filename for the given filename."""
590 if self.params.get('nopart', False) or filename == u'-' or \
591 (os.path.exists(filename) and not os.path.isfile(filename)):
593 return filename + u'.part'
595 def undo_temp_name(self, filename):
596 if filename.endswith(u'.part'):
597 return filename[:-len(u'.part')]
600 def try_rename(self, old_filename, new_filename):
602 if old_filename == new_filename:
604 os.rename(old_filename, new_filename)
605 except (IOError, OSError), err:
606 self.trouble(u'ERROR: unable to rename file')
608 def try_utime(self, filename, last_modified_hdr):
609 """Try to set the last-modified time of the given file."""
610 if last_modified_hdr is None:
612 if not os.path.isfile(filename):
614 timestr = last_modified_hdr
617 filetime = timeconvert(timestr)
621 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
	"""Announce that the video description is being saved to *descfn*."""
	message = u'[info] Writing video description to: %s' % descfn
	self.to_screen(message, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
	"""Announce that the JSON metadata file has been written to *infofn*."""
	message = u'[info] Video description metadata as JSON to: %s' % infofn
	self.to_screen(message, ignore_encoding_errors=True)
def report_destination(self, filename):
	"""Announce the destination filename of the download."""
	message = u'[download] Destination: %s' % filename
	self.to_screen(message, ignore_encoding_errors=True)
637 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
638 """Report download progress."""
639 if self.params.get('noprogress', False):
641 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
642 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
643 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
644 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
	"""Announce an attempt to resume the download at byte *resume_len*."""
	message = u'[download] Resuming download at byte %s' % resume_len
	self.to_screen(message)
def report_retry(self, count, retries):
	"""Announce a retry (attempt *count* of *retries*) after an HTTP 5xx error."""
	message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_screen(message)
654 def report_file_already_downloaded(self, file_name):
655 """Report file has already been fully downloaded."""
657 self.to_screen(u'[download] %s has already been downloaded' % file_name)
658 except (UnicodeEncodeError), err:
659 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Announce that resuming the partial download was not possible."""
	message = u'[download] Unable to resume'
	self.to_screen(message)
665 def report_finish(self):
666 """Report download finished."""
667 if self.params.get('noprogress', False):
668 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
	"""Advance the per-run ordinal used to number downloaded files
	(consumed by the %(autonumber)s output-template field)."""
	self._num_downloads = self._num_downloads + 1
676 def prepare_filename(self, info_dict):
677 """Generate the output filename."""
679 template_dict = dict(info_dict)
680 template_dict['epoch'] = unicode(long(time.time()))
681 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
682 filename = self.params['outtmpl'] % template_dict
684 except (ValueError, KeyError), err:
685 self.trouble(u'ERROR: invalid system charset or erroneous output template')
688 def process_info(self, info_dict):
689 """Process a single dictionary returned by an InfoExtractor."""
690 filename = self.prepare_filename(info_dict)
691 # Do nothing else if in simulate mode
692 if self.params.get('simulate', False):
694 if self.params.get('forcetitle', False):
695 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
696 if self.params.get('forceurl', False):
697 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
698 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
699 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
700 if self.params.get('forcedescription', False) and 'description' in info_dict:
701 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
702 if self.params.get('forcefilename', False) and filename is not None:
703 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('nooverwrites', False) and os.path.exists(filename):
710 self.to_stderr(u'WARNING: file exists and will be skipped')
714 dn = os.path.dirname(filename)
715 if dn != '' and not os.path.exists(dn):
717 except (OSError, IOError), err:
718 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
721 if self.params.get('writedescription', False):
723 descfn = filename + '.description'
724 self.report_writedescription(descfn)
725 descfile = open(descfn, 'wb')
727 descfile.write(info_dict['description'].encode('utf-8'))
730 except (OSError, IOError):
731 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
734 if self.params.get('writeinfojson', False):
735 infofn = filename + '.info.json'
736 self.report_writeinfojson(infofn)
739 except (NameError,AttributeError):
740 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
743 infof = open(infofn, 'wb')
745 json.dump(info_dict, infof)
748 except (OSError, IOError):
749 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
753 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
754 except (OSError, IOError), err:
755 raise UnavailableVideoError
756 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
757 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
759 except (ContentTooShortError, ), err:
760 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
765 self.post_process(filename, info_dict)
766 except (PostProcessingError), err:
767 self.trouble(u'ERROR: postprocessing: %s' % str(err))
770 def download(self, url_list):
771 """Download a given list of URLs."""
772 if len(url_list) > 1 and self.fixed_template():
773 raise SameFileError(self.params['outtmpl'])
776 suitable_found = False
778 # Go to next InfoExtractor if not suitable
779 if not ie.suitable(url):
782 # Suitable InfoExtractor found
783 suitable_found = True
785 # Extract information from URL and process it
788 # Suitable InfoExtractor had been found; go to next URL
791 if not suitable_found:
792 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
794 return self._download_retcode
796 def post_process(self, filename, ie_info):
797 """Run the postprocessing chain on the given file."""
799 info['filepath'] = filename
805 def _download_with_rtmpdump(self, filename, url, player_url):
806 self.report_destination(filename)
807 tmpfilename = self.temp_name(filename)
809 # Check for rtmpdump first
811 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
812 except (OSError, IOError):
813 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
816 # Download using rtmpdump. rtmpdump returns exit code 2 when
817 # the connection was interrumpted and resuming appears to be
818 # possible. This is part of rtmpdump's normal usage, AFAIK.
819 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
820 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
821 while retval == 2 or retval == 1:
822 prevsize = os.path.getsize(tmpfilename)
823 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
824 time.sleep(5.0) # This seems to be needed
825 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
826 cursize = os.path.getsize(tmpfilename)
827 if prevsize == cursize and retval == 1:
830 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
831 self.try_rename(tmpfilename, filename)
834 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
837 def _do_download(self, filename, url, player_url):
838 # Check file already present
839 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
840 self.report_file_already_downloaded(filename)
843 # Attempt to download using rtmpdump
844 if url.startswith('rtmp'):
845 return self._download_with_rtmpdump(filename, url, player_url)
847 tmpfilename = self.temp_name(filename)
851 # Do not include the Accept-Encoding header
852 headers = {'Youtubedl-no-compression': 'True'}
853 basic_request = urllib2.Request(url, None, headers)
854 request = urllib2.Request(url, None, headers)
856 # Establish possible resume length
857 if os.path.isfile(tmpfilename):
858 resume_len = os.path.getsize(tmpfilename)
862 # Request parameters in case of being able to resume
863 if self.params.get('continuedl', False) and resume_len != 0:
864 self.report_resuming_byte(resume_len)
865 request.add_header('Range', 'bytes=%d-' % resume_len)
869 retries = self.params.get('retries', 0)
870 while count <= retries:
871 # Establish connection
873 data = urllib2.urlopen(request)
875 except (urllib2.HTTPError, ), err:
876 if (err.code < 500 or err.code >= 600) and err.code != 416:
877 # Unexpected HTTP error
879 elif err.code == 416:
880 # Unable to resume (requested range not satisfiable)
882 # Open the connection again without the range header
883 data = urllib2.urlopen(basic_request)
884 content_length = data.info()['Content-Length']
885 except (urllib2.HTTPError, ), err:
886 if err.code < 500 or err.code >= 600:
889 # Examine the reported length
890 if (content_length is not None and
891 (resume_len - 100 < long(content_length) < resume_len + 100)):
892 # The file had already been fully downloaded.
893 # Explanation to the above condition: in issue #175 it was revealed that
894 # YouTube sometimes adds or removes a few bytes from the end of the file,
895 # changing the file size slightly and causing problems for some users. So
896 # I decided to implement a suggested change and consider the file
897 # completely downloaded if the file size differs less than 100 bytes from
898 # the one in the hard drive.
899 self.report_file_already_downloaded(filename)
900 self.try_rename(tmpfilename, filename)
903 # The length does not match, we start the download over
904 self.report_unable_to_resume()
910 self.report_retry(count, retries)
913 self.trouble(u'ERROR: giving up after %s retries' % retries)
916 data_len = data.info().get('Content-length', None)
917 if data_len is not None:
918 data_len = long(data_len) + resume_len
919 data_len_str = self.format_bytes(data_len)
920 byte_counter = 0 + resume_len
926 data_block = data.read(block_size)
928 if len(data_block) == 0:
930 byte_counter += len(data_block)
932 # Open file just in time
935 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
936 assert stream is not None
937 filename = self.undo_temp_name(tmpfilename)
938 self.report_destination(filename)
939 except (OSError, IOError), err:
940 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
943 stream.write(data_block)
944 except (IOError, OSError), err:
945 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
947 block_size = self.best_block_size(after - before, len(data_block))
950 percent_str = self.calc_percent(byte_counter, data_len)
951 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
952 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
953 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
956 self.slow_down(start, byte_counter - resume_len)
959 self.trouble(u'\nERROR: Did not get any data blocks')
963 if data_len is not None and byte_counter != data_len:
964 raise ContentTooShortError(byte_counter, long(data_len))
965 self.try_rename(tmpfilename, filename)
967 # Update file modification time
968 if self.params.get('updatetime', True):
969 self.try_utime(filename, data.info().get('last-modified', None))
974 class InfoExtractor(object):
975 """Information Extractor class.
977 Information extractors are the classes that, given a URL, extract
978 information from the video (or videos) the URL refers to. This
979 information includes the real video URL, the video title and simplified
980 title, author and others. The information is stored in a dictionary
981 which is then passed to the FileDownloader. The FileDownloader
982 processes this information possibly downloading the video to the file
983 system, among other possible outcomes. The dictionaries must include
984 the following fields:
986 id: Video identifier.
987 url: Final video URL.
988 uploader: Nickname of the video uploader.
989 title: Literal title.
990 stitle: Simplified title.
991 ext: Video filename extension.
992 format: Video format.
993 player_url: SWF Player URL (may be None).
995 The following fields are optional. Their primary purpose is to allow
996 youtube-dl to serve as the backend for a video search function, such
997 as the one in youtube2mp3. They are only used when their respective
998 forced printing functions are called:
1000 thumbnail: Full URL to a video thumbnail image.
1001 description: One-line video description.
1003 Subclasses of this one should re-define the _real_initialize() and
1004 _real_extract() methods, as well as the suitable() static method.
1005 Probably, they should also be instantiated and added to the main
1012 def __init__(self, downloader=None):
1013 """Constructor. Receives an optional downloader."""
1015 self.set_downloader(downloader)
1019 """Receives a URL and returns True if suitable for this IE."""
1022 def initialize(self):
1023 """Initializes an instance (authentication, etc)."""
1025 self._real_initialize()
1028 def extract(self, url):
1029 """Extracts URL information and returns it in list of dicts."""
1031 return self._real_extract(url)
def set_downloader(self, downloader):
	"""Attach *downloader* as the FileDownloader this extractor reports to.

	May receive None to detach the extractor from any downloader.
	"""
	self._downloader = downloader
1037 def _real_initialize(self):
1038 """Real initialization process. Redefine in subclasses."""
1041 def _real_extract(self, url):
1042 """Real extraction process. Redefine in subclasses."""
1046 class YoutubeIE(InfoExtractor):
1047 """Information extractor for youtube.com."""
1049 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1050 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1051 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1052 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1053 _NETRC_MACHINE = 'youtube'
1054 # Listed in order of quality
1055 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1056 _video_extensions = {
1062 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1069 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
	"""Announce the attempt to force the YouTube interface language."""
	message = u'[youtube] Setting language'
	self._downloader.to_screen(message)
def report_login(self):
	"""Announce the attempt to log in to YouTube."""
	message = u'[youtube] Logging in'
	self._downloader.to_screen(message)
def report_age_confirmation(self):
	"""Announce the attempt to confirm the user's age."""
	message = u'[youtube] Confirming age'
	self._downloader.to_screen(message)
def report_video_webpage_download(self, video_id):
	"""Announce that the watch page for *video_id* is being downloaded."""
	message = u'[youtube] %s: Downloading video webpage' % video_id
	self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
	"""Announce that the video-info page for *video_id* is being downloaded."""
	message = u'[youtube] %s: Downloading video info webpage' % video_id
	self._downloader.to_screen(message)
def report_information_extraction(self, video_id):
	"""Announce that metadata extraction for *video_id* has started."""
	message = u'[youtube] %s: Extracting video information' % video_id
	self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
	"""Announce that *format* is not offered for video *video_id*."""
	message = u'[youtube] %s: Format %s not available' % (video_id, format)
	self._downloader.to_screen(message)
def report_rtmp_download(self):
	"""Announce that the download will go through the RTMP protocol."""
	message = u'[youtube] RTMP download detected'
	self._downloader.to_screen(message)
# Session setup for YouTube: set interface language, optionally log in
# (explicit --username/--password or .netrc via self._NETRC_MACHINE), then
# POST the age-confirmation form so age-gated videos are reachable.
# NOTE(review): the embedded original line numbers jump (1104->1109,
# 1124->1128, ...) -- this view is missing lines (at least the early
# `return`s and the `try:` statements matched by the visible `except`
# clauses). Code is kept verbatim; do not infer control flow from the
# visible lines alone.
1103 def _real_initialize(self):
1104 if self._downloader is None:
1109 downloader_params = self._downloader.params
1111 # Attempt to use provided username and password or .netrc data
1112 if downloader_params.get('username', None) is not None:
1113 username = downloader_params['username']
1114 password = downloader_params['password']
1115 elif downloader_params.get('usenetrc', False):
1117 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1118 if info is not None:
# (missing lines: unpacking of the netrc tuple into username/password)
1122 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1123 except (IOError, netrc.NetrcParseError), err:
# .netrc problems are non-fatal: warn and continue without credentials.
1124 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the site language to English so later regex scraping is stable.
1128 request = urllib2.Request(self._LANG_URL)
1131 urllib2.urlopen(request).read()
1132 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1133 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1136 # No authentication to be performed
1137 if username is None:
# Login form fields expected by the (2011-era) YouTube login endpoint.
1142 'current_form': 'loginForm',
1144 'action_login': 'Log In',
1145 'username': username,
1146 'password': password,
1148 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1151 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
1152 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1153 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1155 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1156 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Age confirmation is an ERROR (not a warning) because age-gated videos
# cannot be extracted without it.
1162 'action_confirm': 'Confirm',
1164 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1166 self.report_age_confirmation()
1167 age_results = urllib2.urlopen(request).read()
1168 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1169 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main YouTube extraction: download the /watch page and get_video_info,
# pick one or more (format, url) pairs, and hand each to process_info().
# NOTE(review): embedded line numbers jump throughout -- `return`s after
# trouble() calls, `try:` lines, and some `else:` branches are missing
# from this view. Kept verbatim.
1172 def _real_extract(self, url):
1173 # Extract video id from URL
1174 mobj = re.match(self._VALID_URL, url)
1176 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# group(2) is the 11-char video id per _VALID_URL (declared outside this view).
1178 video_id = mobj.group(2)
1181 self.report_video_webpage_download(video_id)
# gl=US&hl=en pins locale; has_verified=1 helps with age-gated pages.
1182 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1184 video_webpage = urllib2.urlopen(request).read()
1185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1189 # Attempt to extract SWF player URL
1190 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1191 if mobj is not None:
# Un-escape JSON-style backslash escapes in the SWF URL.
1192 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# NOTE(review): the else-branch (presumably `player_url = None`) falls in
# the 1193-1196 gap; as shown, player_url would be unbound when the regex
# does not match -- confirm against the full file.
1197 self.report_video_info_webpage_download(video_id)
# Try several 'el' variants; some work for embedded/VEVO/age-gated videos.
1198 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1199 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1200 % (video_id, el_type))
1201 request = urllib2.Request(video_info_url)
1203 video_info_webpage = urllib2.urlopen(request).read()
# get_video_info returns application/x-www-form-urlencoded data.
1204 video_info = parse_qs(video_info_webpage)
# 'token' present means this variant succeeded; stop trying others.
1205 if 'token' in video_info:
1207 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1208 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1210 if 'token' not in video_info:
1211 if 'reason' in video_info:
1212 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1214 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1217 # Start extracting information
1218 self.report_information_extraction(video_id)
# uploader
1221 if 'author' not in video_info:
1222 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1224 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
1227 if 'title' not in video_info:
1228 self._downloader.trouble(u'ERROR: unable to extract video title')
1230 video_title = urllib.unquote_plus(video_info['title'][0])
1231 video_title = video_title.decode('utf-8')
1232 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse runs of non-simple chars to underscores.
1235 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1236 simple_title = simple_title.strip(ur'_')
# thumbnail (best effort)
1239 if 'thumbnail_url' not in video_info:
1240 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1241 video_thumbnail = ''
1242 else: # don't panic if we can't find it
1243 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page, tried against several formats.
1247 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1248 if mobj is not None:
1249 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1250 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1251 for expression in format_expressions:
1253 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: only parsed when the user asked for it (lxml is optional).
1261 video_description = u'No description available.'
1262 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1263 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1264 if mobj is not None:
1265 video_description = mobj.group(1).decode('utf-8')
1267 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1268 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1269 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1270 # TODO use another parser
# token
1273 video_token = urllib.unquote_plus(video_info['token'][0])
1275 # Decide which formats to download
1276 req_format = self._downloader.params.get('format', None)
1278 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1279 self.report_rtmp_download()
# RTMP streams have no itag; format is None.
1280 video_url_list = [(None, video_info['conn'][0])]
1281 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# stream map is a comma-separated list of urlencoded itag/url records.
1282 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1283 url_data = [parse_qs(uds) for uds in url_data_strs]
1284 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1285 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# -f/--format-limit caps quality: keep formats at or below the limit.
1287 format_limit = self._downloader.params.get('format_limit', None)
1288 if format_limit is not None and format_limit in self._available_formats:
1289 format_list = self._available_formats[self._available_formats.index(format_limit):]
1291 format_list = self._available_formats
1292 existing_formats = [x for x in format_list if x in url_map]
1293 if len(existing_formats) == 0:
1294 self._downloader.trouble(u'ERROR: no known formats available for video')
1296 if req_format is None:
1297 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1298 elif req_format == '-1':
1299 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1302 if req_format not in url_map:
1303 self._downloader.trouble(u'ERROR: requested format not available')
1305 video_url_list = [(req_format, url_map[req_format])] # Specific format
1307 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1310 for format_param, video_real_url in video_url_list:
1311 # At this point we have a new video
1312 self._downloader.increment_downloads()
# Extension
1315 video_extension = self._video_extensions.get(format_param, 'flv')
1318 # Process video information
1319 self._downloader.process_info({
1320 'id': video_id.decode('utf-8'),
1321 'url': video_real_url.decode('utf-8'),
1322 'uploader': video_uploader.decode('utf-8'),
1323 'upload_date': upload_date,
1324 'title': video_title,
1325 'stitle': simple_title,
1326 'ext': video_extension.decode('utf-8'),
# Python-2 `and/or` ternary: u'NA' for RTMP (format_param is None),
# otherwise the decoded itag string.
1327 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1328 'thumbnail': video_thumbnail.decode('utf-8'),
1329 'description': video_description,
1330 'player_url': player_url,
1332 except UnavailableVideoError, err:
1333 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for metacafe.com. Delegates yt-prefixed ids to a YoutubeIE
# instance (passed into __init__), otherwise scrapes the watch page.
# NOTE(review): embedded original line numbers jump throughout -- `try:`
# lines, `return`s, `suitable()`'s def line etc. fall in the gaps. Kept verbatim.
1336 class MetacafeIE(InfoExtractor):
1337 """Information Extractor for metacafe.com."""
1339 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1340 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1341 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1344 def __init__(self, youtube_ie, downloader=None):
1345 InfoExtractor.__init__(self, downloader)
# Kept so Metacafe pages that wrap YouTube videos can be delegated.
1346 self._youtube_ie = youtube_ie
1350 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1352 def report_disclaimer(self):
1353 """Report disclaimer retrieval."""
1354 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1356 def report_age_confirmation(self):
1357 """Report attempt to confirm age."""
1358 self._downloader.to_screen(u'[metacafe] Confirming age')
1360 def report_download_webpage(self, video_id):
1361 """Report webpage download."""
1362 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1364 def report_extraction(self, video_id):
1365 """Report information extraction."""
1366 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Session setup: fetch the disclaimer page, then POST the family-filter
# form so filtered videos become accessible.
1368 def _real_initialize(self):
1369 # Retrieve disclaimer
1370 request = urllib2.Request(self._DISCLAIMER)
1372 self.report_disclaimer()
1373 disclaimer = urllib2.urlopen(request).read()
1374 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1375 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1381 'submit': "Continue - I'm over 18",
1383 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1385 self.report_age_confirmation()
1386 disclaimer = urllib2.urlopen(request).read()
1387 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1388 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1391 def _real_extract(self, url):
1392 # Extract id and simplified title from URL
1393 mobj = re.match(self._VALID_URL, url)
1395 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1398 video_id = mobj.group(1)
1400 # Check if video comes from YouTube
1401 mobj2 = re.match(r'^yt-(.*)$', video_id)
1402 if mobj2 is not None:
# Delegate to YouTube extractor; a `return` presumably follows in the gap.
1403 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1406 # At this point we have a new video
1407 self._downloader.increment_downloads()
1409 simple_title = mobj.group(2).decode('utf-8')
1411 # Retrieve video webpage to extract further information
1412 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1414 self.report_download_webpage(video_id)
1415 webpage = urllib2.urlopen(request).read()
1416 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message typo "unable retrieve" (missing "to") -- runtime
# string, left untouched here.
1417 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1420 # Extract URL, uploader and title from webpage
1421 self.report_extraction(video_id)
# Old-style pages embed mediaURL directly; gdaKey is an access token.
1422 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1423 if mobj is not None:
1424 mediaURL = urllib.unquote(mobj.group(1))
1425 video_extension = mediaURL[-3:]
1427 # Extract gdaKey if available
1428 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1430 video_url = mediaURL
1432 gdaKey = mobj.group(1)
1433 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Newer pages: media info lives in the flashvars' mediaData JSON blob.
1435 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1437 self._downloader.trouble(u'ERROR: unable to extract media URL')
1439 vardict = parse_qs(mobj.group(1))
1440 if 'mediaData' not in vardict:
1441 self._downloader.trouble(u'ERROR: unable to extract media URL')
1443 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1445 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape JSON forward slashes.
1447 mediaURL = mobj.group(1).replace('\\/', '/')
1448 video_extension = mediaURL[-3:]
1449 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1451 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1453 self._downloader.trouble(u'ERROR: unable to extract title')
1455 video_title = mobj.group(1).decode('utf-8')
1456 video_title = sanitize_title(video_title)
1458 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1460 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1462 video_uploader = mobj.group(1)
1465 # Process video information
1466 self._downloader.process_info({
1467 'id': video_id.decode('utf-8'),
1468 'url': video_url.decode('utf-8'),
1469 'uploader': video_uploader.decode('utf-8'),
1470 'upload_date': u'NA',
1471 'title': video_title,
1472 'stitle': simple_title,
1473 'ext': video_extension.decode('utf-8'),
1477 except UnavailableVideoError:
1478 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for dailymotion.com: scrapes the watch page's addVariable
# flashvars for the media URL, plus <title> and owner attribute.
# NOTE(review): embedded line numbers jump -- `try:`, `return`, `if mobj
# is None:` lines are in the gaps. Kept verbatim.
1481 class DailymotionIE(InfoExtractor):
1482 """Information Extractor for Dailymotion"""
1484 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1486 def __init__(self, downloader=None):
1487 InfoExtractor.__init__(self, downloader)
1491 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1493 def report_download_webpage(self, video_id):
1494 """Report webpage download."""
1495 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1497 def report_extraction(self, video_id):
1498 """Report information extraction."""
1499 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No session setup needed for Dailymotion.
1501 def _real_initialize(self):
1504 def _real_extract(self, url):
1505 # Extract id and simplified title from URL
1506 mobj = re.match(self._VALID_URL, url)
1508 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1511 # At this point we have a new video
1512 self._downloader.increment_downloads()
1513 video_id = mobj.group(1)
# group(2) is the URL slug after the underscore; used as the simple title.
1515 simple_title = mobj.group(2).decode('utf-8')
1516 video_extension = 'flv'
1518 # Retrieve video webpage to extract further information
1519 request = urllib2.Request(url)
1521 self.report_download_webpage(video_id)
1522 webpage = urllib2.urlopen(request).read()
1523 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message typo "unable retrieve" -- runtime string, untouched.
1524 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1527 # Extract URL, uploader and title from webpage
1528 self.report_extraction(video_id)
1529 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1531 self._downloader.trouble(u'ERROR: unable to extract media URL')
1533 mediaURL = urllib.unquote(mobj.group(1))
1535 # if needed add http://www.dailymotion.com/ if relative URL
1537 video_url = mediaURL
1539 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1540 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1542 self._downloader.trouble(u'ERROR: unable to extract title')
1544 video_title = mobj.group(1).decode('utf-8')
1545 video_title = sanitize_title(video_title)
1547 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1549 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1551 video_uploader = mobj.group(1)
1554 # Process video information
1555 self._downloader.process_info({
1556 'id': video_id.decode('utf-8'),
1557 'url': video_url.decode('utf-8'),
1558 'uploader': video_uploader.decode('utf-8'),
1559 'upload_date': u'NA',
1560 'title': video_title,
1561 'stitle': simple_title,
1562 'ext': video_extension.decode('utf-8'),
1566 except UnavailableVideoError:
1567 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for video.google.com (Google Video, defunct service): prefers
# the mp4 download_url, falls back to the escaped flv videoUrl.
# NOTE(review): embedded line numbers jump -- guard/`return`/`try:` lines
# are in the gaps. Kept verbatim.
1570 class GoogleIE(InfoExtractor):
1571 """Information extractor for video.google.com."""
1573 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1575 def __init__(self, downloader=None):
1576 InfoExtractor.__init__(self, downloader)
1580 return (re.match(GoogleIE._VALID_URL, url) is not None)
1582 def report_download_webpage(self, video_id):
1583 """Report webpage download."""
1584 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1586 def report_extraction(self, video_id):
1587 """Report information extraction."""
1588 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No session setup needed.
1590 def _real_initialize(self):
1593 def _real_extract(self, url):
1594 # Extract id from URL
1595 mobj = re.match(self._VALID_URL, url)
1597 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1600 # At this point we have a new video
1601 self._downloader.increment_downloads()
1602 video_id = mobj.group(1)
1604 video_extension = 'mp4'
1606 # Retrieve video webpage to extract further information
1607 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1609 self.report_download_webpage(video_id)
1610 webpage = urllib2.urlopen(request).read()
1611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1615 # Extract URL, uploader, and title from webpage
1616 self.report_extraction(video_id)
# Preferred: direct mp4 download_url embedded in the page's JS.
1617 mobj = re.search(r"download_url:'([^']+)'", webpage)
# Fallback: flv videoUrl with \x-escaped characters.
1619 video_extension = 'flv'
1620 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1622 self._downloader.trouble(u'ERROR: unable to extract media URL')
1624 mediaURL = urllib.unquote(mobj.group(1))
# Un-escape the JS hex escapes: \x3d is '=', \x26 is '&'.
1625 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1626 mediaURL = mediaURL.replace('\\x26', '\x26')
1628 video_url = mediaURL
1630 mobj = re.search(r'<title>(.*)</title>', webpage)
1632 self._downloader.trouble(u'ERROR: unable to extract title')
1634 video_title = mobj.group(1).decode('utf-8')
1635 video_title = sanitize_title(video_title)
1636 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1638 # Extract video description
1639 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1641 self._downloader.trouble(u'ERROR: unable to extract video description')
1643 video_description = mobj.group(1).decode('utf-8')
1644 if not video_description:
1645 video_description = 'No description available.'
# Thumbnail requires a second page fetch, so only done when requested.
1647 # Extract video thumbnail
1648 if self._downloader.params.get('forcethumbnail', False):
1649 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1651 webpage = urllib2.urlopen(request).read()
1652 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1653 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1655 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1657 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1659 video_thumbnail = mobj.group(1)
1660 else: # we need something to pass to process_info
1661 video_thumbnail = ''
1664 # Process video information
1665 self._downloader.process_info({
1666 'id': video_id.decode('utf-8'),
1667 'url': video_url.decode('utf-8'),
1669 'upload_date': u'NA',
1670 'title': video_title,
1671 'stitle': simple_title,
1672 'ext': video_extension.decode('utf-8'),
1676 except UnavailableVideoError:
1677 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for photobucket.com flv links: media URL from the
# rel="video_src" link tag, title and uploader from <title>.
# NOTE(review): embedded line numbers jump -- guard/`try:`/`return` lines
# are in the gaps. Kept verbatim.
1680 class PhotobucketIE(InfoExtractor):
1681 """Information extractor for photobucket.com."""
1683 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1685 def __init__(self, downloader=None):
1686 InfoExtractor.__init__(self, downloader)
1690 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1692 def report_download_webpage(self, video_id):
1693 """Report webpage download."""
1694 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1696 def report_extraction(self, video_id):
1697 """Report information extraction."""
1698 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No session setup needed.
1700 def _real_initialize(self):
1703 def _real_extract(self, url):
1704 # Extract id from URL
1705 mobj = re.match(self._VALID_URL, url)
1707 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1710 # At this point we have a new video
1711 self._downloader.increment_downloads()
# video_id is the 'current=...flv' filename captured by _VALID_URL.
1712 video_id = mobj.group(1)
1714 video_extension = 'flv'
1716 # Retrieve video webpage to extract further information
1717 request = urllib2.Request(url)
1719 self.report_download_webpage(video_id)
1720 webpage = urllib2.urlopen(request).read()
1721 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1722 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1725 # Extract URL, uploader, and title from webpage
1726 self.report_extraction(video_id)
1727 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1729 self._downloader.trouble(u'ERROR: unable to extract media URL')
1731 mediaURL = urllib.unquote(mobj.group(1))
1733 video_url = mediaURL
# Title and uploader both come from the page <title>.
1735 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1737 self._downloader.trouble(u'ERROR: unable to extract title')
1739 video_title = mobj.group(1).decode('utf-8')
1740 video_title = sanitize_title(video_title)
1741 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1743 video_uploader = mobj.group(2).decode('utf-8')
1746 # Process video information
1747 self._downloader.process_info({
1748 'id': video_id.decode('utf-8'),
1749 'url': video_url.decode('utf-8'),
1750 'uploader': video_uploader,
1751 'upload_date': u'NA',
1752 'title': video_title,
1753 'stitle': simple_title,
1754 'ext': video_extension.decode('utf-8'),
1758 except UnavailableVideoError:
1759 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for video.yahoo.com: rewrites non-/watch/ URLs to the
# canonical form, scrapes metadata, then fetches the playlist XML for
# the real media URL.
# NOTE(review): embedded line numbers jump -- guard/`try:`/`return` lines
# are in the gaps. Kept verbatim.
1762 class YahooIE(InfoExtractor):
1763 """Information extractor for video.yahoo.com."""
1765 # _VALID_URL matches all Yahoo! Video URLs
1766 # _VPAGE_URL matches only the extractable '/watch/' URLs
1767 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1768 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1770 def __init__(self, downloader=None):
1771 InfoExtractor.__init__(self, downloader)
1775 return (re.match(YahooIE._VALID_URL, url) is not None)
1777 def report_download_webpage(self, video_id):
1778 """Report webpage download."""
1779 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1781 def report_extraction(self, video_id):
1782 """Report information extraction."""
1783 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No session setup needed.
1785 def _real_initialize(self):
# new_video=False on the recursive call avoids double-counting the video.
1788 def _real_extract(self, url, new_video=True):
1789 # Extract ID from URL
1790 mobj = re.match(self._VALID_URL, url)
1792 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1795 # At this point we have a new video
1796 self._downloader.increment_downloads()
1797 video_id = mobj.group(2)
1798 video_extension = 'flv'
1800 # Rewrite valid but non-extractable URLs as
1801 # extractable English language /watch/ URLs
1802 if re.match(self._VPAGE_URL, url) is None:
1803 request = urllib2.Request(url)
1805 webpage = urllib2.urlopen(request).read()
1806 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1807 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1810 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1812 self._downloader.trouble(u'ERROR: Unable to extract id field')
1814 yahoo_id = mobj.group(1)
1816 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1818 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1820 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL.
1822 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1823 return self._real_extract(url, new_video=False)
1825 # Retrieve video webpage to extract further information
1826 request = urllib2.Request(url)
1828 self.report_download_webpage(video_id)
1829 webpage = urllib2.urlopen(request).read()
1830 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1831 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1834 # Extract uploader and title from webpage
1835 self.report_extraction(video_id)
1836 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1838 self._downloader.trouble(u'ERROR: unable to extract video title')
1840 video_title = mobj.group(1).decode('utf-8')
1841 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1843 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1845 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): BUG -- group(1) of the regex above captures the literal
# 'people' or 'profile' path segment; the uploader name is in group(2).
# As written, video_uploader is always u'people'/u'profile'.
1847 video_uploader = mobj.group(1).decode('utf-8')
1849 # Extract video thumbnail
1850 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1852 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1854 video_thumbnail = mobj.group(1).decode('utf-8')
1856 # Extract video description
1857 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1859 self._downloader.trouble(u'ERROR: unable to extract video description')
1861 video_description = mobj.group(1).decode('utf-8')
1862 if not video_description:
1863 video_description = 'No description available.'
1865 # Extract video height and width
1866 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1868 self._downloader.trouble(u'ERROR: unable to extract video height')
1870 yv_video_height = mobj.group(1)
1872 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1874 self._downloader.trouble(u'ERROR: unable to extract video width')
1876 yv_video_width = mobj.group(1)
1878 # Retrieve video playlist to extract media URL
1879 # I'm not completely sure what all these options are, but we
1880 # seem to need most of them, otherwise the server sends a 401.
1881 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1882 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1883 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1884 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1885 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1887 self.report_download_webpage(video_id)
1888 webpage = urllib2.urlopen(request).read()
1889 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1890 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1893 # Extract media URL from playlist XML
1894 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1896 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1898 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Resolve HTML entities (&amp; etc.) left in the XML attribute values.
1899 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1902 # Process video information
1903 self._downloader.process_info({
1904 'id': video_id.decode('utf-8'),
1906 'uploader': video_uploader,
1907 'upload_date': u'NA',
1908 'title': video_title,
1909 'stitle': simple_title,
1910 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later (undecoded) entries win in Python. Redundant keys
# should be removed in a behavioral fix.
1911 'thumbnail': video_thumbnail.decode('utf-8'),
1912 'description': video_description,
1913 'thumbnail': video_thumbnail,
1914 'description': video_description,
1917 except UnavailableVideoError:
1918 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for vimeo.com: reads the moogaloop XML config for a clip and
# builds the signed play URL from the request signature + expiry.
# NOTE(review): embedded line numbers jump -- guard/`try:`/`return` lines
# are in the gaps. Kept verbatim.
1921 class VimeoIE(InfoExtractor):
1922 """Information extractor for vimeo.com."""
1924 # _VALID_URL matches Vimeo URLs
1925 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1927 def __init__(self, downloader=None):
1928 InfoExtractor.__init__(self, downloader)
1932 return (re.match(VimeoIE._VALID_URL, url) is not None)
1934 def report_download_webpage(self, video_id):
1935 """Report webpage download."""
1936 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1938 def report_extraction(self, video_id):
1939 """Report information extraction."""
1940 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# No session setup needed.
1942 def _real_initialize(self):
1945 def _real_extract(self, url, new_video=True):
1946 # Extract ID from URL
1947 mobj = re.match(self._VALID_URL, url)
1949 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1952 # At this point we have a new video
1953 self._downloader.increment_downloads()
1954 video_id = mobj.group(1)
1956 # Retrieve video webpage to extract further information
# std_headers supplies the browser-like User-Agent defined at file top.
1957 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1959 self.report_download_webpage(video_id)
1960 webpage = urllib2.urlopen(request).read()
1961 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1962 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1965 # Now we begin extracting as much information as we can from what we
1966 # retrieved. First we extract the information common to all extractors,
1967 # and latter we extract those that are Vimeo specific.
1968 self.report_extraction(video_id)
# Extract title
1971 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1973 self._downloader.trouble(u'ERROR: unable to extract video title')
1975 video_title = mobj.group(1).decode('utf-8')
1976 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# Extract uploader
1979 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1981 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1983 video_uploader = mobj.group(1).decode('utf-8')
1985 # Extract video thumbnail
1986 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1988 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1990 video_thumbnail = mobj.group(1).decode('utf-8')
1992 # # Extract video description
1993 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1995 # self._downloader.trouble(u'ERROR: unable to extract video description')
1997 # video_description = mobj.group(1).decode('utf-8')
1998 # if not video_description: video_description = 'No description available.'
# NOTE(review): description extraction is commented out above and replaced
# with a placeholder literal -- every Vimeo video gets description 'Foo.'.
1999 video_description = 'Foo.'
2001 # Vimeo specific: extract request signature
2002 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2004 self._downloader.trouble(u'ERROR: unable to extract request signature')
2006 sig = mobj.group(1).decode('utf-8')
2008 # Vimeo specific: Extract request signature expiration
2009 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2011 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2013 sig_exp = mobj.group(1).decode('utf-8')
2015 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2018 # Process video information
2019 self._downloader.process_info({
2020 'id': video_id.decode('utf-8'),
2022 'uploader': video_uploader,
2023 'upload_date': u'NA',
2024 'title': video_title,
2025 'stitle': simple_title,
# NOTE(review): duplicate 'thumbnail'/'description' keys below; the later
# entries win, so the undecoded thumbnail string is what gets used.
2027 'thumbnail': video_thumbnail.decode('utf-8'),
2028 'description': video_description,
2029 'thumbnail': video_thumbnail,
2030 'description': video_description,
2033 except UnavailableVideoError:
2034 self._downloader.trouble(u'ERROR: unable to download video')
# Last-resort extractor: downloads an arbitrary page and looks for a JW
# Player / SWFObject 'file=' flashvar pointing at a direct media URL.
# NOTE(review): embedded line numbers jump -- guard/`try:`/`return` lines
# are in the gaps. Kept verbatim.
2037 class GenericIE(InfoExtractor):
2038 """Generic last-resort information extractor."""
2040 def __init__(self, downloader=None):
2041 InfoExtractor.__init__(self, downloader)
2047 def report_download_webpage(self, video_id):
2048 """Report webpage download."""
# Warn loudly: generic extraction is unreliable by nature.
2049 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2050 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2052 def report_extraction(self, video_id):
2053 """Report information extraction."""
2054 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# No session setup needed.
2056 def _real_initialize(self):
2059 def _real_extract(self, url):
2060 # At this point we have a new video
2061 self._downloader.increment_downloads()
# Provisional id: last path component; replaced after the media URL is found.
2063 video_id = url.split('/')[-1]
2064 request = urllib2.Request(url)
2066 self.report_download_webpage(video_id)
2067 webpage = urllib2.urlopen(request).read()
2068 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2069 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2071 except ValueError, err:
2072 # since this is the last-resort InfoExtractor, if
2073 # this error is thrown, it'll be thrown here
2074 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2077 self.report_extraction(video_id)
2078 # Start with something easy: JW Player in SWFObject
2079 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2081 # Broaden the search a little bit
2082 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2084 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2087 # It's possible that one of the regexes
2088 # matched, but returned an empty group:
2089 if mobj.group(1) is None:
2090 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2093 video_url = urllib.unquote(mobj.group(1))
2094 video_id = os.path.basename(video_url)
2096 # here's a fun little line of code for you:
2097 video_extension = os.path.splitext(video_id)[1][1:]
2098 video_id = os.path.splitext(video_id)[0]
2100 # it's tempting to parse this further, but you would
2101 # have to take into account all the variations like
2102 # Video Title - Site Name
2103 # Site Name | Video Title
2104 # Video Title - Tagline | Site Name
2105 # and so on and so forth; it's just not practical
2106 mobj = re.search(r'<title>(.*)</title>', webpage)
2108 self._downloader.trouble(u'ERROR: unable to extract title')
2110 video_title = mobj.group(1).decode('utf-8')
2111 video_title = sanitize_title(video_title)
2112 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2114 # video uploader is domain name
2115 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): copy-pasted error text -- this failure is about the
# uploader (domain name), not the title. Runtime string, left untouched.
2117 self._downloader.trouble(u'ERROR: unable to extract title')
2119 video_uploader = mobj.group(1).decode('utf-8')
2122 # Process video information
2123 self._downloader.process_info({
2124 'id': video_id.decode('utf-8'),
2125 'url': video_url.decode('utf-8'),
2126 'uploader': video_uploader,
2127 'upload_date': u'NA',
2128 'title': video_title,
2129 'stitle': simple_title,
2130 'ext': video_extension.decode('utf-8'),
2134 except UnavailableVideoError, err:
2135 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles "ytsearch:Q" (first result), "ytsearchN:Q" (first N results,
# capped at _max_youtube_results) and "ytsearchall:Q" queries by scraping
# YouTube result pages and delegating each hit to the wrapped YoutubeIE.
2138 class YoutubeSearchIE(InfoExtractor):
2139 """Information Extractor for YouTube search queries."""
2140 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2141 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2142 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
# Presence of a ">Next</a>" link marks that more result pages exist.
2143 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2145 _max_youtube_results = 1000
2147 def __init__(self, youtube_ie, downloader=None):
2148 InfoExtractor.__init__(self, downloader)
# Actual extraction of each found video is delegated to this YoutubeIE.
2149 self._youtube_ie = youtube_ie
2153 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2155 def report_download_page(self, query, pagenum):
2156 """Report attempt to download playlist page with given number."""
2157 query = query.decode(preferredencoding())
2158 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2160 def _real_initialize(self):
2161 self._youtube_ie.initialize()
2163 def _real_extract(self, query):
2164 mobj = re.match(self._VALID_QUERY, query)
2166 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text proper.
2169 prefix, query = query.split(':')
2171 query = query.encode('utf-8')
# Bare "ytsearch:" downloads a single (best-matching) result.
2173 self._download_n_results(query, 1)
2175 elif prefix == 'all':
2176 self._download_n_results(query, self._max_youtube_results)
2182 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2184 elif n > self._max_youtube_results:
2185 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2186 n = self._max_youtube_results
2187 self._download_n_results(query, n)
2189 except ValueError: # parsing prefix as integer fails
2190 self._download_n_results(query, 1)
2193 def _download_n_results(self, query, n):
2194 """Downloads a specified number of results for a query"""
# Deduplicate ids: the same video can appear on several result pages.
2197 already_seen = set()
2201 self.report_download_page(query, pagenum)
2202 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2203 request = urllib2.Request(result_url)
2205 page = urllib2.urlopen(request).read()
2206 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2207 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2210 # Extract video identifiers
2211 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href, split on '=', take the id and drop the closing quote.
2212 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2213 if video_id not in already_seen:
2214 video_ids.append(video_id)
2215 already_seen.add(video_id)
2216 if len(video_ids) == n:
2217 # Specified n videos reached
2218 for id in video_ids:
2219 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: last page reached; extract whatever was collected.
2222 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2223 for id in video_ids:
2224 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2227 pagenum = pagenum + 1
# "gvsearch[N|all]:Q" handler for Google Video; structurally a near-copy of
# YoutubeSearchIE, delegating each found docid to the wrapped GoogleIE.
2230 class GoogleSearchIE(InfoExtractor):
2231 """Information Extractor for Google Video search queries."""
2232 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
# %s slots: quoted query, then page/start value — presumably a result
# offset; exact paging semantics not visible here.
2233 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2234 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2235 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2237 _max_google_results = 1000
2239 def __init__(self, google_ie, downloader=None):
2240 InfoExtractor.__init__(self, downloader)
2241 self._google_ie = google_ie
2245 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2247 def report_download_page(self, query, pagenum):
2248 """Report attempt to download playlist page with given number."""
2249 query = query.decode(preferredencoding())
2250 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2252 def _real_initialize(self):
2253 self._google_ie.initialize()
2255 def _real_extract(self, query):
2256 mobj = re.match(self._VALID_QUERY, query)
2258 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2261 prefix, query = query.split(':')
2263 query = query.encode('utf-8')
# Bare "gvsearch:" downloads a single result.
2265 self._download_n_results(query, 1)
2267 elif prefix == 'all':
2268 self._download_n_results(query, self._max_google_results)
2274 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2276 elif n > self._max_google_results:
2277 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2278 n = self._max_google_results
2279 self._download_n_results(query, n)
2281 except ValueError: # parsing prefix as integer fails
2282 self._download_n_results(query, 1)
2285 def _download_n_results(self, query, n):
2286 """Downloads a specified number of results for a query"""
# Track seen docids so duplicates across pages are not downloaded twice.
2289 already_seen = set()
2293 self.report_download_page(query, pagenum)
2294 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2295 request = urllib2.Request(result_url)
2297 page = urllib2.urlopen(request).read()
2298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2299 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2302 # Extract video identifiers
2303 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2304 video_id = mobj.group(1)
2305 if video_id not in already_seen:
2306 video_ids.append(video_id)
2307 already_seen.add(video_id)
2308 if len(video_ids) == n:
2309 # Specified n videos reached
2310 for id in video_ids:
2311 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" span: last result page; flush what was collected.
2314 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2315 for id in video_ids:
2316 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2319 pagenum = pagenum + 1
# "yvsearch[N|all]:Q" handler for Yahoo! Video; same shape as the YouTube
# and Google search IEs, delegating each hit to the wrapped YahooIE.
2322 class YahooSearchIE(InfoExtractor):
2323 """Information Extractor for Yahoo! Video search queries."""
2324 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
# %s slots: quoted query, then the page/offset value.
2325 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2326 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2327 _MORE_PAGES_INDICATOR = r'\s*Next'
2329 _max_yahoo_results = 1000
2331 def __init__(self, yahoo_ie, downloader=None):
2332 InfoExtractor.__init__(self, downloader)
2333 self._yahoo_ie = yahoo_ie
2337 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2339 def report_download_page(self, query, pagenum):
2340 """Report attempt to download playlist page with given number."""
2341 query = query.decode(preferredencoding())
2342 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2344 def _real_initialize(self):
2345 self._yahoo_ie.initialize()
2347 def _real_extract(self, query):
2348 mobj = re.match(self._VALID_QUERY, query)
2350 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2353 prefix, query = query.split(':')
2355 query = query.encode('utf-8')
# Bare "yvsearch:" downloads a single result.
2357 self._download_n_results(query, 1)
2359 elif prefix == 'all':
2360 self._download_n_results(query, self._max_yahoo_results)
2366 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2368 elif n > self._max_yahoo_results:
2369 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2370 n = self._max_yahoo_results
2371 self._download_n_results(query, n)
2373 except ValueError: # parsing prefix as integer fails
2374 self._download_n_results(query, 1)
2377 def _download_n_results(self, query, n):
2378 """Downloads a specified number of results for a query"""
# Dedup set: Yahoo ids are "NNN/NNN" path pairs from the watch URLs.
2381 already_seen = set()
2385 self.report_download_page(query, pagenum)
2386 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2387 request = urllib2.Request(result_url)
2389 page = urllib2.urlopen(request).read()
2390 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2391 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2394 # Extract video identifiers
2395 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2396 video_id = mobj.group(1)
2397 if video_id not in already_seen:
2398 video_ids.append(video_id)
2399 already_seen.add(video_id)
2400 if len(video_ids) == n:
2401 # Specified n videos reached
2402 for id in video_ids:
2403 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker: last page; flush the collected ids.
2406 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2407 for id in video_ids:
2408 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2411 pagenum = pagenum + 1
# Walks a YouTube playlist/artist/user-page listing page by page, collects
# the video ids, then feeds each watch URL to the wrapped YoutubeIE.
2414 class YoutubePlaylistIE(InfoExtractor):
2415 """Information Extractor for YouTube playlists."""
# group(1): list-type prefix (p/a/list), group(2): playlist id,
# group(3): optional single-video id embedded in the URL.
2417 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2418 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2419 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2420 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2423 def __init__(self, youtube_ie, downloader=None):
2424 InfoExtractor.__init__(self, downloader)
2425 self._youtube_ie = youtube_ie
2429 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2431 def report_download_page(self, playlist_id, pagenum):
2432 """Report attempt to download playlist page with given number."""
2433 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2435 def _real_initialize(self):
2436 self._youtube_ie.initialize()
2438 def _real_extract(self, url):
2439 # Extract playlist id
2440 mobj = re.match(self._VALID_URL, url)
2442 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL points at a single video inside the playlist: extract just that one.
2446 if mobj.group(3) is not None:
2447 self._youtube_ie.extract(mobj.group(3))
2450 # Download playlist pages
2451 # prefix is 'p' as default for playlists but there are other types that need extra care
2452 playlist_prefix = mobj.group(1)
2453 if playlist_prefix == 'a':
2454 playlist_access = 'artist'
2456 playlist_prefix = 'p'
2457 playlist_access = 'view_play_list'
2458 playlist_id = mobj.group(2)
2463 self.report_download_page(playlist_id, pagenum)
2464 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2466 page = urllib2.urlopen(request).read()
2467 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2471 # Extract video identifiers
2473 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2474 if mobj.group(1) not in ids_in_page:
2475 ids_in_page.append(mobj.group(1))
2476 video_ids.extend(ids_in_page)
# No "Next" link means the final playlist page has been reached.
2478 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2480 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end (1-based options -> 0-based slice).
2482 playliststart = self._downloader.params.get('playliststart', 1) - 1
2483 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend of -1 this slice drops the
# LAST video (ids[start:-1]); YoutubeUserIE below special-cases -1 but
# this class does not — looks like a bug, confirm intended behavior.
2484 video_ids = video_ids[playliststart:playlistend]
2486 for id in video_ids:
2487 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Downloads all uploads of a YouTube user via the GData API (paged in
# _GDATA_PAGE_SIZE chunks) and delegates each video to the wrapped YoutubeIE.
2491 class YoutubeUserIE(InfoExtractor):
2492 """Information Extractor for YouTube users."""
# Accepts both a profile URL and the "ytuser:NAME" shorthand.
2494 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2495 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2496 _GDATA_PAGE_SIZE = 50
# GData start-index is 1-based.
2497 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2498 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2501 def __init__(self, youtube_ie, downloader=None):
2502 InfoExtractor.__init__(self, downloader)
2503 self._youtube_ie = youtube_ie
2507 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2509 def report_download_page(self, username, start_index):
2510 """Report attempt to download user page."""
2511 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2512 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2514 def _real_initialize(self):
2515 self._youtube_ie.initialize()
2517 def _real_extract(self, url):
2519 mobj = re.match(self._VALID_URL, url)
2521 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2524 username = mobj.group(1)
2526 # Download video ids using YouTube Data API. Result size per
2527 # query is limited (currently to 50 videos) so we need to query
2528 # page by page until there are no video ids - it means we got
2535 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2536 self.report_download_page(username, start_index)
2538 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2541 page = urllib2.urlopen(request).read()
2542 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2543 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2546 # Extract video identifiers
2549 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2550 if mobj.group(1) not in ids_in_page:
2551 ids_in_page.append(mobj.group(1))
2553 video_ids.extend(ids_in_page)
2555 # A little optimization - if current page is not
2556 # "full", ie. does not contain PAGE_SIZE video ids then
2557 # we can assume that this page is the last one - there
2558 # are no more ids on further pages - no need to query
2561 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2566 all_ids_count = len(video_ids)
# --playlist-start/--playlist-end handling; unlike YoutubePlaylistIE,
# the default -1 end is special-cased so the last video is kept.
2567 playliststart = self._downloader.params.get('playliststart', 1) - 1
2568 playlistend = self._downloader.params.get('playlistend', -1)
2570 if playlistend == -1:
2571 video_ids = video_ids[playliststart:]
2573 video_ids = video_ids[playliststart:playlistend]
2575 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2576 (username, all_ids_count, len(video_ids)))
2578 for video_id in video_ids:
2579 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extractor for depositfiles.com file pages: simulates pressing the
# "Free download" button and scrapes the resulting fileshare URL.
2582 class DepositFilesIE(InfoExtractor):
2583 """Information extractor for depositfiles.com"""
# The "(?#locale)" inline comment documents the optional 2-char locale path.
2585 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2587 def __init__(self, downloader=None):
2588 InfoExtractor.__init__(self, downloader)
2592 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2594 def report_download_webpage(self, file_id):
2595 """Report webpage download."""
2596 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2598 def report_extraction(self, file_id):
2599 """Report information extraction."""
2600 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2602 def _real_initialize(self):
2605 def _real_extract(self, url):
2606 # At this point we have a new file
2607 self._downloader.increment_downloads()
2609 file_id = url.split('/')[-1]
2610 # Rebuild url in english locale
2611 url = 'http://depositfiles.com/en/files/' + file_id
2613 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the site's free-download form submit.
2614 free_download_indication = { 'gateway_result' : '1' }
2615 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2617 self.report_download_webpage(file_id)
2618 webpage = urllib2.urlopen(request).read()
2619 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2620 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2623 # Search for the real file URL
2624 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2625 if (mobj is None) or (mobj.group(1) is None):
2626 # Try to figure out reason of the error.
# The site embeds a human-readable restriction notice in a <strong> tag.
2627 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2628 if (mobj is not None) and (mobj.group(1) is not None):
2629 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2630 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2632 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2635 file_url = mobj.group(1)
2636 file_extension = os.path.splitext(file_url)[1][1:]
2638 # Search for file title
2639 mobj = re.search(r'<b title="(.*?)">', webpage)
2641 self._downloader.trouble(u'ERROR: unable to extract title')
2643 file_title = mobj.group(1).decode('utf-8')
2646 # Process file information
2647 self._downloader.process_info({
2648 'id': file_id.decode('utf-8'),
2649 'url': file_url.decode('utf-8'),
2651 'upload_date': u'NA',
2652 'title': file_title,
2653 'stitle': file_title,
2654 'ext': file_extension.decode('utf-8'),
2658 except UnavailableVideoError, err:
2659 self._downloader.trouble(u'ERROR: unable to download file')
# Facebook video extractor: optionally logs in (CLI credentials or .netrc),
# downloads the video page and scrapes metadata plus per-quality URLs out
# of the page's inline JavaScript.
2662 class FacebookIE(InfoExtractor):
2663 """Information Extractor for Facebook"""
2665 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2666 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2667 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this ordering.
2668 _available_formats = ['highqual', 'lowqual']
2669 _video_extensions = {
2674 def __init__(self, downloader=None):
2675 InfoExtractor.__init__(self, downloader)
2679 return (re.match(FacebookIE._VALID_URL, url) is not None)
2681 def _reporter(self, message):
2682 """Add header and report message."""
2683 self._downloader.to_screen(u'[facebook] %s' % message)
2685 def report_login(self):
2686 """Report attempt to log in."""
2687 self._reporter(u'Logging in')
2689 def report_video_webpage_download(self, video_id):
2690 """Report attempt to download video webpage."""
2691 self._reporter(u'%s: Downloading video webpage' % video_id)
2693 def report_information_extraction(self, video_id):
2694 """Report attempt to extract video information."""
2695 self._reporter(u'%s: Extracting video information' % video_id)
2697 def _parse_page(self, video_webpage):
2698 """Extract video information from page"""
# Map of info-dict key -> scraping regex; only keys whose regex matches
# end up in the returned dict, so callers must treat every key as optional.
2700 data = {'title': r'class="video_title datawrap">(.*?)</',
2701 'description': r'<div class="datawrap">(.*?)</div>',
2702 'owner': r'\("video_owner_name", "(.*?)"\)',
2703 'upload_date': r'data-date="(.*?)"',
2704 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2707 for piece in data.keys():
2708 mobj = re.search(data[piece], video_webpage)
2709 if mobj is not None:
# Values arrive JS-escaped (\uXXXX) and percent-encoded; undo both.
2710 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2714 for fmt in self._available_formats:
2715 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2716 if mobj is not None:
2717 # URL is in a Javascript segment inside an escaped Unicode format within
2718 # the generally utf-8 page
2719 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2720 video_info['video_urls'] = video_urls
2724 def _real_initialize(self):
2725 if self._downloader is None:
2730 downloader_params = self._downloader.params
2732 # Attempt to use provided username and password or .netrc data
2733 if downloader_params.get('username', None) is not None:
2734 useremail = downloader_params['username']
2735 password = downloader_params['password']
2736 elif downloader_params.get('usenetrc', False):
2738 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2739 if info is not None:
2743 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2744 except (IOError, netrc.NetrcParseError), err:
2745 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: proceed anonymously (login is optional).
2748 if useremail is None:
2757 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2760 login_results = urllib2.urlopen(request).read()
# If the response still contains the login form, authentication failed.
2761 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2762 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2764 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2765 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2768 def _real_extract(self, url):
2769 mobj = re.match(self._VALID_URL, url)
2771 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2773 video_id = mobj.group('ID')
2776 self.report_video_webpage_download(video_id)
2777 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2779 page = urllib2.urlopen(request)
2780 video_webpage = page.read()
2781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2782 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2785 # Start extracting information
2786 self.report_information_extraction(video_id)
2788 # Extract information
2789 video_info = self._parse_page(video_webpage)
# 'owner' and 'title' are mandatory; thumbnail/upload_date/description
# degrade gracefully to defaults below.
2792 if 'owner' not in video_info:
2793 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2795 video_uploader = video_info['owner']
2798 if 'title' not in video_info:
2799 self._downloader.trouble(u'ERROR: unable to extract video title')
2801 video_title = video_info['title']
2802 video_title = video_title.decode('utf-8')
2803 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse disallowed chars to '_' and trim.
2806 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2807 simple_title = simple_title.strip(ur'_')
2810 if 'thumbnail' not in video_info:
2811 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2812 video_thumbnail = ''
2814 video_thumbnail = video_info['thumbnail']
2818 if 'upload_date' in video_info:
2819 upload_time = video_info['upload_date']
# Page date is an RFC-2822-style string; normalize to YYYYMMDD.
2820 timetuple = email.utils.parsedate_tz(upload_time)
2821 if timetuple is not None:
2823 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2828 video_description = video_info.get('description', 'No description available.')
2830 url_map = video_info['video_urls']
2831 if len(url_map.keys()) > 0:
2832 # Decide which formats to download
2833 req_format = self._downloader.params.get('format', None)
2834 format_limit = self._downloader.params.get('format_limit', None)
# --max-quality caps the candidate list at the requested quality.
2836 if format_limit is not None and format_limit in self._available_formats:
2837 format_list = self._available_formats[self._available_formats.index(format_limit):]
2839 format_list = self._available_formats
2840 existing_formats = [x for x in format_list if x in url_map]
2841 if len(existing_formats) == 0:
2842 self._downloader.trouble(u'ERROR: no known formats available for video')
2844 if req_format is None:
2845 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2846 elif req_format == '-1':
2847 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2850 if req_format not in url_map:
2851 self._downloader.trouble(u'ERROR: requested format not available')
2853 video_url_list = [(req_format, url_map[req_format])] # Specific format
2855 for format_param, video_real_url in video_url_list:
2857 # At this point we have a new video
2858 self._downloader.increment_downloads()
2861 video_extension = self._video_extensions.get(format_param, 'mp4')
2864 # Process video information
2865 self._downloader.process_info({
2866 'id': video_id.decode('utf-8'),
2867 'url': video_real_url.decode('utf-8'),
2868 'uploader': video_uploader.decode('utf-8'),
2869 'upload_date': upload_date,
2870 'title': video_title,
2871 'stitle': simple_title,
2872 'ext': video_extension.decode('utf-8'),
# Old-style "cond and a or b" conditional; format_param is never None
# here in practice since it comes from video_url_list tuples.
2873 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2874 'thumbnail': video_thumbnail.decode('utf-8'),
2875 'description': video_description.decode('utf-8'),
2878 except UnavailableVideoError, err:
2879 self._downloader.trouble(u'\nERROR: unable to download video')
# blip.tv extractor: appends skin=json to the original URL and reads the
# whole metadata set from the site's JSON API instead of scraping HTML.
2881 class BlipTVIE(InfoExtractor):
2882 """Information extractor for blip.tv"""
2884 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2885 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2889 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2891 def report_extraction(self, file_id):
2892 """Report information extraction."""
2893 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2895 def _simplify_title(self, title):
# Same filesystem-safe-title transform used by the other extractors.
2896 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2897 res = res.strip(ur'_')
2900 def _real_extract(self, url):
2901 mobj = re.match(self._VALID_URL, url)
2903 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') depends on whether the URL already has a query string.
2910 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2911 request = urllib2.Request(json_url)
2912 self.report_extraction(mobj.group(1))
2914 json_code = urllib2.urlopen(request).read()
2915 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2916 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# json may be stdlib json or the bundled trivialjson fallback (see header).
2919 json_data = json.loads(json_code)
2920 if 'Post' in json_data:
2921 data = json_data['Post']
# blip.tv datestamps look like "MM-DD-YY HH:MM(am|pm)"; normalize to YYYYMMDD.
2925 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2926 video_url = data['media']['url']
2927 umobj = re.match(self._URL_EXT, video_url)
2929 raise ValueError('Can not determine filename extension')
2930 ext = umobj.group(1)
2932 self._downloader.increment_downloads()
2935 'id': data['item_id'],
2937 'uploader': data['display_name'],
2938 'upload_date': upload_date,
2939 'title': data['title'],
2940 'stitle': self._simplify_title(data['title']),
2942 'format': data['media']['mimeType'],
2943 'thumbnail': data['thumbnailUrl'],
2944 'description': data['description'],
2945 'player_url': data['embedUrl']
# KeyError from missing JSON fields and ValueError from strptime/ext
# detection are both reported as a parse failure.
2947 except (ValueError,KeyError), err:
2948 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2952 self._downloader.process_info(info)
2953 except UnavailableVideoError, err:
2954 self._downloader.trouble(u'\nERROR: unable to download video')
# Base class for post-download processing steps (see FFmpegExtractAudioPP
# below for a concrete subclass).
2957 class PostProcessor(object):
2958 """Post Processor class.
2960 PostProcessor objects can be added to downloaders with their
2961 add_post_processor() method. When the downloader has finished a
2962 successful download, it will take its internal chain of PostProcessors
2963 and start calling the run() method on each one of them, first with
2964 an initial argument and then with the returned value of the previous
2967 The chain will be stopped if one of them ever returns None or the end
2968 of the chain is reached.
2970 PostProcessor objects follow a "mutual registration" process similar
2971 to InfoExtractor objects.
2976 def __init__(self, downloader=None):
2977 self._downloader = downloader
2979 def set_downloader(self, downloader):
2980 """Sets the downloader for this PP."""
2981 self._downloader = downloader
2983 def run(self, information):
2984 """Run the PostProcessor.
2986 The "information" argument is a dictionary like the ones
2987 composed by InfoExtractors. The only difference is that this
2988 one has an extra field called "filepath" that points to the
2991 When this method returns None, the postprocessing chain is
2992 stopped. However, this method may return an information
2993 dictionary that will be passed to the next postprocessing
2994 object in the chain. It can be the one it received after
2995 changing some fields.
2997 In addition, this method may raise a PostProcessingError
2998 exception that will be taken into account by the downloader
# Base implementation is the identity: pass the info dict through unchanged.
3001 return information # by default, do nothing
# Post-processor that extracts the audio track of a downloaded video with
# ffmpeg/ffprobe, keeping the stream lossless when the source codec already
# matches the preference ('best' keeps aac/mp3 as-is).
3004 class FFmpegExtractAudioPP(PostProcessor):
3006 def __init__(self, downloader=None, preferredcodec=None):
3007 PostProcessor.__init__(self, downloader)
3008 if preferredcodec is None:
3009 preferredcodec = 'best'
3010 self._preferredcodec = preferredcodec
3013 def get_audio_codec(path):
# Probe the file with ffprobe; returns the audio codec name, or
# (presumably) None when ffprobe is missing or fails — the error-return
# lines are not visible here.
3015 cmd = ['ffprobe', '-show_streams', '--', path]
3016 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3017 output = handle.communicate()[0]
3018 if handle.wait() != 0:
3020 except (IOError, OSError):
# Scan ffprobe's stream dump: remember each codec_name= and accept it
# once the matching codec_type=audio line confirms it is the audio stream.
3023 for line in output.split('\n'):
3024 if line.startswith('codec_name='):
3025 audio_codec = line.split('=')[1].strip()
3026 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3031 def run_ffmpeg(path, out_path, codec, more_opts):
# Run the actual transcode; '--' guards against paths starting with '-'.
3033 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3034 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3036 except (IOError, OSError):
3039 def run(self, information):
3040 path = information['filepath']
3042 filecodec = self.get_audio_codec(path)
3043 if filecodec is None:
3044 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3048 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3049 if filecodec == 'aac' or filecodec == 'mp3':
3050 # Lossless if possible
3052 extension = filecodec
# aac streams need the ADTS container when written standalone.
3053 if filecodec == 'aac':
3054 more_opts = ['-f', 'adts']
3057 acodec = 'libmp3lame'
3059 more_opts = ['-ab', '128k']
3061 # We convert the audio (lossy)
3062 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3063 extension = self._preferredcodec
3064 more_opts = ['-ab', '128k']
3065 if self._preferredcodec == 'aac':
3066 more_opts += ['-f', 'adts']
# Output file: same path with the audio extension substituted.
3068 (prefix, ext) = os.path.splitext(path)
3069 new_path = prefix + '.' + extension
3070 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3071 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3074 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Removal of the original video file is best-effort only.
3079 except (IOError, OSError):
3080 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the new audio file for the rest of the PP chain.
3083 information['filepath'] = new_path
3087 def updateSelf(downloader, filename):
3088 ''' Update the program file with the latest version from the repository '''
3089 # Note: downloader only used for options
# Fail early if the script file itself cannot be rewritten.
3090 if not os.access(filename, os.W_OK):
3091 sys.exit('ERROR: no write permissions on %s' % filename)
3093 downloader.to_screen('Updating to latest version...')
# Fetch the replacement script from the fixed UPDATE_URL (see file header).
3097 urlh = urllib.urlopen(UPDATE_URL)
3098 newcontent = urlh.read()
3101 except (IOError, OSError), err:
3102 sys.exit('ERROR: unable to download latest version')
# Overwrite this very script in place ('wb' to avoid newline translation).
3105 outf = open(filename, 'wb')
3107 outf.write(newcontent)
3110 except (IOError, OSError), err:
3111 sys.exit('ERROR: unable to overwrite current version')
3113 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Nested optparse help formatter: renders an option as "-o, --option METAVAR".
3120 def _format_option_string(option):
3121 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Prefer the first short opt, then the first long opt, comma-separated.
3125 if option._short_opts: opts.append(option._short_opts[0])
3126 if option._long_opts: opts.append(option._long_opts[0])
3127 if len(opts) > 1: opts.insert(1, ', ')
3129 if option.takes_value(): opts.append(' %s' % option.metavar)
3131 return "".join(opts)
3133 def _find_term_columns():
3134 columns = os.environ.get('COLUMNS', None)
3139 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3140 out,err = sp.communicate()
3141 return int(out.split()[1])
3147 max_help_position = 80
3149 # No need to wrap help messages if we're on a wide console
3150 columns = _find_term_columns()
3151 if columns: max_width = columns
3153 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3154 fmt.format_option_strings = _format_option_string
3157 'version' : __version__,
3159 'usage' : '%prog [options] url...',
3160 'conflict_handler' : 'resolve',
3163 parser = optparse.OptionParser(**kw)
3166 general = optparse.OptionGroup(parser, 'General Options')
3167 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3168 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3169 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3170 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3171 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3173 general.add_option('-h', '--help',
3174 action='help', help='print this help text and exit')
3175 general.add_option('-v', '--version',
3176 action='version', help='print program version and exit')
3177 general.add_option('-U', '--update',
3178 action='store_true', dest='update_self', help='update this program to latest version')
3179 general.add_option('-i', '--ignore-errors',
3180 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3181 general.add_option('-r', '--rate-limit',
3182 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3183 general.add_option('-R', '--retries',
3184 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3185 general.add_option('--playlist-start',
3186 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3187 general.add_option('--playlist-end',
3188 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3189 general.add_option('--dump-user-agent',
3190 action='store_true', dest='dump_user_agent',
3191 help='display the current browser identification', default=False)
3193 authentication.add_option('-u', '--username',
3194 dest='username', metavar='USERNAME', help='account username')
3195 authentication.add_option('-p', '--password',
3196 dest='password', metavar='PASSWORD', help='account password')
3197 authentication.add_option('-n', '--netrc',
3198 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3201 video_format.add_option('-f', '--format',
3202 action='store', dest='format', metavar='FORMAT', help='video format code')
3203 video_format.add_option('--all-formats',
3204 action='store_const', dest='format', help='download all available video formats', const='-1')
3205 video_format.add_option('--max-quality',
3206 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3209 verbosity.add_option('-q', '--quiet',
3210 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3211 verbosity.add_option('-s', '--simulate',
3212 action='store_true', dest='simulate', help='do not download video', default=False)
3213 verbosity.add_option('-g', '--get-url',
3214 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3215 verbosity.add_option('-e', '--get-title',
3216 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3217 verbosity.add_option('--get-thumbnail',
3218 action='store_true', dest='getthumbnail',
3219 help='simulate, quiet but print thumbnail URL', default=False)
3220 verbosity.add_option('--get-description',
3221 action='store_true', dest='getdescription',
3222 help='simulate, quiet but print video description', default=False)
3223 verbosity.add_option('--get-filename',
3224 action='store_true', dest='getfilename',
3225 help='simulate, quiet but print output filename', default=False)
3226 verbosity.add_option('--no-progress',
3227 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3228 verbosity.add_option('--console-title',
3229 action='store_true', dest='consoletitle',
3230 help='display progress in console titlebar', default=False)
3233 filesystem.add_option('-t', '--title',
3234 action='store_true', dest='usetitle', help='use title in file name', default=False)
3235 filesystem.add_option('-l', '--literal',
3236 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3237 filesystem.add_option('-A', '--auto-number',
3238 action='store_true', dest='autonumber',
3239 help='number downloaded files starting from 00000', default=False)
3240 filesystem.add_option('-o', '--output',
3241 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3242 filesystem.add_option('-a', '--batch-file',
3243 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3244 filesystem.add_option('-w', '--no-overwrites',
3245 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3246 filesystem.add_option('-c', '--continue',
3247 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3248 filesystem.add_option('--cookies',
3249 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3250 filesystem.add_option('--no-part',
3251 action='store_true', dest='nopart', help='do not use .part files', default=False)
3252 filesystem.add_option('--no-mtime',
3253 action='store_false', dest='updatetime',
3254 help='do not use the Last-modified header to set the file modification time', default=True)
3255 filesystem.add_option('--write-description',
3256 action='store_true', dest='writedescription',
3257 help='write video description to a .description file', default=False)
3258 filesystem.add_option('--write-info-json',
3259 action='store_true', dest='writeinfojson',
3260 help='write video metadata to a .info.json file', default=False)
3263 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3264 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3265 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3266 help='"best", "aac" or "mp3"; best by default')
3269 parser.add_option_group(general)
3270 parser.add_option_group(filesystem)
3271 parser.add_option_group(verbosity)
3272 parser.add_option_group(video_format)
3273 parser.add_option_group(authentication)
3274 parser.add_option_group(postproc)
3276 opts, args = parser.parse_args()
3278 return parser, opts, args
# Body of main() up to URL collection (the def line lies outside this
# excerpt). Restored lines lost in the mangled listing: the else branch
# around MozillaCookieJar, jar.load(), sys.exit(0) after the user-agent
# dump, batchurls = [], and the batch-file try/except scaffolding.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
else:
	try:
		jar = cookielib.MozillaCookieJar(opts.cookiefile)
		# Only load when the file already exists and is readable;
		# otherwise start empty and create it on save.
		if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
			jar.load()
	except (IOError, OSError):
		sys.exit(u'ERROR: unable to open cookie file')

# Dump user agent
if opts.dump_user_agent:
	print(std_headers['User-Agent'])
	sys.exit(0)

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# Batch file verification
batchurls = []
if opts.batchfile is not None:
	try:
		if opts.batchfile == '-':
			batchfd = sys.stdin
		else:
			batchfd = open(opts.batchfile, 'r')
		batchurls = batchfd.readlines()
		batchurls = [x.strip() for x in batchurls]
		# Skip blank lines and comment lines (#, /, ; prefixes).
		batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
	except IOError:
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
# Conflicting, missing and erroneous options (inside main()).
# Restored the try: lines dropped by the mangled listing around the
# retries / playliststart / playlistend conversions.
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	# Prompt interactively rather than requiring the password on the
	# command line (where it would show up in the process list).
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
	try:
		opts.retries = long(opts.retries)
	except (TypeError, ValueError):
		parser.error(u'invalid retry count specified')
try:
	opts.playliststart = int(opts.playliststart)
	if opts.playliststart <= 0:
		raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError):
	parser.error(u'invalid playlist start number specified')
try:
	opts.playlistend = int(opts.playlistend)
	# -1 is the sentinel for "until the end of the playlist".
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
		raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError):
	parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
	if opts.audioformat not in ['best', 'aac', 'mp3']:
		parser.error(u'invalid audio format specified')
# Information extractors (inside main()). Order here is construction
# order only; lookup precedence is set by add_info_extractor below.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
bliptv_ie = BlipTVIE()
vimeo_ie = VimeoIE()
generic_ie = GenericIE()

# File downloader. Restored the closing }) dropped by the mangled listing.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	# Any of the "get-*" simulation flags implies quiet output.
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'forcefilename': opts.getfilename,
	# Any of the "get-*" flags also implies simulation (no download).
	'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'format': opts.format,
	'format_limit': opts.format_limit,
	# First matching template wins: explicit -o, then the flag-derived
	# defaults, finally plain "%(id)s.%(ext)s".
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	'logtostderr': opts.outtmpl == '-',
	'consoletitle': opts.consoletitle,
	'nopart': opts.nopart,
	'updatetime': opts.updatetime,
	'writedescription': opts.writedescription,
	'writeinfojson': opts.writeinfojson,
	})
# Register extractors in lookup order: most specific first (inside main()).
# Restored lines lost in the mangled listing: the else: sys.exit() branch,
# the try: around jar.save(), and the final sys.exit(retcode).
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)
fd.add_info_extractor(bliptv_ie)
fd.add_info_extractor(vimeo_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# PostProcessors
if opts.extractaudio:
	fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

# Update version
if opts.update_self:
	updateSelf(fd, sys.argv[0])

# Maybe do nothing
if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')
	else:
		# A bare "-U" run is legitimate: update and quit.
		sys.exit()

retcode = fd.download(all_urls)

# Dump cookie jar if requested
if opts.cookiefile is not None:
	try:
		jar.save()
	except (IOError, OSError):
		sys.exit(u'ERROR: unable to save cookie jar')

sys.exit(retcode)
if __name__ == '__main__':
	# Top-level boundary: translate the known failure modes into exit
	# codes/messages. Restored the try: main() and sys.exit(1) lines
	# dropped by the mangled listing.
	try:
		main()
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3469 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: