youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 __author__  = (
   5         'Ricardo Garcia Gonzalez',
   6         'Danny Colligan',
   7         'Benjamin Johnson',
   8         'Vasyl\' Vavrychuk',
   9         'Witold Baryluk',
  10         'Paweł Paprota',
  11         'Gergely Imreh',
  12         'Rogério Brito',
  13         )
  14
  15 __license__ = 'Public Domain'
  16 __version__ = '2011.08.28-phihag'
  17
  18 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
  19
  20 import cookielib
  21 import datetime
  22 import gzip
  23 import htmlentitydefs
  24 import httplib
  25 import locale
  26 import math
  27 import netrc
  28 import os
  29 import os.path
  30 import re
  31 import socket
  32 import string
  33 import subprocess
  34 import sys
  35 import time
  36 import urllib
  37 import urllib2
  38 import warnings
  39 import zlib
  40
  41 if os.name == 'nt':
  42         import ctypes
  43
  44 try:
  45         import email.utils
  46 except ImportError: # Python 2.4
  47         import email.Utils
  48 try:
  49         import cStringIO as StringIO
  50 except ImportError:
  51         import StringIO
  52
  53 # parse_qs was moved from the cgi module to the urlparse module recently.
  54 try:
  55         from urlparse import parse_qs
  56 except ImportError:
  57         from cgi import parse_qs
  58
  59 try:
  60         import lxml.etree
  61 except ImportError:
  62         pass # Handled below
  63
  64 std_headers = {
  65         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
  66         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  67         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  68         'Accept-Encoding': 'gzip, deflate',
  69         'Accept-Language': 'en-us,en;q=0.5',
  70 }
  71
  72 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  73
  74 try:
  75         import json
  76 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
  77         import re
  78         class json(object):
  79                 @staticmethod
  80                 def loads(s):
  81                         s = s.decode('UTF-8')
  82                         def raiseError(msg, i):
  83                                 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
  84                         def skipSpace(i, expectMore=True):
  85                                 while i < len(s) and s[i] in ' \t\r\n':
  86                                         i += 1
  87                                 if expectMore:
  88                                         if i >= len(s):
  89                                                 raiseError('Premature end', i)
  90                                 return i
  91                         def decodeEscape(match):
  92                                 esc = match.group(1)
  93                                 _STATIC = {
  94                                         '"': '"',
  95                                         '\\': '\\',
  96                                         '/': '/',
  97                                         'b': unichr(0x8),
  98                                         'f': unichr(0xc),
  99                                         'n': '\n',
 100                                         'r': '\r',
 101                                         't': '\t',
 102                                 }
 103                                 if esc in _STATIC:
 104                                         return _STATIC[esc]
 105                                 if esc[0] == 'u':
 106                                         if len(esc) == 1+4:
 107                                                 return unichr(int(esc[1:5], 16))
 108                                         if len(esc) == 5+6 and esc[5:7] == '\\u':
 109                                                 hi = int(esc[1:5], 16)
 110                                                 low = int(esc[7:11], 16)
 111                                                 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
 112                                 raise ValueError('Unknown escape ' + str(esc))
 113                         def parseString(i):
 114                                 i += 1
 115                                 e = i
 116                                 while True:
 117                                         e = s.index('"', e)
 118                                         bslashes = 0
 119                                         while s[e-bslashes-1] == '\\':
 120                                                 bslashes += 1
 121                                         if bslashes % 2 == 1:
 122                                                 e += 1
 123                                                 continue
 124                                         break
 125                                 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
 126                                 stri = rexp.sub(decodeEscape, s[i:e])
 127                                 return (e+1,stri)
 128                         def parseObj(i):
 129                                 i += 1
 130                                 res = {}
 131                                 i = skipSpace(i)
 132                                 if s[i] == '}': # Empty dictionary
 133                                         return (i+1,res)
 134                                 while True:
 135                                         if s[i] != '"':
 136                                                 raiseError('Expected a string object key', i)
 137                                         i,key = parseString(i)
 138                                         i = skipSpace(i)
 139                                         if i >= len(s) or s[i] != ':':
 140                                                 raiseError('Expected a colon', i)
 141                                         i,val = parse(i+1)
 142                                         res[key] = val
 143                                         i = skipSpace(i)
 144                                         if s[i] == '}':
 145                                                 return (i+1, res)
 146                                         if s[i] != ',':
 147                                                 raiseError('Expected comma or closing curly brace', i)
 148                                         i = skipSpace(i+1)
 149                         def parseArray(i):
 150                                 res = []
 151                                 i = skipSpace(i+1)
 152                                 if s[i] == ']': # Empty array
 153                                         return (i+1,res)
 154                                 while True:
 155                                         i,val = parse(i)
 156                                         res.append(val)
 157                                         i = skipSpace(i) # Raise exception if premature end
 158                                         if s[i] == ']':
 159                                                 return (i+1, res)
 160                                         if s[i] != ',':
 161                                                 raiseError('Expected a comma or closing bracket', i)
 162                                         i = skipSpace(i+1)
 163                         def parseDiscrete(i):
 164                                 for k,v in {'true': True, 'false': False, 'null': None}.items():
 165                                         if s.startswith(k, i):
 166                                                 return (i+len(k), v)
 167                                 raiseError('Not a boolean (or null)', i)
 168                         def parseNumber(i):
 169                                 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
 170                                 if mobj is None:
 171                                         raiseError('Not a number', i)
 172                                 nums = mobj.group(1)
 173                                 if '.' in nums or 'e' in nums or 'E' in nums:
 174                                         return (i+len(nums), float(nums))
 175                                 return (i+len(nums), int(nums))
 176                         CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
 177                         def parse(i):
 178                                 i = skipSpace(i)
 179                                 i,res = CHARMAP.get(s[i], parseNumber)(i)
 180                                 i = skipSpace(i, False)
 181                                 return (i,res)
 182                         i,res = parse(0)
 183                         if i < len(s):
 184                                 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 185                         return res
 186
 187 def preferredencoding():
 188         """Get preferred encoding.
 189
 190         Returns the best encoding scheme for the system, based on
 191         locale.getpreferredencoding() and some further tweaks.
 192         """
 193         def yield_preferredencoding():
 194                 try:
 195                         pref = locale.getpreferredencoding()
 196                         u'TEST'.encode(pref)
 197                 except:
 198                         pref = 'UTF-8'
 199                 while True:
 200                         yield pref
 201         return yield_preferredencoding().next()
 202
 203 def htmlentity_transform(matchobj):
 204         """Transforms an HTML entity to a Unicode character.
 205
 206         This function receives a match object and is intended to be used with
 207         the re.sub() function.
 208         """
 209         entity = matchobj.group(1)
 210
 211         # Known non-numeric HTML entity
 212         if entity in htmlentitydefs.name2codepoint:
 213                 return unichr(htmlentitydefs.name2codepoint[entity])
 214
 215         # Unicode character
 216         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 217         if mobj is not None:
 218                 numstr = mobj.group(1)
 219                 if numstr.startswith(u'x'):
 220                         base = 16
 221                         numstr = u'0%s' % numstr
 222                 else:
 223                         base = 10
 224                 return unichr(long(numstr, base))
 225
 226         # Unknown entity in name, return its literal representation
 227         return (u'&%s;' % entity)
 228
 229 def sanitize_title(utitle):
 230         """Sanitizes a video title so it could be used as part of a filename."""
 231         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 232         return utitle.replace(unicode(os.sep), u'%')
 233
 234 def sanitize_open(filename, open_mode):
 235         """Try to open the given filename, and slightly tweak it if this fails.
 236
 237         Attempts to open the given filename. If this fails, it tries to change
 238         the filename slightly, step by step, until it's either able to open it
 239         or it fails and raises a final exception, like the standard open()
 240         function.
 241
 242         It returns the tuple (stream, definitive_file_name).
 243         """
 244         try:
 245                 if filename == u'-':
 246                         if sys.platform == 'win32':
 247                                 import msvcrt
 248                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 249                         return (sys.stdout, filename)
 250                 stream = open(filename, open_mode)
 251                 return (stream, filename)
 252         except (IOError, OSError), err:
 253                 # In case of error, try to remove win32 forbidden chars
 254                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 255
 256                 # An exception here should be caught in the caller
 257                 stream = open(filename, open_mode)
 258                 return (stream, filename)
 259
 260 def timeconvert(timestr):
 261     """Convert RFC 2822 defined time string into system timestamp"""
 262     timestamp = None
 263     timetuple = email.utils.parsedate_tz(timestr)
 264     if timetuple is not None:
 265         timestamp = email.utils.mktime_tz(timetuple)
 266     return timestamp
 267
 268 class DownloadError(Exception):
 269         """Download Error exception.
 270
 271         This exception may be thrown by FileDownloader objects if they are not
 272         configured to continue on errors. They will contain the appropriate
 273         error message.
 274         """
 275         pass
 276
 277 class SameFileError(Exception):
 278         """Same File exception.
 279
 280         This exception will be thrown by FileDownloader objects if they detect
 281         multiple files would have to be downloaded to the same file on disk.
 282         """
 283         pass
 284
 285 class PostProcessingError(Exception):
 286         """Post Processing exception.
 287
 288         This exception may be raised by PostProcessor's .run() method to
 289         indicate an error in the postprocessing task.
 290         """
 291         pass
 292
 293 class UnavailableVideoError(Exception):
 294         """Unavailable Format exception.
 295
 296         This exception will be thrown when a video is requested
 297         in a format that is not available for that video.
 298         """
 299         pass
 300
 301 class ContentTooShortError(Exception):
 302         """Content Too Short exception.
 303
 304         This exception may be raised by FileDownloader objects when a file they
 305         download is too small for what the server announced first, indicating
 306         the connection was probably interrupted.
 307         """
 308         # Both in bytes
 309         downloaded = None
 310         expected = None
 311
 312         def __init__(self, downloaded, expected):
 313                 self.downloaded = downloaded
 314                 self.expected = expected
 315
 316 class YoutubeDLHandler(urllib2.HTTPHandler):
 317         """Handler for HTTP requests and responses.
 318
 319         This class, when installed with an OpenerDirector, automatically adds
 320         the standard headers to every HTTP request and handles gzipped and
 321         deflated responses from web servers. If compression is to be avoided in
 322         a particular request, the original request in the program code only has
 323         to include the HTTP header "Youtubedl-No-Compression", which will be
 324         removed before making the real request.
 325
 326         Part of this code was copied from:
 327
 328           http://techknack.net/python-urllib2-handlers/
 329
 330         Andrew Rowls, the author of that code, agreed to release it to the
 331         public domain.
 332         """
 333
 334         @staticmethod
 335         def deflate(data):
 336                 try:
 337                         return zlib.decompress(data, -zlib.MAX_WBITS)
 338                 except zlib.error:
 339                         return zlib.decompress(data)
 340
 341         @staticmethod
 342         def addinfourl_wrapper(stream, headers, url, code):
 343                 if hasattr(urllib2.addinfourl, 'getcode'):
 344                         return urllib2.addinfourl(stream, headers, url, code)
 345                 ret = urllib2.addinfourl(stream, headers, url)
 346                 ret.code = code
 347                 return ret
 348
 349         def http_request(self, req):
 350                 for h in std_headers:
 351                         if h in req.headers:
 352                                 del req.headers[h]
 353                         req.add_header(h, std_headers[h])
 354                 if 'Youtubedl-no-compression' in req.headers:
 355                         if 'Accept-encoding' in req.headers:
 356                                 del req.headers['Accept-encoding']
 357                         del req.headers['Youtubedl-no-compression']
 358                 return req
 359
 360         def http_response(self, req, resp):
 361                 old_resp = resp
 362                 # gzip
 363                 if resp.headers.get('Content-encoding', '') == 'gzip':
 364                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 365                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 366                         resp.msg = old_resp.msg
 367                 # deflate
 368                 if resp.headers.get('Content-encoding', '') == 'deflate':
 369                         gz = StringIO.StringIO(self.deflate(resp.read()))
 370                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 371                         resp.msg = old_resp.msg
 372                 return resp
 373
 374 class FileDownloader(object):
 375         """File Downloader class.
 376
 377         File downloader objects are the ones responsible of downloading the
 378         actual video file and writing it to disk if the user has requested
 379         it, among some other tasks. In most cases there should be one per
 380         program. As, given a video URL, the downloader doesn't know how to
 381         extract all the needed information, task that InfoExtractors do, it
 382         has to pass the URL to one of them.
 383
 384         For this, file downloader objects have a method that allows
 385         InfoExtractors to be registered in a given order. When it is passed
 386         a URL, the file downloader handles it to the first InfoExtractor it
 387         finds that reports being able to handle it. The InfoExtractor extracts
 388         all the information about the video or videos the URL refers to, and
 389         asks the FileDownloader to process the video information, possibly
 390         downloading the video.
 391
 392         File downloaders accept a lot of parameters. In order not to saturate
 393         the object constructor with arguments, it receives a dictionary of
 394         options instead. These options are available through the params
 395         attribute for the InfoExtractors to use. The FileDownloader also
 396         registers itself as the downloader in charge for the InfoExtractors
 397         that are added to it, so this is a "mutual registration".
 398
 399         Available options:
 400
 401         username:         Username for authentication purposes.
 402         password:         Password for authentication purposes.
 403         usenetrc:         Use netrc for authentication instead.
 404         quiet:            Do not print messages to stdout.
 405         forceurl:         Force printing final URL.
 406         forcetitle:       Force printing title.
 407         forcethumbnail:   Force printing thumbnail URL.
 408         forcedescription: Force printing description.
 409         forcefilename:    Force printing final filename.
 410         simulate:         Do not download the video files.
 411         format:           Video format code.
 412         format_limit:     Highest quality format to try.
 413         outtmpl:          Template for output names.
 414         ignoreerrors:     Do not stop on download errors.
 415         ratelimit:        Download speed limit, in bytes/sec.
 416         nooverwrites:     Prevent overwriting files.
 417         retries:          Number of times to retry for HTTP error 5xx
 418         continuedl:       Try to continue downloads if possible.
 419         noprogress:       Do not print the progress bar.
 420         playliststart:    Playlist item to start at.
 421         playlistend:      Playlist item to end at.
 422         logtostderr:      Log messages to stderr instead of stdout.
 423         consoletitle:     Display progress in console window's titlebar.
 424         nopart:           Do not use temporary .part files.
 425         updatetime:       Use the Last-modified header to set output file timestamps.
 426         writedescription: Write the video description to a .description file
 427         writeinfojson:    Write the video description to a .info.json file
 428         """
 429
 430         params = None
 431         _ies = []
 432         _pps = []
 433         _download_retcode = None
 434         _num_downloads = None
 435         _screen_file = None
 436
 437         def __init__(self, params):
 438                 """Create a FileDownloader object with the given options."""
 439                 self._ies = []
 440                 self._pps = []
 441                 self._download_retcode = 0
 442                 self._num_downloads = 0
 443                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 444                 self.params = params
 445
 446         @staticmethod
 447         def pmkdir(filename):
 448                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 449                 components = filename.split(os.sep)
 450                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 451                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 452                 for dir in aggregate:
 453                         if not os.path.exists(dir):
 454                                 os.mkdir(dir)
 455
 456         @staticmethod
 457         def format_bytes(bytes):
 458                 if bytes is None:
 459                         return 'N/A'
 460                 if type(bytes) is str:
 461                         bytes = float(bytes)
 462                 if bytes == 0.0:
 463                         exponent = 0
 464                 else:
 465                         exponent = long(math.log(bytes, 1024.0))
 466                 suffix = 'bkMGTPEZY'[exponent]
 467                 converted = float(bytes) / float(1024**exponent)
 468                 return '%.2f%s' % (converted, suffix)
 469
 470         @staticmethod
 471         def calc_percent(byte_counter, data_len):
 472                 if data_len is None:
 473                         return '---.-%'
 474                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 475
 476         @staticmethod
 477         def calc_eta(start, now, total, current):
 478                 if total is None:
 479                         return '--:--'
 480                 dif = now - start
 481                 if current == 0 or dif < 0.001: # One millisecond
 482                         return '--:--'
 483                 rate = float(current) / dif
 484                 eta = long((float(total) - float(current)) / rate)
 485                 (eta_mins, eta_secs) = divmod(eta, 60)
 486                 if eta_mins > 99:
 487                         return '--:--'
 488                 return '%02d:%02d' % (eta_mins, eta_secs)
 489
 490         @staticmethod
 491         def calc_speed(start, now, bytes):
 492                 dif = now - start
 493                 if bytes == 0 or dif < 0.001: # One millisecond
 494                         return '%10s' % '---b/s'
 495                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 496
 497         @staticmethod
 498         def best_block_size(elapsed_time, bytes):
 499                 new_min = max(bytes / 2.0, 1.0)
 500                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 501                 if elapsed_time < 0.001:
 502                         return long(new_max)
 503                 rate = bytes / elapsed_time
 504                 if rate > new_max:
 505                         return long(new_max)
 506                 if rate < new_min:
 507                         return long(new_min)
 508                 return long(rate)
 509
 510         @staticmethod
 511         def parse_bytes(bytestr):
 512                 """Parse a string indicating a byte quantity into a long integer."""
 513                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 514                 if matchobj is None:
 515                         return None
 516                 number = float(matchobj.group(1))
 517                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 518                 return long(round(number * multiplier))
 519
 520         def add_info_extractor(self, ie):
 521                 """Add an InfoExtractor object to the end of the list."""
 522                 self._ies.append(ie)
 523                 ie.set_downloader(self)
 524
 525         def add_post_processor(self, pp):
 526                 """Add a PostProcessor object to the end of the chain."""
 527                 self._pps.append(pp)
 528                 pp.set_downloader(self)
 529
 530         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 531                 """Print message to stdout if not in quiet mode."""
 532                 try:
 533                         if not self.params.get('quiet', False):
 534                                 terminator = [u'\n', u''][skip_eol]
 535                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 536                         self._screen_file.flush()
 537                 except (UnicodeEncodeError), err:
 538                         if not ignore_encoding_errors:
 539                                 raise
 540
 541         def to_stderr(self, message):
 542                 """Print message to stderr."""
 543                 print >>sys.stderr, message.encode(preferredencoding())
 544
 545         def to_cons_title(self, message):
 546                 """Set console/terminal window title to message."""
 547                 if not self.params.get('consoletitle', False):
 548                         return
 549                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 550                         # c_wchar_p() might not be necessary if `message` is
 551                         # already of type unicode()
 552                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 553                 elif 'TERM' in os.environ:
 554                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 555
 556         def fixed_template(self):
 557                 """Checks if the output template is fixed."""
 558                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 559
 560         def trouble(self, message=None):
 561                 """Determine action to take when a download problem appears.
 562
 563                 Depending on if the downloader has been configured to ignore
 564                 download errors or not, this method may throw an exception or
 565                 not when errors are found, after printing the message.
 566                 """
 567                 if message is not None:
 568                         self.to_stderr(message)
 569                 if not self.params.get('ignoreerrors', False):
 570                         raise DownloadError(message)
 571                 self._download_retcode = 1
 572
 573         def slow_down(self, start_time, byte_counter):
 574                 """Sleep if the download speed is over the rate limit."""
 575                 rate_limit = self.params.get('ratelimit', None)
 576                 if rate_limit is None or byte_counter == 0:
 577                         return
 578                 now = time.time()
 579                 elapsed = now - start_time
 580                 if elapsed <= 0.0:
 581                         return
 582                 speed = float(byte_counter) / elapsed
 583                 if speed > rate_limit:
 584                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 585
 586         def temp_name(self, filename):
 587                 """Returns a temporary filename for the given filename."""
 588                 if self.params.get('nopart', False) or filename == u'-' or \
 589                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 590                         return filename
 591                 return filename + u'.part'
 592
 593         def undo_temp_name(self, filename):
 594                 if filename.endswith(u'.part'):
 595                         return filename[:-len(u'.part')]
 596                 return filename
 597
 598         def try_rename(self, old_filename, new_filename):
 599                 try:
 600                         if old_filename == new_filename:
 601                                 return
 602                         os.rename(old_filename, new_filename)
 603                 except (IOError, OSError), err:
 604                         self.trouble(u'ERROR: unable to rename file')
 605
 606         def try_utime(self, filename, last_modified_hdr):
 607                 """Try to set the last-modified time of the given file."""
 608                 if last_modified_hdr is None:
 609                         return
 610                 if not os.path.isfile(filename):
 611                         return
 612                 timestr = last_modified_hdr
 613                 if timestr is None:
 614                         return
 615                 filetime = timeconvert(timestr)
 616                 if filetime is None:
 617                         return
 618                 try:
 619                         os.utime(filename,(time.time(), filetime))
 620                 except:
 621                         pass
 622
 623         def report_writedescription(self, descfn):
 624                 """ Report that the description file is being written """
 625                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
 626
 627         def report_writeinfojson(self, infofn):
 628                 """ Report that the metadata file has been written """
 629                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
 630
 631         def report_destination(self, filename):
 632                 """Report destination filename."""
 633                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 634
 635         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 636                 """Report download progress."""
 637                 if self.params.get('noprogress', False):
 638                         return
 639                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 640                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 641                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 642                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 643
 644         def report_resuming_byte(self, resume_len):
 645                 """Report attempt to resume at given byte."""
 646                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 647
 648         def report_retry(self, count, retries):
 649                 """Report retry in case of HTTP error 5xx"""
 650                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 651
 652         def report_file_already_downloaded(self, file_name):
 653                 """Report file has already been fully downloaded."""
 654                 try:
 655                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 656                 except (UnicodeEncodeError), err:
 657                         self.to_screen(u'[download] The file has already been downloaded')
 658
 659         def report_unable_to_resume(self):
 660                 """Report it was impossible to resume download."""
 661                 self.to_screen(u'[download] Unable to resume')
 662
 663         def report_finish(self):
 664                 """Report download finished."""
 665                 if self.params.get('noprogress', False):
 666                         self.to_screen(u'[download] Download completed')
 667                 else:
 668                         self.to_screen(u'')
 669
 670         def increment_downloads(self):
 671                 """Increment the ordinal that assigns a number to each file."""
 672                 self._num_downloads += 1
 673
 674         def prepare_filename(self, info_dict):
 675                 """Generate the output filename."""
 676                 try:
 677                         template_dict = dict(info_dict)
 678                         template_dict['epoch'] = unicode(long(time.time()))
 679                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 680                         filename = self.params['outtmpl'] % template_dict
 681                         return filename
 682                 except (ValueError, KeyError), err:
 683                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 684                         return None
 685
 686         def process_info(self, info_dict):
 687                 """Process a single dictionary returned by an InfoExtractor."""
 688                 filename = self.prepare_filename(info_dict)
 689                 # Do nothing else if in simulate mode
 690                 if self.params.get('simulate', False):
 691                         # Forced printings
 692                         if self.params.get('forcetitle', False):
 693                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 694                         if self.params.get('forceurl', False):
 695                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 696                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 697                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 698                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 699                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 700                         if self.params.get('forcefilename', False) and filename is not None:
 701                                 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 702
 703                         return
 704
 705                 if filename is None:
 706                         return
 707                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 708                         self.to_stderr(u'WARNING: file exists and will be skipped')
 709                         return
 710
 711                 try:
 712                         self.pmkdir(filename)
 713                 except (OSError, IOError), err:
 714                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
 715                         return
 716
 717                 if self.params.get('writedescription', False):
 718                         try:
 719                                 descfn = filename + '.description'
 720                                 self.report_writedescription(descfn)
 721                                 descfile = open(descfn, 'wb')
 722                                 try:
 723                                         descfile.write(info_dict['description'].encode('utf-8'))
 724                                 finally:
 725                                         descfile.close()
 726                         except (OSError, IOError):
 727                                 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
 728                                 return
 729
 730                 if self.params.get('writeinfojson', False):
 731                         infofn = filename + '.info.json'
 732                         self.report_writeinfojson(infofn)
 733                         try:
 734                                 json.dump
 735                         except (NameError,AttributeError):
 736                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 737                                 return
 738                         try:
 739                                 infof = open(infofn, 'wb')
 740                                 try:
 741                                         json.dump(info_dict, infof)
 742                                 finally:
 743                                         infof.close()
 744                         except (OSError, IOError):
 745                                 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
 746                                 return
 747
 748                 try:
 749                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 750                 except (OSError, IOError), err:
 751                         raise UnavailableVideoError
 752                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 753                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 754                         return
 755                 except (ContentTooShortError, ), err:
 756                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 757                         return
 758
 759                 if success:
 760                         try:
 761                                 self.post_process(filename, info_dict)
 762                         except (PostProcessingError), err:
 763                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
 764                                 return
 765
 766         def download(self, url_list):
 767                 """Download a given list of URLs."""
 768                 if len(url_list) > 1 and self.fixed_template():
 769                         raise SameFileError(self.params['outtmpl'])
 770
 771                 for url in url_list:
 772                         suitable_found = False
 773                         for ie in self._ies:
 774                                 # Go to next InfoExtractor if not suitable
 775                                 if not ie.suitable(url):
 776                                         continue
 777
 778                                 # Suitable InfoExtractor found
 779                                 suitable_found = True
 780
 781                                 # Extract information from URL and process it
 782                                 ie.extract(url)
 783
 784                                 # Suitable InfoExtractor had been found; go to next URL
 785                                 break
 786
 787                         if not suitable_found:
 788                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 789
 790                 return self._download_retcode
 791
 792         def post_process(self, filename, ie_info):
 793                 """Run the postprocessing chain on the given file."""
 794                 info = dict(ie_info)
 795                 info['filepath'] = filename
 796                 for pp in self._pps:
 797                         info = pp.run(info)
 798                         if info is None:
 799                                 break
 800
 801         def _download_with_rtmpdump(self, filename, url, player_url):
 802                 self.report_destination(filename)
 803                 tmpfilename = self.temp_name(filename)
 804
 805                 # Check for rtmpdump first
 806                 try:
 807                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 808                 except (OSError, IOError):
 809                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 810                         return False
 811
 812                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 813                 # the connection was interrumpted and resuming appears to be
 814                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 815                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 816                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 817                 while retval == 2 or retval == 1:
 818                         prevsize = os.path.getsize(tmpfilename)
 819                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 820                         time.sleep(5.0) # This seems to be needed
 821                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 822                         cursize = os.path.getsize(tmpfilename)
 823                         if prevsize == cursize and retval == 1:
 824                                 break
 825                 if retval == 0:
 826                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 827                         self.try_rename(tmpfilename, filename)
 828                         return True
 829                 else:
 830                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 831                         return False
 832
 833         def _do_download(self, filename, url, player_url):
 834                 # Check file already present
 835                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 836                         self.report_file_already_downloaded(filename)
 837                         return True
 838
 839                 # Attempt to download using rtmpdump
 840                 if url.startswith('rtmp'):
 841                         return self._download_with_rtmpdump(filename, url, player_url)
 842
 843                 tmpfilename = self.temp_name(filename)
 844                 stream = None
 845                 open_mode = 'wb'
 846
 847                 # Do not include the Accept-Encoding header
 848                 headers = {'Youtubedl-no-compression': 'True'}
 849                 basic_request = urllib2.Request(url, None, headers)
 850                 request = urllib2.Request(url, None, headers)
 851
 852                 # Establish possible resume length
 853                 if os.path.isfile(tmpfilename):
 854                         resume_len = os.path.getsize(tmpfilename)
 855                 else:
 856                         resume_len = 0
 857
 858                 # Request parameters in case of being able to resume
 859                 if self.params.get('continuedl', False) and resume_len != 0:
 860                         self.report_resuming_byte(resume_len)
 861                         request.add_header('Range','bytes=%d-' % resume_len)
 862                         open_mode = 'ab'
 863
 864                 count = 0
 865                 retries = self.params.get('retries', 0)
 866                 while count <= retries:
 867                         # Establish connection
 868                         try:
 869                                 data = urllib2.urlopen(request)
 870                                 break
 871                         except (urllib2.HTTPError, ), err:
 872                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 873                                         # Unexpected HTTP error
 874                                         raise
 875                                 elif err.code == 416:
 876                                         # Unable to resume (requested range not satisfiable)
 877                                         try:
 878                                                 # Open the connection again without the range header
 879                                                 data = urllib2.urlopen(basic_request)
 880                                                 content_length = data.info()['Content-Length']
 881                                         except (urllib2.HTTPError, ), err:
 882                                                 if err.code < 500 or err.code >= 600:
 883                                                         raise
 884                                         else:
 885                                                 # Examine the reported length
 886                                                 if (content_length is not None and
 887                                                         (resume_len - 100 < long(content_length) < resume_len + 100)):
 888                                                         # The file had already been fully downloaded.
 889                                                         # Explanation to the above condition: in issue #175 it was revealed that
 890                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 891                                                         # changing the file size slightly and causing problems for some users. So
 892                                                         # I decided to implement a suggested change and consider the file
 893                                                         # completely downloaded if the file size differs less than 100 bytes from
 894                                                         # the one in the hard drive.
 895                                                         self.report_file_already_downloaded(filename)
 896                                                         self.try_rename(tmpfilename, filename)
 897                                                         return True
 898                                                 else:
 899                                                         # The length does not match, we start the download over
 900                                                         self.report_unable_to_resume()
 901                                                         open_mode = 'wb'
 902                                                         break
 903                         # Retry
 904                         count += 1
 905                         if count <= retries:
 906                                 self.report_retry(count, retries)
 907
 908                 if count > retries:
 909                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 910                         return False
 911
 912                 data_len = data.info().get('Content-length', None)
 913                 if data_len is not None:
 914                         data_len = long(data_len) + resume_len
 915                 data_len_str = self.format_bytes(data_len)
 916                 byte_counter = 0 + resume_len
 917                 block_size = 1024
 918                 start = time.time()
 919                 while True:
 920                         # Download and write
 921                         before = time.time()
 922                         data_block = data.read(block_size)
 923                         after = time.time()
 924                         if len(data_block) == 0:
 925                                 break
 926                         byte_counter += len(data_block)
 927
 928                         # Open file just in time
 929                         if stream is None:
 930                                 try:
 931                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 932                                         filename = self.undo_temp_name(tmpfilename)
 933                                         self.report_destination(filename)
 934                                 except (OSError, IOError), err:
 935                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 936                                         return False
 937                         try:
 938                                 stream.write(data_block)
 939                         except (IOError, OSError), err:
 940                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 941                                 return False
 942                         block_size = self.best_block_size(after - before, len(data_block))
 943
 944                         # Progress message
 945                         percent_str = self.calc_percent(byte_counter, data_len)
 946                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 947                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 948                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 949
 950                         # Apply rate limit
 951                         self.slow_down(start, byte_counter - resume_len)
 952
 953                 stream.close()
 954                 self.report_finish()
 955                 if data_len is not None and byte_counter != data_len:
 956                         raise ContentTooShortError(byte_counter, long(data_len))
 957                 self.try_rename(tmpfilename, filename)
 958
 959                 # Update file modification time
 960                 if self.params.get('updatetime', True):
 961                         self.try_utime(filename, data.info().get('last-modified', None))
 962
 963                 return True
 964
 965 class InfoExtractor(object):
 966         """Information Extractor class.
 967
 968         Information extractors are the classes that, given a URL, extract
 969         information from the video (or videos) the URL refers to. This
 970         information includes the real video URL, the video title and simplified
 971         title, author and others. The information is stored in a dictionary
 972         which is then passed to the FileDownloader. The FileDownloader
 973         processes this information possibly downloading the video to the file
 974         system, among other possible outcomes. The dictionaries must include
 975         the following fields:
 976
 977         id:             Video identifier.
 978         url:            Final video URL.
 979         uploader:       Nickname of the video uploader.
 980         title:          Literal title.
 981         stitle:         Simplified title.
 982         ext:            Video filename extension.
 983         format:         Video format.
 984         player_url:     SWF Player URL (may be None).
 985
 986         The following fields are optional. Their primary purpose is to allow
 987         youtube-dl to serve as the backend for a video search function, such
 988         as the one in youtube2mp3.  They are only used when their respective
 989         forced printing functions are called:
 990
 991         thumbnail:      Full URL to a video thumbnail image.
 992         description:    One-line video description.
 993
 994         Subclasses of this one should re-define the _real_initialize() and
 995         _real_extract() methods, as well as the suitable() static method.
 996         Probably, they should also be instantiated and added to the main
 997         downloader.
 998         """
 999
1000         _ready = False
1001         _downloader = None
1002
1003         def __init__(self, downloader=None):
1004                 """Constructor. Receives an optional downloader."""
1005                 self._ready = False
1006                 self.set_downloader(downloader)
1007
1008         @staticmethod
1009         def suitable(url):
1010                 """Receives a URL and returns True if suitable for this IE."""
1011                 return False
1012
1013         def initialize(self):
1014                 """Initializes an instance (authentication, etc)."""
1015                 if not self._ready:
1016                         self._real_initialize()
1017                         self._ready = True
1018
1019         def extract(self, url):
1020                 """Extracts URL information and returns it in list of dicts."""
1021                 self.initialize()
1022                 return self._real_extract(url)
1023
1024         def set_downloader(self, downloader):
1025                 """Sets the downloader for this IE."""
1026                 self._downloader = downloader
1027
1028         def _real_initialize(self):
1029                 """Real initialization process. Redefine in subclasses."""
1030                 pass
1031
1032         def _real_extract(self, url):
1033                 """Real extraction process. Redefine in subclasses."""
1034                 pass
1035
1036 class YoutubeIE(InfoExtractor):
1037         """Information extractor for youtube.com."""
1038
1039         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1040         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1041         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1042         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1043         _NETRC_MACHINE = 'youtube'
1044         # Listed in order of quality
1045         _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1046         _video_extensions = {
1047                 '13': '3gp',
1048                 '17': 'mp4',
1049                 '18': 'mp4',
1050                 '22': 'mp4',
1051                 '37': 'mp4',
1052                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1053                 '43': 'webm',
1054                 '45': 'webm',
1055         }
1056
1057         @staticmethod
1058         def suitable(url):
1059                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1060
1061         def report_lang(self):
1062                 """Report attempt to set language."""
1063                 self._downloader.to_screen(u'[youtube] Setting language')
1064
1065         def report_login(self):
1066                 """Report attempt to log in."""
1067                 self._downloader.to_screen(u'[youtube] Logging in')
1068
1069         def report_age_confirmation(self):
1070                 """Report attempt to confirm age."""
1071                 self._downloader.to_screen(u'[youtube] Confirming age')
1072
1073         def report_video_webpage_download(self, video_id):
1074                 """Report attempt to download video webpage."""
1075                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1076
1077         def report_video_info_webpage_download(self, video_id):
1078                 """Report attempt to download video info webpage."""
1079                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1080
1081         def report_information_extraction(self, video_id):
1082                 """Report attempt to extract video information."""
1083                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1084
1085         def report_unavailable_format(self, video_id, format):
1086                 """Report extracted video URL."""
1087                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1088
1089         def report_rtmp_download(self):
1090                 """Indicate the download will use the RTMP protocol."""
1091                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1092
1093         def _real_initialize(self):
1094                 if self._downloader is None:
1095                         return
1096
1097                 username = None
1098                 password = None
1099                 downloader_params = self._downloader.params
1100
1101                 # Attempt to use provided username and password or .netrc data
1102                 if downloader_params.get('username', None) is not None:
1103                         username = downloader_params['username']
1104                         password = downloader_params['password']
1105                 elif downloader_params.get('usenetrc', False):
1106                         try:
1107                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1108                                 if info is not None:
1109                                         username = info[0]
1110                                         password = info[2]
1111                                 else:
1112                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1113                         except (IOError, netrc.NetrcParseError), err:
1114                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1115                                 return
1116
1117                 # Set language
1118                 request = urllib2.Request(self._LANG_URL)
1119                 try:
1120                         self.report_lang()
1121                         urllib2.urlopen(request).read()
1122                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1123                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1124                         return
1125
1126                 # No authentication to be performed
1127                 if username is None:
1128                         return
1129
1130                 # Log in
1131                 login_form = {
1132                                 'current_form': 'loginForm',
1133                                 'next':         '/',
1134                                 'action_login': 'Log In',
1135                                 'username':     username,
1136                                 'password':     password,
1137                                 }
1138                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1139                 try:
1140                         self.report_login()
1141                         login_results = urllib2.urlopen(request).read()
1142                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1143                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1144                                 return
1145                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1146                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1147                         return
1148
1149                 # Confirm age
1150                 age_form = {
1151                                 'next_url':             '/',
1152                                 'action_confirm':       'Confirm',
1153                                 }
1154                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1155                 try:
1156                         self.report_age_confirmation()
1157                         age_results = urllib2.urlopen(request).read()
1158                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1159                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1160                         return
1161
1162         def _real_extract(self, url):
1163                 # Extract video id from URL
1164                 mobj = re.match(self._VALID_URL, url)
1165                 if mobj is None:
1166                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1167                         return
1168                 video_id = mobj.group(2)
1169
1170                 # Get video webpage
1171                 self.report_video_webpage_download(video_id)
1172                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1173                 try:
1174                         video_webpage = urllib2.urlopen(request).read()
1175                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1176                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1177                         return
1178
1179                 # Attempt to extract SWF player URL
1180                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1181                 if mobj is not None:
1182                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1183                 else:
1184                         player_url = None
1185
1186                 # Get video info
1187                 self.report_video_info_webpage_download(video_id)
1188                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1189                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1190                                            % (video_id, el_type))
1191                         request = urllib2.Request(video_info_url)
1192                         try:
1193                                 video_info_webpage = urllib2.urlopen(request).read()
1194                                 video_info = parse_qs(video_info_webpage)
1195                                 if 'token' in video_info:
1196                                         break
1197                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1198                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1199                                 return
1200                 if 'token' not in video_info:
1201                         if 'reason' in video_info:
1202                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1203                         else:
1204                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1205                         return
1206
1207                 # Start extracting information
1208                 self.report_information_extraction(video_id)
1209
1210                 # uploader
1211                 if 'author' not in video_info:
1212                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1213                         return
1214                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1215
1216                 # title
1217                 if 'title' not in video_info:
1218                         self._downloader.trouble(u'ERROR: unable to extract video title')
1219                         return
1220                 video_title = urllib.unquote_plus(video_info['title'][0])
1221                 video_title = video_title.decode('utf-8')
1222                 video_title = sanitize_title(video_title)
1223
1224                 # simplified title
1225                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1226                 simple_title = simple_title.strip(ur'_')
1227
1228                 # thumbnail image
1229                 if 'thumbnail_url' not in video_info:
1230                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1231                         video_thumbnail = ''
1232                 else:   # don't panic if we can't find it
1233                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1234
1235                 # upload date
1236                 upload_date = u'NA'
1237                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1238                 if mobj is not None:
1239                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1240                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1241                         for expression in format_expressions:
1242                                 try:
1243                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1244                                 except:
1245                                         pass
1246
1247                 # description
1248                 try:
1249                         lxml.etree
1250                 except NameError:
1251                         video_description = u'No description available.'
1252                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1253                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1254                                 if mobj is not None:
1255                                         video_description = mobj.group(1).decode('utf-8')
1256                 else:
1257                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1258                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1259                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1260                         # TODO use another parser
1261
1262                 # token
1263                 video_token = urllib.unquote_plus(video_info['token'][0])
1264
1265                 # Decide which formats to download
1266                 req_format = self._downloader.params.get('format', None)
1267
1268                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1269                         self.report_rtmp_download()
1270                         video_url_list = [(None, video_info['conn'][0])]
1271                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1272                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1273                         url_data = [parse_qs(uds) for uds in url_data_strs]
1274                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1275                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1276
1277                         format_limit = self._downloader.params.get('format_limit', None)
1278                         if format_limit is not None and format_limit in self._available_formats:
1279                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1280                         else:
1281                                 format_list = self._available_formats
1282                         existing_formats = [x for x in format_list if x in url_map]
1283                         if len(existing_formats) == 0:
1284                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1285                                 return
1286                         if req_format is None:
1287                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1288                         elif req_format == '-1':
1289                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1290                         else:
1291                                 # Specific format
1292                                 if req_format not in url_map:
1293                                         self._downloader.trouble(u'ERROR: requested format not available')
1294                                         return
1295                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1296                 else:
1297                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1298                         return
1299
1300                 for format_param, video_real_url in video_url_list:
1301                         # At this point we have a new video
1302                         self._downloader.increment_downloads()
1303
1304                         # Extension
1305                         video_extension = self._video_extensions.get(format_param, 'flv')
1306
1307                         try:
1308                                 # Process video information
1309                                 self._downloader.process_info({
1310                                         'id':           video_id.decode('utf-8'),
1311                                         'url':          video_real_url.decode('utf-8'),
1312                                         'uploader':     video_uploader.decode('utf-8'),
1313                                         'upload_date':  upload_date,
1314                                         'title':        video_title,
1315                                         'stitle':       simple_title,
1316                                         'ext':          video_extension.decode('utf-8'),
1317                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1318                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1319                                         'description':  video_description,
1320                                         'player_url':   player_url,
1321                                 })
1322                         except UnavailableVideoError, err:
1323                                 self._downloader.trouble(u'\nERROR: unable to download video')
1324
1325
1326 class MetacafeIE(InfoExtractor):
1327         """Information Extractor for metacafe.com."""
1328
1329         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1330         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1331         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1332         _youtube_ie = None
1333
1334         def __init__(self, youtube_ie, downloader=None):
1335                 InfoExtractor.__init__(self, downloader)
1336                 self._youtube_ie = youtube_ie
1337
1338         @staticmethod
1339         def suitable(url):
1340                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1341
1342         def report_disclaimer(self):
1343                 """Report disclaimer retrieval."""
1344                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1345
1346         def report_age_confirmation(self):
1347                 """Report attempt to confirm age."""
1348                 self._downloader.to_screen(u'[metacafe] Confirming age')
1349
1350         def report_download_webpage(self, video_id):
1351                 """Report webpage download."""
1352                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1353
1354         def report_extraction(self, video_id):
1355                 """Report information extraction."""
1356                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1357
1358         def _real_initialize(self):
1359                 # Retrieve disclaimer
1360                 request = urllib2.Request(self._DISCLAIMER)
1361                 try:
1362                         self.report_disclaimer()
1363                         disclaimer = urllib2.urlopen(request).read()
1364                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1365                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1366                         return
1367
1368                 # Confirm age
1369                 disclaimer_form = {
1370                         'filters': '0',
1371                         'submit': "Continue - I'm over 18",
1372                         }
1373                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1374                 try:
1375                         self.report_age_confirmation()
1376                         disclaimer = urllib2.urlopen(request).read()
1377                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1378                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1379                         return
1380
1381         def _real_extract(self, url):
1382                 # Extract id and simplified title from URL
1383                 mobj = re.match(self._VALID_URL, url)
1384                 if mobj is None:
1385                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1386                         return
1387
1388                 video_id = mobj.group(1)
1389
1390                 # Check if video comes from YouTube
1391                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1392                 if mobj2 is not None:
1393                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1394                         return
1395
1396                 # At this point we have a new video
1397                 self._downloader.increment_downloads()
1398
1399                 simple_title = mobj.group(2).decode('utf-8')
1400
1401                 # Retrieve video webpage to extract further information
1402                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1403                 try:
1404                         self.report_download_webpage(video_id)
1405                         webpage = urllib2.urlopen(request).read()
1406                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1407                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1408                         return
1409
1410                 # Extract URL, uploader and title from webpage
1411                 self.report_extraction(video_id)
1412                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1413                 if mobj is not None:
1414                         mediaURL = urllib.unquote(mobj.group(1))
1415                         video_extension = mediaURL[-3:]
1416
1417                         # Extract gdaKey if available
1418                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1419                         if mobj is None:
1420                                 video_url = mediaURL
1421                         else:
1422                                 gdaKey = mobj.group(1)
1423                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1424                 else:
1425                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1426                         if mobj is None:
1427                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1428                                 return
1429                         vardict = parse_qs(mobj.group(1))
1430                         if 'mediaData' not in vardict:
1431                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1432                                 return
1433                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1434                         if mobj is None:
1435                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1436                                 return
1437                         mediaURL = mobj.group(1).replace('\\/', '/')
1438                         video_extension = mediaURL[-3:]
1439                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1440
1441                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1442                 if mobj is None:
1443                         self._downloader.trouble(u'ERROR: unable to extract title')
1444                         return
1445                 video_title = mobj.group(1).decode('utf-8')
1446                 video_title = sanitize_title(video_title)
1447
1448                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1449                 if mobj is None:
1450                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1451                         return
1452                 video_uploader = mobj.group(1)
1453
1454                 try:
1455                         # Process video information
1456                         self._downloader.process_info({
1457                                 'id':           video_id.decode('utf-8'),
1458                                 'url':          video_url.decode('utf-8'),
1459                                 'uploader':     video_uploader.decode('utf-8'),
1460                                 'upload_date':  u'NA',
1461                                 'title':        video_title,
1462                                 'stitle':       simple_title,
1463                                 'ext':          video_extension.decode('utf-8'),
1464                                 'format':       u'NA',
1465                                 'player_url':   None,
1466                         })
1467                 except UnavailableVideoError:
1468                         self._downloader.trouble(u'\nERROR: unable to download video')
1469
1470
1471 class DailymotionIE(InfoExtractor):
1472         """Information Extractor for Dailymotion"""
1473
1474         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1475
1476         def __init__(self, downloader=None):
1477                 InfoExtractor.__init__(self, downloader)
1478
1479         @staticmethod
1480         def suitable(url):
1481                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1482
1483         def report_download_webpage(self, video_id):
1484                 """Report webpage download."""
1485                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1486
1487         def report_extraction(self, video_id):
1488                 """Report information extraction."""
1489                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1490
1491         def _real_initialize(self):
1492                 return
1493
1494         def _real_extract(self, url):
1495                 # Extract id and simplified title from URL
1496                 mobj = re.match(self._VALID_URL, url)
1497                 if mobj is None:
1498                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1499                         return
1500
1501                 # At this point we have a new video
1502                 self._downloader.increment_downloads()
1503                 video_id = mobj.group(1)
1504
1505                 simple_title = mobj.group(2).decode('utf-8')
1506                 video_extension = 'flv'
1507
1508                 # Retrieve video webpage to extract further information
1509                 request = urllib2.Request(url)
1510                 try:
1511                         self.report_download_webpage(video_id)
1512                         webpage = urllib2.urlopen(request).read()
1513                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1514                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1515                         return
1516
1517                 # Extract URL, uploader and title from webpage
1518                 self.report_extraction(video_id)
1519                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1520                 if mobj is None:
1521                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1522                         return
1523                 mediaURL = urllib.unquote(mobj.group(1))
1524
1525                 # if needed add http://www.dailymotion.com/ if relative URL
1526
1527                 video_url = mediaURL
1528
1529                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1530                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1531                 if mobj is None:
1532                         self._downloader.trouble(u'ERROR: unable to extract title')
1533                         return
1534                 video_title = mobj.group(1).decode('utf-8')
1535                 video_title = sanitize_title(video_title)
1536
1537                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1538                 if mobj is None:
1539                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1540                         return
1541                 video_uploader = mobj.group(1)
1542
1543                 try:
1544                         # Process video information
1545                         self._downloader.process_info({
1546                                 'id':           video_id.decode('utf-8'),
1547                                 'url':          video_url.decode('utf-8'),
1548                                 'uploader':     video_uploader.decode('utf-8'),
1549                                 'upload_date':  u'NA',
1550                                 'title':        video_title,
1551                                 'stitle':       simple_title,
1552                                 'ext':          video_extension.decode('utf-8'),
1553                                 'format':       u'NA',
1554                                 'player_url':   None,
1555                         })
1556                 except UnavailableVideoError:
1557                         self._downloader.trouble(u'\nERROR: unable to download video')
1558
1559 class GoogleIE(InfoExtractor):
1560         """Information extractor for video.google.com."""
1561
1562         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1563
1564         def __init__(self, downloader=None):
1565                 InfoExtractor.__init__(self, downloader)
1566
1567         @staticmethod
1568         def suitable(url):
1569                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1570
1571         def report_download_webpage(self, video_id):
1572                 """Report webpage download."""
1573                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1574
1575         def report_extraction(self, video_id):
1576                 """Report information extraction."""
1577                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1578
1579         def _real_initialize(self):
1580                 return
1581
1582         def _real_extract(self, url):
1583                 # Extract id from URL
1584                 mobj = re.match(self._VALID_URL, url)
1585                 if mobj is None:
1586                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1587                         return
1588
1589                 # At this point we have a new video
1590                 self._downloader.increment_downloads()
1591                 video_id = mobj.group(1)
1592
1593                 video_extension = 'mp4'
1594
1595                 # Retrieve video webpage to extract further information
1596                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1597                 try:
1598                         self.report_download_webpage(video_id)
1599                         webpage = urllib2.urlopen(request).read()
1600                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1601                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1602                         return
1603
1604                 # Extract URL, uploader, and title from webpage
1605                 self.report_extraction(video_id)
1606                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1607                 if mobj is None:
1608                         video_extension = 'flv'
1609                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1610                 if mobj is None:
1611                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1612                         return
1613                 mediaURL = urllib.unquote(mobj.group(1))
1614                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1615                 mediaURL = mediaURL.replace('\\x26', '\x26')
1616
1617                 video_url = mediaURL
1618
1619                 mobj = re.search(r'<title>(.*)</title>', webpage)
1620                 if mobj is None:
1621                         self._downloader.trouble(u'ERROR: unable to extract title')
1622                         return
1623                 video_title = mobj.group(1).decode('utf-8')
1624                 video_title = sanitize_title(video_title)
1625                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1626
1627                 # Extract video description
1628                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1629                 if mobj is None:
1630                         self._downloader.trouble(u'ERROR: unable to extract video description')
1631                         return
1632                 video_description = mobj.group(1).decode('utf-8')
1633                 if not video_description:
1634                         video_description = 'No description available.'
1635
1636                 # Extract video thumbnail
1637                 if self._downloader.params.get('forcethumbnail', False):
1638                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1639                         try:
1640                                 webpage = urllib2.urlopen(request).read()
1641                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1642                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1643                                 return
1644                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1645                         if mobj is None:
1646                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1647                                 return
1648                         video_thumbnail = mobj.group(1)
1649                 else:   # we need something to pass to process_info
1650                         video_thumbnail = ''
1651
1652
1653                 try:
1654                         # Process video information
1655                         self._downloader.process_info({
1656                                 'id':           video_id.decode('utf-8'),
1657                                 'url':          video_url.decode('utf-8'),
1658                                 'uploader':     u'NA',
1659                                 'upload_date':  u'NA',
1660                                 'title':        video_title,
1661                                 'stitle':       simple_title,
1662                                 'ext':          video_extension.decode('utf-8'),
1663                                 'format':       u'NA',
1664                                 'player_url':   None,
1665                         })
1666                 except UnavailableVideoError:
1667                         self._downloader.trouble(u'\nERROR: unable to download video')
1668
1669
1670 class PhotobucketIE(InfoExtractor):
1671         """Information extractor for photobucket.com."""
1672
1673         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1674
1675         def __init__(self, downloader=None):
1676                 InfoExtractor.__init__(self, downloader)
1677
1678         @staticmethod
1679         def suitable(url):
1680                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1681
1682         def report_download_webpage(self, video_id):
1683                 """Report webpage download."""
1684                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1685
1686         def report_extraction(self, video_id):
1687                 """Report information extraction."""
1688                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1689
1690         def _real_initialize(self):
1691                 return
1692
1693         def _real_extract(self, url):
1694                 # Extract id from URL
1695                 mobj = re.match(self._VALID_URL, url)
1696                 if mobj is None:
1697                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1698                         return
1699
1700                 # At this point we have a new video
1701                 self._downloader.increment_downloads()
1702                 video_id = mobj.group(1)
1703
1704                 video_extension = 'flv'
1705
1706                 # Retrieve video webpage to extract further information
1707                 request = urllib2.Request(url)
1708                 try:
1709                         self.report_download_webpage(video_id)
1710                         webpage = urllib2.urlopen(request).read()
1711                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1712                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1713                         return
1714
1715                 # Extract URL, uploader, and title from webpage
1716                 self.report_extraction(video_id)
1717                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1718                 if mobj is None:
1719                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1720                         return
1721                 mediaURL = urllib.unquote(mobj.group(1))
1722
1723                 video_url = mediaURL
1724
1725                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1726                 if mobj is None:
1727                         self._downloader.trouble(u'ERROR: unable to extract title')
1728                         return
1729                 video_title = mobj.group(1).decode('utf-8')
1730                 video_title = sanitize_title(video_title)
1731                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1732
1733                 video_uploader = mobj.group(2).decode('utf-8')
1734
1735                 try:
1736                         # Process video information
1737                         self._downloader.process_info({
1738                                 'id':           video_id.decode('utf-8'),
1739                                 'url':          video_url.decode('utf-8'),
1740                                 'uploader':     video_uploader,
1741                                 'upload_date':  u'NA',
1742                                 'title':        video_title,
1743                                 'stitle':       simple_title,
1744                                 'ext':          video_extension.decode('utf-8'),
1745                                 'format':       u'NA',
1746                                 'player_url':   None,
1747                         })
1748                 except UnavailableVideoError:
1749                         self._downloader.trouble(u'\nERROR: unable to download video')
1750
1751
1752 class YahooIE(InfoExtractor):
1753         """Information extractor for video.yahoo.com."""
1754
1755         # _VALID_URL matches all Yahoo! Video URLs
1756         # _VPAGE_URL matches only the extractable '/watch/' URLs
1757         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1758         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1759
1760         def __init__(self, downloader=None):
1761                 InfoExtractor.__init__(self, downloader)
1762
1763         @staticmethod
1764         def suitable(url):
1765                 return (re.match(YahooIE._VALID_URL, url) is not None)
1766
1767         def report_download_webpage(self, video_id):
1768                 """Report webpage download."""
1769                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1770
1771         def report_extraction(self, video_id):
1772                 """Report information extraction."""
1773                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1774
1775         def _real_initialize(self):
1776                 return
1777
1778         def _real_extract(self, url, new_video=True):
1779                 # Extract ID from URL
1780                 mobj = re.match(self._VALID_URL, url)
1781                 if mobj is None:
1782                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1783                         return
1784
1785                 # At this point we have a new video
1786                 self._downloader.increment_downloads()
1787                 video_id = mobj.group(2)
1788                 video_extension = 'flv'
1789
1790                 # Rewrite valid but non-extractable URLs as
1791                 # extractable English language /watch/ URLs
1792                 if re.match(self._VPAGE_URL, url) is None:
1793                         request = urllib2.Request(url)
1794                         try:
1795                                 webpage = urllib2.urlopen(request).read()
1796                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1797                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1798                                 return
1799
1800                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1801                         if mobj is None:
1802                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1803                                 return
1804                         yahoo_id = mobj.group(1)
1805
1806                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1807                         if mobj is None:
1808                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1809                                 return
1810                         yahoo_vid = mobj.group(1)
1811
1812                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1813                         return self._real_extract(url, new_video=False)
1814
1815                 # Retrieve video webpage to extract further information
1816                 request = urllib2.Request(url)
1817                 try:
1818                         self.report_download_webpage(video_id)
1819                         webpage = urllib2.urlopen(request).read()
1820                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1821                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1822                         return
1823
1824                 # Extract uploader and title from webpage
1825                 self.report_extraction(video_id)
1826                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1827                 if mobj is None:
1828                         self._downloader.trouble(u'ERROR: unable to extract video title')
1829                         return
1830                 video_title = mobj.group(1).decode('utf-8')
1831                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1832
1833                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1834                 if mobj is None:
1835                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1836                         return
1837                 video_uploader = mobj.group(1).decode('utf-8')
1838
1839                 # Extract video thumbnail
1840                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1841                 if mobj is None:
1842                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1843                         return
1844                 video_thumbnail = mobj.group(1).decode('utf-8')
1845
1846                 # Extract video description
1847                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1848                 if mobj is None:
1849                         self._downloader.trouble(u'ERROR: unable to extract video description')
1850                         return
1851                 video_description = mobj.group(1).decode('utf-8')
1852                 if not video_description: video_description = 'No description available.'
1853
1854                 # Extract video height and width
1855                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1856                 if mobj is None:
1857                         self._downloader.trouble(u'ERROR: unable to extract video height')
1858                         return
1859                 yv_video_height = mobj.group(1)
1860
1861                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1862                 if mobj is None:
1863                         self._downloader.trouble(u'ERROR: unable to extract video width')
1864                         return
1865                 yv_video_width = mobj.group(1)
1866
1867                 # Retrieve video playlist to extract media URL
1868                 # I'm not completely sure what all these options are, but we
1869                 # seem to need most of them, otherwise the server sends a 401.
1870                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1871                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1872                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1873                                                                   '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1874                                                                   '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1875                 try:
1876                         self.report_download_webpage(video_id)
1877                         webpage = urllib2.urlopen(request).read()
1878                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1879                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1880                         return
1881
1882                 # Extract media URL from playlist XML
1883                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1884                 if mobj is None:
1885                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1886                         return
1887                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1888                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1889
1890                 try:
1891                         # Process video information
1892                         self._downloader.process_info({
1893                                 'id':           video_id.decode('utf-8'),
1894                                 'url':          video_url,
1895                                 'uploader':     video_uploader,
1896                                 'upload_date':  u'NA',
1897                                 'title':        video_title,
1898                                 'stitle':       simple_title,
1899                                 'ext':          video_extension.decode('utf-8'),
1900                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1901                                 'description':  video_description,
1902                                 'thumbnail':    video_thumbnail,
1903                                 'description':  video_description,
1904                                 'player_url':   None,
1905                         })
1906                 except UnavailableVideoError:
1907                         self._downloader.trouble(u'\nERROR: unable to download video')
1908
1909
1910 class VimeoIE(InfoExtractor):
1911         """Information extractor for vimeo.com."""
1912
1913         # _VALID_URL matches Vimeo URLs
1914         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1915
1916         def __init__(self, downloader=None):
1917                 InfoExtractor.__init__(self, downloader)
1918
1919         @staticmethod
1920         def suitable(url):
1921                 return (re.match(VimeoIE._VALID_URL, url) is not None)
1922
1923         def report_download_webpage(self, video_id):
1924                 """Report webpage download."""
1925                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1926
1927         def report_extraction(self, video_id):
1928                 """Report information extraction."""
1929                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1930
1931         def _real_initialize(self):
1932                 return
1933
1934         def _real_extract(self, url, new_video=True):
1935                 # Extract ID from URL
1936                 mobj = re.match(self._VALID_URL, url)
1937                 if mobj is None:
1938                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1939                         return
1940
1941                 # At this point we have a new video
1942                 self._downloader.increment_downloads()
1943                 video_id = mobj.group(1)
1944
1945                 # Retrieve video webpage to extract further information
1946                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1947                 try:
1948                         self.report_download_webpage(video_id)
1949                         webpage = urllib2.urlopen(request).read()
1950                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1951                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1952                         return
1953
1954                 # Now we begin extracting as much information as we can from what we
1955                 # retrieved. First we extract the information common to all extractors,
1956                 # and latter we extract those that are Vimeo specific.
1957                 self.report_extraction(video_id)
1958
1959                 # Extract title
1960                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1961                 if mobj is None:
1962                         self._downloader.trouble(u'ERROR: unable to extract video title')
1963                         return
1964                 video_title = mobj.group(1).decode('utf-8')
1965                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1966
1967                 # Extract uploader
1968                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1969                 if mobj is None:
1970                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1971                         return
1972                 video_uploader = mobj.group(1).decode('utf-8')
1973
1974                 # Extract video thumbnail
1975                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1976                 if mobj is None:
1977                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1978                         return
1979                 video_thumbnail = mobj.group(1).decode('utf-8')
1980
1981                 # # Extract video description
1982                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1983                 # if mobj is None:
1984                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
1985                 #       return
1986                 # video_description = mobj.group(1).decode('utf-8')
1987                 # if not video_description: video_description = 'No description available.'
1988                 video_description = 'Foo.'
1989
1990                 # Vimeo specific: extract request signature
1991                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1992                 if mobj is None:
1993                         self._downloader.trouble(u'ERROR: unable to extract request signature')
1994                         return
1995                 sig = mobj.group(1).decode('utf-8')
1996
1997                 # Vimeo specific: Extract request signature expiration
1998                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
1999                 if mobj is None:
2000                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2001                         return
2002                 sig_exp = mobj.group(1).decode('utf-8')
2003
2004                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2005
2006                 try:
2007                         # Process video information
2008                         self._downloader.process_info({
2009                                 'id':           video_id.decode('utf-8'),
2010                                 'url':          video_url,
2011                                 'uploader':     video_uploader,
2012                                 'upload_date':  u'NA',
2013                                 'title':        video_title,
2014                                 'stitle':       simple_title,
2015                                 'ext':          u'mp4',
2016                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2017                                 'description':  video_description,
2018                                 'thumbnail':    video_thumbnail,
2019                                 'description':  video_description,
2020                                 'player_url':   None,
2021                         })
2022                 except UnavailableVideoError:
2023                         self._downloader.trouble(u'ERROR: unable to download video')
2024
2025
2026 class GenericIE(InfoExtractor):
2027         """Generic last-resort information extractor."""
2028
2029         def __init__(self, downloader=None):
2030                 InfoExtractor.__init__(self, downloader)
2031
2032         @staticmethod
2033         def suitable(url):
2034                 return True
2035
2036         def report_download_webpage(self, video_id):
2037                 """Report webpage download."""
2038                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2039                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2040
2041         def report_extraction(self, video_id):
2042                 """Report information extraction."""
2043                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2044
2045         def _real_initialize(self):
2046                 return
2047
2048         def _real_extract(self, url):
2049                 # At this point we have a new video
2050                 self._downloader.increment_downloads()
2051
2052                 video_id = url.split('/')[-1]
2053                 request = urllib2.Request(url)
2054                 try:
2055                         self.report_download_webpage(video_id)
2056                         webpage = urllib2.urlopen(request).read()
2057                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2058                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2059                         return
2060                 except ValueError, err:
2061                         # since this is the last-resort InfoExtractor, if
2062                         # this error is thrown, it'll be thrown here
2063                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2064                         return
2065
2066                 self.report_extraction(video_id)
2067                 # Start with something easy: JW Player in SWFObject
2068                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2069                 if mobj is None:
2070                         # Broaden the search a little bit
2071                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2072                 if mobj is None:
2073                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2074                         return
2075
2076                 # It's possible that one of the regexes
2077                 # matched, but returned an empty group:
2078                 if mobj.group(1) is None:
2079                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2080                         return
2081
2082                 video_url = urllib.unquote(mobj.group(1))
2083                 video_id  = os.path.basename(video_url)
2084
2085                 # here's a fun little line of code for you:
2086                 video_extension = os.path.splitext(video_id)[1][1:]
2087                 video_id        = os.path.splitext(video_id)[0]
2088
2089                 # it's tempting to parse this further, but you would
2090                 # have to take into account all the variations like
2091                 #   Video Title - Site Name
2092                 #   Site Name | Video Title
2093                 #   Video Title - Tagline | Site Name
2094                 # and so on and so forth; it's just not practical
2095                 mobj = re.search(r'<title>(.*)</title>', webpage)
2096                 if mobj is None:
2097                         self._downloader.trouble(u'ERROR: unable to extract title')
2098                         return
2099                 video_title = mobj.group(1).decode('utf-8')
2100                 video_title = sanitize_title(video_title)
2101                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2102
2103                 # video uploader is domain name
2104                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2105                 if mobj is None:
2106                         self._downloader.trouble(u'ERROR: unable to extract title')
2107                         return
2108                 video_uploader = mobj.group(1).decode('utf-8')
2109
2110                 try:
2111                         # Process video information
2112                         self._downloader.process_info({
2113                                 'id':           video_id.decode('utf-8'),
2114                                 'url':          video_url.decode('utf-8'),
2115                                 'uploader':     video_uploader,
2116                                 'upload_date':  u'NA',
2117                                 'title':        video_title,
2118                                 'stitle':       simple_title,
2119                                 'ext':          video_extension.decode('utf-8'),
2120                                 'format':       u'NA',
2121                                 'player_url':   None,
2122                         })
2123                 except UnavailableVideoError, err:
2124                         self._downloader.trouble(u'\nERROR: unable to download video')
2125
2126
2127 class YoutubeSearchIE(InfoExtractor):
2128         """Information Extractor for YouTube search queries."""
2129         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2130         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2131         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2132         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2133         _youtube_ie = None
2134         _max_youtube_results = 1000
2135
2136         def __init__(self, youtube_ie, downloader=None):
2137                 InfoExtractor.__init__(self, downloader)
2138                 self._youtube_ie = youtube_ie
2139
2140         @staticmethod
2141         def suitable(url):
2142                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2143
2144         def report_download_page(self, query, pagenum):
2145                 """Report attempt to download playlist page with given number."""
2146                 query = query.decode(preferredencoding())
2147                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2148
2149         def _real_initialize(self):
2150                 self._youtube_ie.initialize()
2151
2152         def _real_extract(self, query):
2153                 mobj = re.match(self._VALID_QUERY, query)
2154                 if mobj is None:
2155                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2156                         return
2157
2158                 prefix, query = query.split(':')
2159                 prefix = prefix[8:]
2160                 query  = query.encode('utf-8')
2161                 if prefix == '':
2162                         self._download_n_results(query, 1)
2163                         return
2164                 elif prefix == 'all':
2165                         self._download_n_results(query, self._max_youtube_results)
2166                         return
2167                 else:
2168                         try:
2169                                 n = long(prefix)
2170                                 if n <= 0:
2171                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2172                                         return
2173                                 elif n > self._max_youtube_results:
2174                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2175                                         n = self._max_youtube_results
2176                                 self._download_n_results(query, n)
2177                                 return
2178                         except ValueError: # parsing prefix as integer fails
2179                                 self._download_n_results(query, 1)
2180                                 return
2181
2182         def _download_n_results(self, query, n):
2183                 """Downloads a specified number of results for a query"""
2184
2185                 video_ids = []
2186                 already_seen = set()
2187                 pagenum = 1
2188
2189                 while True:
2190                         self.report_download_page(query, pagenum)
2191                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2192                         request = urllib2.Request(result_url)
2193                         try:
2194                                 page = urllib2.urlopen(request).read()
2195                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2196                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2197                                 return
2198
2199                         # Extract video identifiers
2200                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2201                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2202                                 if video_id not in already_seen:
2203                                         video_ids.append(video_id)
2204                                         already_seen.add(video_id)
2205                                         if len(video_ids) == n:
2206                                                 # Specified n videos reached
2207                                                 for id in video_ids:
2208                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2209                                                 return
2210
2211                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2212                                 for id in video_ids:
2213                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2214                                 return
2215
2216                         pagenum = pagenum + 1
2217
2218 class GoogleSearchIE(InfoExtractor):
2219         """Information Extractor for Google Video search queries."""
2220         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2221         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2222         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2223         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2224         _google_ie = None
2225         _max_google_results = 1000
2226
2227         def __init__(self, google_ie, downloader=None):
2228                 InfoExtractor.__init__(self, downloader)
2229                 self._google_ie = google_ie
2230
2231         @staticmethod
2232         def suitable(url):
2233                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2234
2235         def report_download_page(self, query, pagenum):
2236                 """Report attempt to download playlist page with given number."""
2237                 query = query.decode(preferredencoding())
2238                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2239
2240         def _real_initialize(self):
2241                 self._google_ie.initialize()
2242
2243         def _real_extract(self, query):
2244                 mobj = re.match(self._VALID_QUERY, query)
2245                 if mobj is None:
2246                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2247                         return
2248
2249                 prefix, query = query.split(':')
2250                 prefix = prefix[8:]
2251                 query  = query.encode('utf-8')
2252                 if prefix == '':
2253                         self._download_n_results(query, 1)
2254                         return
2255                 elif prefix == 'all':
2256                         self._download_n_results(query, self._max_google_results)
2257                         return
2258                 else:
2259                         try:
2260                                 n = long(prefix)
2261                                 if n <= 0:
2262                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2263                                         return
2264                                 elif n > self._max_google_results:
2265                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2266                                         n = self._max_google_results
2267                                 self._download_n_results(query, n)
2268                                 return
2269                         except ValueError: # parsing prefix as integer fails
2270                                 self._download_n_results(query, 1)
2271                                 return
2272
2273         def _download_n_results(self, query, n):
2274                 """Downloads a specified number of results for a query"""
2275
2276                 video_ids = []
2277                 already_seen = set()
2278                 pagenum = 1
2279
2280                 while True:
2281                         self.report_download_page(query, pagenum)
2282                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2283                         request = urllib2.Request(result_url)
2284                         try:
2285                                 page = urllib2.urlopen(request).read()
2286                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2287                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2288                                 return
2289
2290                         # Extract video identifiers
2291                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2292                                 video_id = mobj.group(1)
2293                                 if video_id not in already_seen:
2294                                         video_ids.append(video_id)
2295                                         already_seen.add(video_id)
2296                                         if len(video_ids) == n:
2297                                                 # Specified n videos reached
2298                                                 for id in video_ids:
2299                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2300                                                 return
2301
2302                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2303                                 for id in video_ids:
2304                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2305                                 return
2306
2307                         pagenum = pagenum + 1
2308
2309 class YahooSearchIE(InfoExtractor):
2310         """Information Extractor for Yahoo! Video search queries."""
2311         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2312         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2313         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2314         _MORE_PAGES_INDICATOR = r'\s*Next'
2315         _yahoo_ie = None
2316         _max_yahoo_results = 1000
2317
2318         def __init__(self, yahoo_ie, downloader=None):
2319                 InfoExtractor.__init__(self, downloader)
2320                 self._yahoo_ie = yahoo_ie
2321
2322         @staticmethod
2323         def suitable(url):
2324                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2325
2326         def report_download_page(self, query, pagenum):
2327                 """Report attempt to download playlist page with given number."""
2328                 query = query.decode(preferredencoding())
2329                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2330
2331         def _real_initialize(self):
2332                 self._yahoo_ie.initialize()
2333
2334         def _real_extract(self, query):
2335                 mobj = re.match(self._VALID_QUERY, query)
2336                 if mobj is None:
2337                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2338                         return
2339
2340                 prefix, query = query.split(':')
2341                 prefix = prefix[8:]
2342                 query  = query.encode('utf-8')
2343                 if prefix == '':
2344                         self._download_n_results(query, 1)
2345                         return
2346                 elif prefix == 'all':
2347                         self._download_n_results(query, self._max_yahoo_results)
2348                         return
2349                 else:
2350                         try:
2351                                 n = long(prefix)
2352                                 if n <= 0:
2353                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2354                                         return
2355                                 elif n > self._max_yahoo_results:
2356                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2357                                         n = self._max_yahoo_results
2358                                 self._download_n_results(query, n)
2359                                 return
2360                         except ValueError: # parsing prefix as integer fails
2361                                 self._download_n_results(query, 1)
2362                                 return
2363
2364         def _download_n_results(self, query, n):
2365                 """Downloads a specified number of results for a query"""
2366
2367                 video_ids = []
2368                 already_seen = set()
2369                 pagenum = 1
2370
2371                 while True:
2372                         self.report_download_page(query, pagenum)
2373                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2374                         request = urllib2.Request(result_url)
2375                         try:
2376                                 page = urllib2.urlopen(request).read()
2377                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2378                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2379                                 return
2380
2381                         # Extract video identifiers
2382                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2383                                 video_id = mobj.group(1)
2384                                 if video_id not in already_seen:
2385                                         video_ids.append(video_id)
2386                                         already_seen.add(video_id)
2387                                         if len(video_ids) == n:
2388                                                 # Specified n videos reached
2389                                                 for id in video_ids:
2390                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2391                                                 return
2392
2393                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2394                                 for id in video_ids:
2395                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2396                                 return
2397
2398                         pagenum = pagenum + 1
2399
2400 class YoutubePlaylistIE(InfoExtractor):
2401         """Information Extractor for YouTube playlists."""
2402
2403         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2404         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2405         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2406         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2407         _youtube_ie = None
2408
2409         def __init__(self, youtube_ie, downloader=None):
2410                 InfoExtractor.__init__(self, downloader)
2411                 self._youtube_ie = youtube_ie
2412
2413         @staticmethod
2414         def suitable(url):
2415                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2416
2417         def report_download_page(self, playlist_id, pagenum):
2418                 """Report attempt to download playlist page with given number."""
2419                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2420
2421         def _real_initialize(self):
2422                 self._youtube_ie.initialize()
2423
2424         def _real_extract(self, url):
2425                 # Extract playlist id
2426                 mobj = re.match(self._VALID_URL, url)
2427                 if mobj is None:
2428                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2429                         return
2430
2431                 # Single video case
2432                 if mobj.group(3) is not None:
2433                         self._youtube_ie.extract(mobj.group(3))
2434                         return
2435
2436                 # Download playlist pages
2437                 # prefix is 'p' as default for playlists but there are other types that need extra care
2438                 playlist_prefix = mobj.group(1)
2439                 if playlist_prefix == 'a':
2440                         playlist_access = 'artist'
2441                 else:
2442                         playlist_prefix = 'p'
2443                         playlist_access = 'view_play_list'
2444                 playlist_id = mobj.group(2)
2445                 video_ids = []
2446                 pagenum = 1
2447
2448                 while True:
2449                         self.report_download_page(playlist_id, pagenum)
2450                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2451                         try:
2452                                 page = urllib2.urlopen(request).read()
2453                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2454                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2455                                 return
2456
2457                         # Extract video identifiers
2458                         ids_in_page = []
2459                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2460                                 if mobj.group(1) not in ids_in_page:
2461                                         ids_in_page.append(mobj.group(1))
2462                         video_ids.extend(ids_in_page)
2463
2464                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2465                                 break
2466                         pagenum = pagenum + 1
2467
2468                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2469                 playlistend = self._downloader.params.get('playlistend', -1)
2470                 video_ids = video_ids[playliststart:playlistend]
2471
2472                 for id in video_ids:
2473                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2474                 return
2475
2476 class YoutubeUserIE(InfoExtractor):
2477         """Information Extractor for YouTube users."""
2478
2479         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2480         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2481         _GDATA_PAGE_SIZE = 50
2482         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2483         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2484         _youtube_ie = None
2485
2486         def __init__(self, youtube_ie, downloader=None):
2487                 InfoExtractor.__init__(self, downloader)
2488                 self._youtube_ie = youtube_ie
2489
2490         @staticmethod
2491         def suitable(url):
2492                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2493
2494         def report_download_page(self, username, start_index):
2495                 """Report attempt to download user page."""
2496                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2497                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2498
2499         def _real_initialize(self):
2500                 self._youtube_ie.initialize()
2501
2502         def _real_extract(self, url):
2503                 # Extract username
2504                 mobj = re.match(self._VALID_URL, url)
2505                 if mobj is None:
2506                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2507                         return
2508
2509                 username = mobj.group(1)
2510
2511                 # Download video ids using YouTube Data API. Result size per
2512                 # query is limited (currently to 50 videos) so we need to query
2513                 # page by page until there are no video ids - it means we got
2514                 # all of them.
2515
2516                 video_ids = []
2517                 pagenum = 0
2518
2519                 while True:
2520                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2521                         self.report_download_page(username, start_index)
2522
2523                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2524
2525                         try:
2526                                 page = urllib2.urlopen(request).read()
2527                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2528                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2529                                 return
2530
2531                         # Extract video identifiers
2532                         ids_in_page = []
2533
2534                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2535                                 if mobj.group(1) not in ids_in_page:
2536                                         ids_in_page.append(mobj.group(1))
2537
2538                         video_ids.extend(ids_in_page)
2539
2540                         # A little optimization - if current page is not
2541                         # "full", ie. does not contain PAGE_SIZE video ids then
2542                         # we can assume that this page is the last one - there
2543                         # are no more ids on further pages - no need to query
2544                         # again.
2545
2546                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2547                                 break
2548
2549                         pagenum += 1
2550
2551                 all_ids_count = len(video_ids)
2552                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2553                 playlistend = self._downloader.params.get('playlistend', -1)
2554
2555                 if playlistend == -1:
2556                         video_ids = video_ids[playliststart:]
2557                 else:
2558                         video_ids = video_ids[playliststart:playlistend]
2559
2560                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2561                                                                   (username, all_ids_count, len(video_ids)))
2562
2563                 for video_id in video_ids:
2564                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2565
2566
2567 class DepositFilesIE(InfoExtractor):
2568         """Information extractor for depositfiles.com"""
2569
2570         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2571
2572         def __init__(self, downloader=None):
2573                 InfoExtractor.__init__(self, downloader)
2574
2575         @staticmethod
2576         def suitable(url):
2577                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2578
2579         def report_download_webpage(self, file_id):
2580                 """Report webpage download."""
2581                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2582
2583         def report_extraction(self, file_id):
2584                 """Report information extraction."""
2585                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2586
2587         def _real_initialize(self):
2588                 return
2589
2590         def _real_extract(self, url):
2591                 # At this point we have a new file
2592                 self._downloader.increment_downloads()
2593
2594                 file_id = url.split('/')[-1]
2595                 # Rebuild url in english locale
2596                 url = 'http://depositfiles.com/en/files/' + file_id
2597
2598                 # Retrieve file webpage with 'Free download' button pressed
2599                 free_download_indication = { 'gateway_result' : '1' }
2600                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2601                 try:
2602                         self.report_download_webpage(file_id)
2603                         webpage = urllib2.urlopen(request).read()
2604                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2605                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2606                         return
2607
2608                 # Search for the real file URL
2609                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2610                 if (mobj is None) or (mobj.group(1) is None):
2611                         # Try to figure out reason of the error.
2612                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2613                         if (mobj is not None) and (mobj.group(1) is not None):
2614                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2615                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2616                         else:
2617                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2618                         return
2619
2620                 file_url = mobj.group(1)
2621                 file_extension = os.path.splitext(file_url)[1][1:]
2622
2623                 # Search for file title
2624                 mobj = re.search(r'<b title="(.*?)">', webpage)
2625                 if mobj is None:
2626                         self._downloader.trouble(u'ERROR: unable to extract title')
2627                         return
2628                 file_title = mobj.group(1).decode('utf-8')
2629
2630                 try:
2631                         # Process file information
2632                         self._downloader.process_info({
2633                                 'id':           file_id.decode('utf-8'),
2634                                 'url':          file_url.decode('utf-8'),
2635                                 'uploader':     u'NA',
2636                                 'upload_date':  u'NA',
2637                                 'title':        file_title,
2638                                 'stitle':       file_title,
2639                                 'ext':          file_extension.decode('utf-8'),
2640                                 'format':       u'NA',
2641                                 'player_url':   None,
2642                         })
2643                 except UnavailableVideoError, err:
2644                         self._downloader.trouble(u'ERROR: unable to download file')
2645
2646 class FacebookIE(InfoExtractor):
2647         """Information Extractor for Facebook"""
2648
2649         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2650         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2651         _NETRC_MACHINE = 'facebook'
2652         _available_formats = ['highqual', 'lowqual']
2653         _video_extensions = {
2654                 'highqual': 'mp4',
2655                 'lowqual': 'mp4',
2656         }
2657
2658         def __init__(self, downloader=None):
2659                 InfoExtractor.__init__(self, downloader)
2660
2661         @staticmethod
2662         def suitable(url):
2663                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2664
2665         def _reporter(self, message):
2666                 """Add header and report message."""
2667                 self._downloader.to_screen(u'[facebook] %s' % message)
2668
2669         def report_login(self):
2670                 """Report attempt to log in."""
2671                 self._reporter(u'Logging in')
2672
2673         def report_video_webpage_download(self, video_id):
2674                 """Report attempt to download video webpage."""
2675                 self._reporter(u'%s: Downloading video webpage' % video_id)
2676
2677         def report_information_extraction(self, video_id):
2678                 """Report attempt to extract video information."""
2679                 self._reporter(u'%s: Extracting video information' % video_id)
2680
2681         def _parse_page(self, video_webpage):
2682                 """Extract video information from page"""
2683                 # General data
2684                 data = {'title': r'class="video_title datawrap">(.*?)</',
2685                         'description': r'<div class="datawrap">(.*?)</div>',
2686                         'owner': r'\("video_owner_name", "(.*?)"\)',
2687                         'upload_date': r'data-date="(.*?)"',
2688                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2689                         }
2690                 video_info = {}
2691                 for piece in data.keys():
2692                         mobj = re.search(data[piece], video_webpage)
2693                         if mobj is not None:
2694                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2695
2696                 # Video urls
2697                 video_urls = {}
2698                 for fmt in self._available_formats:
2699                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2700                         if mobj is not None:
2701                                 # URL is in a Javascript segment inside an escaped Unicode format within
2702                                 # the generally utf-8 page
2703                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2704                 video_info['video_urls'] = video_urls
2705
2706                 return video_info
2707
2708         def _real_initialize(self):
2709                 if self._downloader is None:
2710                         return
2711
2712                 useremail = None
2713                 password = None
2714                 downloader_params = self._downloader.params
2715
2716                 # Attempt to use provided username and password or .netrc data
2717                 if downloader_params.get('username', None) is not None:
2718                         useremail = downloader_params['username']
2719                         password = downloader_params['password']
2720                 elif downloader_params.get('usenetrc', False):
2721                         try:
2722                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2723                                 if info is not None:
2724                                         useremail = info[0]
2725                                         password = info[2]
2726                                 else:
2727                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2728                         except (IOError, netrc.NetrcParseError), err:
2729                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2730                                 return
2731
2732                 if useremail is None:
2733                         return
2734
2735                 # Log in
2736                 login_form = {
2737                         'email': useremail,
2738                         'pass': password,
2739                         'login': 'Log+In'
2740                         }
2741                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2742                 try:
2743                         self.report_login()
2744                         login_results = urllib2.urlopen(request).read()
2745                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2746                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2747                                 return
2748                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2749                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2750                         return
2751
2752         def _real_extract(self, url):
2753                 mobj = re.match(self._VALID_URL, url)
2754                 if mobj is None:
2755                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2756                         return
2757                 video_id = mobj.group('ID')
2758
2759                 # Get video webpage
2760                 self.report_video_webpage_download(video_id)
2761                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2762                 try:
2763                         page = urllib2.urlopen(request)
2764                         video_webpage = page.read()
2765                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2766                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2767                         return
2768
2769                 # Start extracting information
2770                 self.report_information_extraction(video_id)
2771
2772                 # Extract information
2773                 video_info = self._parse_page(video_webpage)
2774
2775                 # uploader
2776                 if 'owner' not in video_info:
2777                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2778                         return
2779                 video_uploader = video_info['owner']
2780
2781                 # title
2782                 if 'title' not in video_info:
2783                         self._downloader.trouble(u'ERROR: unable to extract video title')
2784                         return
2785                 video_title = video_info['title']
2786                 video_title = video_title.decode('utf-8')
2787                 video_title = sanitize_title(video_title)
2788
2789                 # simplified title
2790                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2791                 simple_title = simple_title.strip(ur'_')
2792
2793                 # thumbnail image
2794                 if 'thumbnail' not in video_info:
2795                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2796                         video_thumbnail = ''
2797                 else:
2798                         video_thumbnail = video_info['thumbnail']
2799
2800                 # upload date
2801                 upload_date = u'NA'
2802                 if 'upload_date' in video_info:
2803                         upload_time = video_info['upload_date']
2804                         timetuple = email.utils.parsedate_tz(upload_time)
2805                         if timetuple is not None:
2806                                 try:
2807                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2808                                 except:
2809                                         pass
2810
2811                 # description
2812                 video_description = video_info.get('description', 'No description available.')
2813
2814                 url_map = video_info['video_urls']
2815                 if len(url_map.keys()) > 0:
2816                         # Decide which formats to download
2817                         req_format = self._downloader.params.get('format', None)
2818                         format_limit = self._downloader.params.get('format_limit', None)
2819
2820                         if format_limit is not None and format_limit in self._available_formats:
2821                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2822                         else:
2823                                 format_list = self._available_formats
2824                         existing_formats = [x for x in format_list if x in url_map]
2825                         if len(existing_formats) == 0:
2826                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2827                                 return
2828                         if req_format is None:
2829                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2830                         elif req_format == '-1':
2831                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2832                         else:
2833                                 # Specific format
2834                                 if req_format not in url_map:
2835                                         self._downloader.trouble(u'ERROR: requested format not available')
2836                                         return
2837                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2838
2839                 for format_param, video_real_url in video_url_list:
2840
2841                         # At this point we have a new video
2842                         self._downloader.increment_downloads()
2843
2844                         # Extension
2845                         video_extension = self._video_extensions.get(format_param, 'mp4')
2846
2847                         try:
2848                                 # Process video information
2849                                 self._downloader.process_info({
2850                                         'id':           video_id.decode('utf-8'),
2851                                         'url':          video_real_url.decode('utf-8'),
2852                                         'uploader':     video_uploader.decode('utf-8'),
2853                                         'upload_date':  upload_date,
2854                                         'title':        video_title,
2855                                         'stitle':       simple_title,
2856                                         'ext':          video_extension.decode('utf-8'),
2857                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2858                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2859                                         'description':  video_description.decode('utf-8'),
2860                                         'player_url':   None,
2861                                 })
2862                         except UnavailableVideoError, err:
2863                                 self._downloader.trouble(u'\nERROR: unable to download video')
2864
2865 class BlipTVIE(InfoExtractor):
2866         """Information extractor for blip.tv"""
2867
2868         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2869         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2870
2871         @staticmethod
2872         def suitable(url):
2873                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2874
2875         def report_extraction(self, file_id):
2876                 """Report information extraction."""
2877                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2878
2879         def _simplify_title(self, title):
2880                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2881                 res = res.strip(ur'_')
2882                 return res
2883
2884         def _real_extract(self, url):
2885                 mobj = re.match(self._VALID_URL, url)
2886                 if mobj is None:
2887                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2888                         return
2889
2890                 if '?' in url:
2891                         cchar = '&'
2892                 else:
2893                         cchar = '?'
2894                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2895                 request = urllib2.Request(json_url)
2896                 self.report_extraction(mobj.group(1))
2897                 try:
2898                         json_code = urllib2.urlopen(request).read()
2899                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2900                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2901                         return
2902                 try:
2903                         json_data = json.loads(json_code)
2904                         if 'Post' in json_data:
2905                                 data = json_data['Post']
2906                         else:
2907                                 data = json_data
2908
2909                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2910                         video_url = data['media']['url']
2911                         umobj = re.match(self._URL_EXT, video_url)
2912                         if umobj is None:
2913                                 raise ValueError('Can not determine filename extension')
2914                         ext = umobj.group(1)
2915
2916                         self._downloader.increment_downloads()
2917
2918                         info = {
2919                                 'id': data['item_id'],
2920                                 'url': video_url,
2921                                 'uploader': data['display_name'],
2922                                 'upload_date': upload_date,
2923                                 'title': data['title'],
2924                                 'stitle': self._simplify_title(data['title']),
2925                                 'ext': ext,
2926                                 'format': data['media']['mimeType'],
2927                                 'thumbnail': data['thumbnailUrl'],
2928                                 'description': data['description'],
2929                                 'player_url': data['embedUrl']
2930                         }
2931                 except (ValueError,KeyError), err:
2932                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2933                         return
2934
2935                 try:
2936                         self._downloader.process_info(info)
2937                 except UnavailableVideoError, err:
2938                         self._downloader.trouble(u'\nERROR: unable to download video')
2939
2940
2941 class PostProcessor(object):
2942         """Post Processor class.
2943
2944         PostProcessor objects can be added to downloaders with their
2945         add_post_processor() method. When the downloader has finished a
2946         successful download, it will take its internal chain of PostProcessors
2947         and start calling the run() method on each one of them, first with
2948         an initial argument and then with the returned value of the previous
2949         PostProcessor.
2950
2951         The chain will be stopped if one of them ever returns None or the end
2952         of the chain is reached.
2953
2954         PostProcessor objects follow a "mutual registration" process similar
2955         to InfoExtractor objects.
2956         """
2957
2958         _downloader = None
2959
2960         def __init__(self, downloader=None):
2961                 self._downloader = downloader
2962
2963         def set_downloader(self, downloader):
2964                 """Sets the downloader for this PP."""
2965                 self._downloader = downloader
2966
2967         def run(self, information):
2968                 """Run the PostProcessor.
2969
2970                 The "information" argument is a dictionary like the ones
2971                 composed by InfoExtractors. The only difference is that this
2972                 one has an extra field called "filepath" that points to the
2973                 downloaded file.
2974
2975                 When this method returns None, the postprocessing chain is
2976                 stopped. However, this method may return an information
2977                 dictionary that will be passed to the next postprocessing
2978                 object in the chain. It can be the one it received after
2979                 changing some fields.
2980
2981                 In addition, this method may raise a PostProcessingError
2982                 exception that will be taken into account by the downloader
2983                 it was called from.
2984                 """
2985                 return information # by default, do nothing
2986
2987 class FFmpegExtractAudioPP(PostProcessor):
2988
2989         def __init__(self, downloader=None, preferredcodec=None):
2990                 PostProcessor.__init__(self, downloader)
2991                 if preferredcodec is None:
2992                         preferredcodec = 'best'
2993                 self._preferredcodec = preferredcodec
2994
2995         @staticmethod
2996         def get_audio_codec(path):
2997                 try:
2998                         cmd = ['ffprobe', '-show_streams', '--', path]
2999                         handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3000                         output = handle.communicate()[0]
3001                         if handle.wait() != 0:
3002                                 return None
3003                 except (IOError, OSError):
3004                         return None
3005                 audio_codec = None
3006                 for line in output.split('\n'):
3007                         if line.startswith('codec_name='):
3008                                 audio_codec = line.split('=')[1].strip()
3009                         elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3010                                 return audio_codec
3011                 return None
3012
3013         @staticmethod
3014         def run_ffmpeg(path, out_path, codec, more_opts):
3015                 try:
3016                         cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3017                         ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3018                         return (ret == 0)
3019                 except (IOError, OSError):
3020                         return False
3021
3022         def run(self, information):
3023                 path = information['filepath']
3024
3025                 filecodec = self.get_audio_codec(path)
3026                 if filecodec is None:
3027                         self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3028                         return None
3029
3030                 more_opts = []
3031                 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3032                         if filecodec == 'aac' or filecodec == 'mp3':
3033                                 # Lossless if possible
3034                                 acodec = 'copy'
3035                                 extension = filecodec
3036                                 if filecodec == 'aac':
3037                                         more_opts = ['-f', 'adts']
3038                         else:
3039                                 # MP3 otherwise.
3040                                 acodec = 'libmp3lame'
3041                                 extension = 'mp3'
3042                                 more_opts = ['-ab', '128k']
3043                 else:
3044                         # We convert the audio (lossy)
3045                         acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3046                         extension = self._preferredcodec
3047                         more_opts = ['-ab', '128k']
3048                         if self._preferredcodec == 'aac':
3049                                 more_opts += ['-f', 'adts']
3050
3051                 (prefix, ext) = os.path.splitext(path)
3052                 new_path = prefix + '.' + extension
3053                 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3054                 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3055
3056                 if not status:
3057                         self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3058                         return None
3059
3060                 try:
3061                         os.remove(path)
3062                 except (IOError, OSError):
3063                         self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3064                         return None
3065
3066                 information['filepath'] = new_path
3067                 return information
3068
3069
3070 def updateSelf(downloader, filename):
3071         ''' Update the program file with the latest version from the repository '''
3072         # Note: downloader only used for options
3073         if not os.access(filename, os.W_OK):
3074                 sys.exit('ERROR: no write permissions on %s' % filename)
3075
3076         downloader.to_screen('Updating to latest version...')
3077
3078         try:
3079                 try:
3080                         urlh = urllib.urlopen(UPDATE_URL)
3081                         newcontent = urlh.read()
3082                 finally:
3083                         urlh.close()
3084         except (IOError, OSError), err:
3085                 sys.exit('ERROR: unable to download latest version')
3086
3087         try:
3088                 outf = open(filename, 'wb')
3089                 try:
3090                         outf.write(newcontent)
3091                 finally:
3092                         outf.close()
3093         except (IOError, OSError), err:
3094                 sys.exit('ERROR: unable to overwrite current version')
3095
3096         downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3097
3098 def parseOpts():
3099         # Deferred imports
3100         import getpass
3101         import optparse
3102
3103         def _format_option_string(option):
3104                 ''' ('-o', '--option') -> -o, --format METAVAR'''
3105
3106                 opts = []
3107
3108                 if option._short_opts: opts.append(option._short_opts[0])
3109                 if option._long_opts: opts.append(option._long_opts[0])
3110                 if len(opts) > 1: opts.insert(1, ', ')
3111
3112                 if option.takes_value(): opts.append(' %s' % option.metavar)
3113
3114                 return "".join(opts)
3115
3116         def _find_term_columns():
3117                 columns = os.environ.get('COLUMNS', None)
3118                 if columns:
3119                         return int(columns)
3120
3121                 try:
3122                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3123                         out,err = sp.communicate()
3124                         return int(out.split()[1])
3125                 except:
3126                         pass
3127                 return None
3128
3129         max_width = 80
3130         max_help_position = 80
3131
3132         # No need to wrap help messages if we're on a wide console
3133         columns = _find_term_columns()
3134         if columns: max_width = columns
3135
3136         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3137         fmt.format_option_strings = _format_option_string
3138
3139         kw = {
3140                 'version'   : __version__,
3141                 'formatter' : fmt,
3142                 'usage' : '%prog [options] url...',
3143                 'conflict_handler' : 'resolve',
3144         }
3145
3146         parser = optparse.OptionParser(**kw)
3147
3148         # option groups
3149         general        = optparse.OptionGroup(parser, 'General Options')
3150         authentication = optparse.OptionGroup(parser, 'Authentication Options')
3151         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
3152         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
3153         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
3154         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3155
3156         general.add_option('-h', '--help',
3157                         action='help', help='print this help text and exit')
3158         general.add_option('-v', '--version',
3159                         action='version', help='print program version and exit')
3160         general.add_option('-U', '--update',
3161                         action='store_true', dest='update_self', help='update this program to latest stable version')
3162         general.add_option('-i', '--ignore-errors',
3163                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3164         general.add_option('-r', '--rate-limit',
3165                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3166         general.add_option('-R', '--retries',
3167                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3168         general.add_option('--playlist-start',
3169                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3170         general.add_option('--playlist-end',
3171                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3172         general.add_option('--dump-user-agent',
3173                         action='store_true', dest='dump_user_agent',
3174                         help='display the current browser identification', default=False)
3175
3176         authentication.add_option('-u', '--username',
3177                         dest='username', metavar='USERNAME', help='account username')
3178         authentication.add_option('-p', '--password',
3179                         dest='password', metavar='PASSWORD', help='account password')
3180         authentication.add_option('-n', '--netrc',
3181                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3182
3183
3184         video_format.add_option('-f', '--format',
3185                         action='store', dest='format', metavar='FORMAT', help='video format code')
3186         video_format.add_option('--all-formats',
3187                         action='store_const', dest='format', help='download all available video formats', const='-1')
3188         video_format.add_option('--max-quality',
3189                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3190
3191
3192         verbosity.add_option('-q', '--quiet',
3193                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
3194         verbosity.add_option('-s', '--simulate',
3195                         action='store_true', dest='simulate', help='do not download video', default=False)
3196         verbosity.add_option('-g', '--get-url',
3197                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3198         verbosity.add_option('-e', '--get-title',
3199                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3200         verbosity.add_option('--get-thumbnail',
3201                         action='store_true', dest='getthumbnail',
3202                         help='simulate, quiet but print thumbnail URL', default=False)
3203         verbosity.add_option('--get-description',
3204                         action='store_true', dest='getdescription',
3205                         help='simulate, quiet but print video description', default=False)
3206         verbosity.add_option('--get-filename',
3207                         action='store_true', dest='getfilename',
3208                         help='simulate, quiet but print output filename', default=False)
3209         verbosity.add_option('--no-progress',
3210                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3211         verbosity.add_option('--console-title',
3212                         action='store_true', dest='consoletitle',
3213                         help='display progress in console titlebar', default=False)
3214
3215
3216         filesystem.add_option('-t', '--title',
3217                         action='store_true', dest='usetitle', help='use title in file name', default=False)
3218         filesystem.add_option('-l', '--literal',
3219                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3220         filesystem.add_option('-A', '--auto-number',
3221                         action='store_true', dest='autonumber',
3222                         help='number downloaded files starting from 00000', default=False)
3223         filesystem.add_option('-o', '--output',
3224                         dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3225         filesystem.add_option('-a', '--batch-file',
3226                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3227         filesystem.add_option('-w', '--no-overwrites',
3228                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3229         filesystem.add_option('-c', '--continue',
3230                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3231         filesystem.add_option('--cookies',
3232                         dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3233         filesystem.add_option('--no-part',
3234                         action='store_true', dest='nopart', help='do not use .part files', default=False)
3235         filesystem.add_option('--no-mtime',
3236                         action='store_false', dest='updatetime',
3237                         help='do not use the Last-modified header to set the file modification time', default=True)
3238         filesystem.add_option('--write-description',
3239                         action='store_true', dest='writedescription',
3240                         help='write video description to a .description file', default=False)
3241         filesystem.add_option('--write-info-json',
3242                         action='store_true', dest='writeinfojson',
3243                         help='write video metadata to a .info.json file', default=False)
3244
3245
3246         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3247                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3248         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3249                         help='"best", "aac" or "mp3"; best by default')
3250
3251
3252         parser.add_option_group(general)
3253         parser.add_option_group(filesystem)
3254         parser.add_option_group(verbosity)
3255         parser.add_option_group(video_format)
3256         parser.add_option_group(authentication)
3257         parser.add_option_group(postproc)
3258
3259         opts, args = parser.parse_args()
3260
3261         return parser, opts, args
3262
3263 def main():
3264         parser, opts, args = parseOpts()
3265
3266         # Open appropriate CookieJar
3267         if opts.cookiefile is None:
3268                 jar = cookielib.CookieJar()
3269         else:
3270                 try:
3271                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3272                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3273                                 jar.load()
3274                 except (IOError, OSError), err:
3275                         sys.exit(u'ERROR: unable to open cookie file')
3276
3277         # Dump user agent
3278         if opts.dump_user_agent:
3279                 print std_headers['User-Agent']
3280                 sys.exit(0)
3281
3282         # General configuration
3283         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3284         urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3285         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3286
3287         # Batch file verification
3288         batchurls = []
3289         if opts.batchfile is not None:
3290                 try:
3291                         if opts.batchfile == '-':
3292                                 batchfd = sys.stdin
3293                         else:
3294                                 batchfd = open(opts.batchfile, 'r')
3295                         batchurls = batchfd.readlines()
3296                         batchurls = [x.strip() for x in batchurls]
3297                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3298                 except IOError:
3299                         sys.exit(u'ERROR: batch file could not be read')
3300         all_urls = batchurls + args
3301
3302         # Conflicting, missing and erroneous options
3303         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3304                 parser.error(u'using .netrc conflicts with giving username/password')
3305         if opts.password is not None and opts.username is None:
3306                 parser.error(u'account username missing')
3307         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3308                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3309         if opts.usetitle and opts.useliteral:
3310                 parser.error(u'using title conflicts with using literal title')
3311         if opts.username is not None and opts.password is None:
3312                 opts.password = getpass.getpass(u'Type account password and press return:')
3313         if opts.ratelimit is not None:
3314                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3315                 if numeric_limit is None:
3316                         parser.error(u'invalid rate limit specified')
3317                 opts.ratelimit = numeric_limit
3318         if opts.retries is not None:
3319                 try:
3320                         opts.retries = long(opts.retries)
3321                 except (TypeError, ValueError), err:
3322                         parser.error(u'invalid retry count specified')
3323         try:
3324                 opts.playliststart = int(opts.playliststart)
3325                 if opts.playliststart <= 0:
3326                         raise ValueError(u'Playlist start must be positive')
3327         except (TypeError, ValueError), err:
3328                 parser.error(u'invalid playlist start number specified')
3329         try:
3330                 opts.playlistend = int(opts.playlistend)
3331                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3332                         raise ValueError(u'Playlist end must be greater than playlist start')
3333         except (TypeError, ValueError), err:
3334                 parser.error(u'invalid playlist end number specified')
3335         if opts.extractaudio:
3336                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3337                         parser.error(u'invalid audio format specified')
3338
3339         # Information extractors
3340         youtube_ie = YoutubeIE()
3341         metacafe_ie = MetacafeIE(youtube_ie)
3342         dailymotion_ie = DailymotionIE()
3343         youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3344         youtube_user_ie = YoutubeUserIE(youtube_ie)
3345         youtube_search_ie = YoutubeSearchIE(youtube_ie)
3346         google_ie = GoogleIE()
3347         google_search_ie = GoogleSearchIE(google_ie)
3348         photobucket_ie = PhotobucketIE()
3349         yahoo_ie = YahooIE()
3350         yahoo_search_ie = YahooSearchIE(yahoo_ie)
3351         deposit_files_ie = DepositFilesIE()
3352         facebook_ie = FacebookIE()
3353         bliptv_ie = BlipTVIE()
3354         vimeo_ie = VimeoIE()
3355         generic_ie = GenericIE()
3356
3357         # File downloader
3358         fd = FileDownloader({
3359                 'usenetrc': opts.usenetrc,
3360                 'username': opts.username,
3361                 'password': opts.password,
3362                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3363                 'forceurl': opts.geturl,
3364                 'forcetitle': opts.gettitle,
3365                 'forcethumbnail': opts.getthumbnail,
3366                 'forcedescription': opts.getdescription,
3367                 'forcefilename': opts.getfilename,
3368                 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3369                 'format': opts.format,
3370                 'format_limit': opts.format_limit,
3371                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3372                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3373                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3374                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3375                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3376                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3377                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3378                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3379                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3380                         or u'%(id)s.%(ext)s'),
3381                 'ignoreerrors': opts.ignoreerrors,
3382                 'ratelimit': opts.ratelimit,
3383                 'nooverwrites': opts.nooverwrites,
3384                 'retries': opts.retries,
3385                 'continuedl': opts.continue_dl,
3386                 'noprogress': opts.noprogress,
3387                 'playliststart': opts.playliststart,
3388                 'playlistend': opts.playlistend,
3389                 'logtostderr': opts.outtmpl == '-',
3390                 'consoletitle': opts.consoletitle,
3391                 'nopart': opts.nopart,
3392                 'updatetime': opts.updatetime,
3393                 'writedescription': opts.writedescription,
3394                 'writeinfojson': opts.writeinfojson,
3395                 })
3396         fd.add_info_extractor(youtube_search_ie)
3397         fd.add_info_extractor(youtube_pl_ie)
3398         fd.add_info_extractor(youtube_user_ie)
3399         fd.add_info_extractor(metacafe_ie)
3400         fd.add_info_extractor(dailymotion_ie)
3401         fd.add_info_extractor(youtube_ie)
3402         fd.add_info_extractor(google_ie)
3403         fd.add_info_extractor(google_search_ie)
3404         fd.add_info_extractor(photobucket_ie)
3405         fd.add_info_extractor(yahoo_ie)
3406         fd.add_info_extractor(yahoo_search_ie)
3407         fd.add_info_extractor(deposit_files_ie)
3408         fd.add_info_extractor(facebook_ie)
3409         fd.add_info_extractor(bliptv_ie)
3410         fd.add_info_extractor(vimeo_ie)
3411
3412         # This must come last since it's the
3413         # fallback if none of the others work
3414         fd.add_info_extractor(generic_ie)
3415
3416         # PostProcessors
3417         if opts.extractaudio:
3418                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3419
3420         # Update version
3421         if opts.update_self:
3422                 updateSelf(fd, sys.argv[0])
3423
3424         # Maybe do nothing
3425         if len(all_urls) < 1:
3426                 if not opts.update_self:
3427                         parser.error(u'you must provide at least one URL')
3428                 else:
3429                         sys.exit()
3430         retcode = fd.download(all_urls)
3431
3432         # Dump cookie jar if requested
3433         if opts.cookiefile is not None:
3434                 try:
3435                         jar.save()
3436                 except (IOError, OSError), err:
3437                         sys.exit(u'ERROR: unable to save cookie jar')
3438
3439         sys.exit(retcode)
3440
3441
3442 if __name__ == '__main__':
3443         try:
3444                 main()
3445         except DownloadError:
3446                 sys.exit(1)
3447         except SameFileError:
3448                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3449         except KeyboardInterrupt:
3450                 sys.exit(u'\nERROR: Interrupted by user')
3451
3452 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: