2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.14'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
48 except ImportError: # Python 2.4
51 import cStringIO as StringIO
55 # parse_qs was moved from the cgi module to the urlparse module recently.
57 from urlparse import parse_qs
59 from cgi import parse_qs
67 import xml.etree.ElementTree
68 except ImportError: # Python<2.5
69 pass # Not officially supported, but let it slip
72 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
73 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
74 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
75 'Accept-Encoding': 'gzip, deflate',
76 'Accept-Language': 'en-us,en;q=0.5',
79 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
83 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
89 def raiseError(msg, i):
90 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
91 def skipSpace(i, expectMore=True):
92 while i < len(s) and s[i] in ' \t\r\n':
96 raiseError('Premature end', i)
98 def decodeEscape(match):
114 return unichr(int(esc[1:5], 16))
115 if len(esc) == 5+6 and esc[5:7] == '\\u':
116 hi = int(esc[1:5], 16)
117 low = int(esc[7:11], 16)
118 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
119 raise ValueError('Unknown escape ' + str(esc))
126 while s[e-bslashes-1] == '\\':
128 if bslashes % 2 == 1:
132 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
133 stri = rexp.sub(decodeEscape, s[i:e])
139 if s[i] == '}': # Empty dictionary
143 raiseError('Expected a string object key', i)
144 i,key = parseString(i)
146 if i >= len(s) or s[i] != ':':
147 raiseError('Expected a colon', i)
154 raiseError('Expected comma or closing curly brace', i)
159 if s[i] == ']': # Empty array
164 i = skipSpace(i) # Raise exception if premature end
168 raiseError('Expected a comma or closing bracket', i)
170 def parseDiscrete(i):
171 for k,v in {'true': True, 'false': False, 'null': None}.items():
172 if s.startswith(k, i):
174 raiseError('Not a boolean (or null)', i)
176 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
178 raiseError('Not a number', i)
180 if '.' in nums or 'e' in nums or 'E' in nums:
181 return (i+len(nums), float(nums))
182 return (i+len(nums), int(nums))
183 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
186 i,res = CHARMAP.get(s[i], parseNumber)(i)
187 i = skipSpace(i, False)
191 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
194 def preferredencoding():
195 """Get preferred encoding.
197 Returns the best encoding scheme for the system, based on
198 locale.getpreferredencoding() and some further tweaks.
200 def yield_preferredencoding():
202 pref = locale.getpreferredencoding()
208 return yield_preferredencoding().next()
211 def htmlentity_transform(matchobj):
212 """Transforms an HTML entity to a Unicode character.
214 This function receives a match object and is intended to be used with
215 the re.sub() function.
217 entity = matchobj.group(1)
219 # Known non-numeric HTML entity
220 if entity in htmlentitydefs.name2codepoint:
221 return unichr(htmlentitydefs.name2codepoint[entity])
224 mobj = re.match(ur'(?u)#(x?\d+)', entity)
226 numstr = mobj.group(1)
227 if numstr.startswith(u'x'):
229 numstr = u'0%s' % numstr
232 return unichr(long(numstr, base))
234 # Unknown entity in name, return its literal representation
235 return (u'&%s;' % entity)
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities (&amp;, &#123;, ...) into real Unicode characters.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	# Replace the OS path separator so the title cannot introduce directories
	# into the output path.
	return utitle.replace(unicode(os.sep), u'%')
244 def sanitize_open(filename, open_mode):
245 """Try to open the given filename, and slightly tweak it if this fails.
247 Attempts to open the given filename. If this fails, it tries to change
248 the filename slightly, step by step, until it's either able to open it
249 or it fails and raises a final exception, like the standard open()
252 It returns the tuple (stream, definitive_file_name).
256 if sys.platform == 'win32':
258 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
259 return (sys.stdout, filename)
260 stream = open(filename, open_mode)
261 return (stream, filename)
262 except (IOError, OSError), err:
263 # In case of error, try to remove win32 forbidden chars
264 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
266 # An exception here should be caught in the caller
267 stream = open(filename, open_mode)
268 return (stream, filename)
271 def timeconvert(timestr):
272 """Convert RFC 2822 defined time string into system timestamp"""
274 timetuple = email.utils.parsedate_tz(timestr)
275 if timetuple is not None:
276 timestamp = email.utils.mktime_tz(timetuple)
280 class DownloadError(Exception):
281 """Download Error exception.
283 This exception may be thrown by FileDownloader objects if they are not
284 configured to continue on errors. They will contain the appropriate
290 class SameFileError(Exception):
291 """Same File exception.
293 This exception will be thrown by FileDownloader objects if they detect
294 multiple files would have to be downloaded to the same file on disk.
299 class PostProcessingError(Exception):
300 """Post Processing exception.
302 This exception may be raised by PostProcessor's .run() method to
303 indicate an error in the postprocessing task.
308 class UnavailableVideoError(Exception):
309 """Unavailable Format exception.
311 This exception will be thrown when a video is requested
312 in a format that is not available for that video.
317 class ContentTooShortError(Exception):
318 """Content Too Short exception.
320 This exception may be raised by FileDownloader objects when a file they
321 download is too small for what the server announced first, indicating
322 the connection was probably interrupted.
328 def __init__(self, downloaded, expected):
329 self.downloaded = downloaded
330 self.expected = expected
333 class YoutubeDLHandler(urllib2.HTTPHandler):
334 """Handler for HTTP requests and responses.
336 This class, when installed with an OpenerDirector, automatically adds
337 the standard headers to every HTTP request and handles gzipped and
338 deflated responses from web servers. If compression is to be avoided in
339 a particular request, the original request in the program code only has
340 to include the HTTP header "Youtubedl-No-Compression", which will be
341 removed before making the real request.
343 Part of this code was copied from:
345 http://techknack.net/python-urllib2-handlers/
347 Andrew Rowls, the author of that code, agreed to release it to the
354 return zlib.decompress(data, -zlib.MAX_WBITS)
356 return zlib.decompress(data)
359 def addinfourl_wrapper(stream, headers, url, code):
360 if hasattr(urllib2.addinfourl, 'getcode'):
361 return urllib2.addinfourl(stream, headers, url, code)
362 ret = urllib2.addinfourl(stream, headers, url)
366 def http_request(self, req):
367 for h in std_headers:
370 req.add_header(h, std_headers[h])
371 if 'Youtubedl-no-compression' in req.headers:
372 if 'Accept-encoding' in req.headers:
373 del req.headers['Accept-encoding']
374 del req.headers['Youtubedl-no-compression']
377 def http_response(self, req, resp):
380 if resp.headers.get('Content-encoding', '') == 'gzip':
381 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
382 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
383 resp.msg = old_resp.msg
385 if resp.headers.get('Content-encoding', '') == 'deflate':
386 gz = StringIO.StringIO(self.deflate(resp.read()))
387 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
388 resp.msg = old_resp.msg
392 class FileDownloader(object):
393 """File Downloader class.
395 File downloader objects are the ones responsible of downloading the
396 actual video file and writing it to disk if the user has requested
397 it, among some other tasks. In most cases there should be one per
398 program. As, given a video URL, the downloader doesn't know how to
399 extract all the needed information, task that InfoExtractors do, it
400 has to pass the URL to one of them.
402 For this, file downloader objects have a method that allows
403 InfoExtractors to be registered in a given order. When it is passed
404 a URL, the file downloader handles it to the first InfoExtractor it
405 finds that reports being able to handle it. The InfoExtractor extracts
406 all the information about the video or videos the URL refers to, and
407 asks the FileDownloader to process the video information, possibly
408 downloading the video.
410 File downloaders accept a lot of parameters. In order not to saturate
411 the object constructor with arguments, it receives a dictionary of
412 options instead. These options are available through the params
413 attribute for the InfoExtractors to use. The FileDownloader also
414 registers itself as the downloader in charge for the InfoExtractors
415 that are added to it, so this is a "mutual registration".
419 username: Username for authentication purposes.
420 password: Password for authentication purposes.
421 usenetrc: Use netrc for authentication instead.
422 quiet: Do not print messages to stdout.
423 forceurl: Force printing final URL.
424 forcetitle: Force printing title.
425 forcethumbnail: Force printing thumbnail URL.
426 forcedescription: Force printing description.
427 forcefilename: Force printing final filename.
428 simulate: Do not download the video files.
429 format: Video format code.
430 format_limit: Highest quality format to try.
431 outtmpl: Template for output names.
432 ignoreerrors: Do not stop on download errors.
433 ratelimit: Download speed limit, in bytes/sec.
434 nooverwrites: Prevent overwriting files.
435 retries: Number of times to retry for HTTP error 5xx
436 continuedl: Try to continue downloads if possible.
437 noprogress: Do not print the progress bar.
438 playliststart: Playlist item to start at.
439 playlistend: Playlist item to end at.
440 logtostderr: Log messages to stderr instead of stdout.
441 consoletitle: Display progress in console window's titlebar.
442 nopart: Do not use temporary .part files.
443 updatetime: Use the Last-modified header to set output file timestamps.
444 writedescription: Write the video description to a .description file
445 writeinfojson: Write the video description to a .info.json file
451 _download_retcode = None
452 _num_downloads = None
455 def __init__(self, params):
456 """Create a FileDownloader object with the given options."""
459 self._download_retcode = 0
460 self._num_downloads = 0
461 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465 def format_bytes(bytes):
468 if type(bytes) is str:
473 exponent = long(math.log(bytes, 1024.0))
474 suffix = 'bkMGTPEZY'[exponent]
475 converted = float(bytes) / float(1024 ** exponent)
476 return '%.2f%s' % (converted, suffix)
479 def calc_percent(byte_counter, data_len):
482 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
485 def calc_eta(start, now, total, current):
489 if current == 0 or dif < 0.001: # One millisecond
491 rate = float(current) / dif
492 eta = long((float(total) - float(current)) / rate)
493 (eta_mins, eta_secs) = divmod(eta, 60)
496 return '%02d:%02d' % (eta_mins, eta_secs)
499 def calc_speed(start, now, bytes):
501 if bytes == 0 or dif < 0.001: # One millisecond
502 return '%10s' % '---b/s'
503 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
506 def best_block_size(elapsed_time, bytes):
507 new_min = max(bytes / 2.0, 1.0)
508 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
509 if elapsed_time < 0.001:
511 rate = bytes / elapsed_time
519 def parse_bytes(bytestr):
520 """Parse a string indicating a byte quantity into a long integer."""
521 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
524 number = float(matchobj.group(1))
525 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
526 return long(round(number * multiplier))
528 def add_info_extractor(self, ie):
529 """Add an InfoExtractor object to the end of the list."""
531 ie.set_downloader(self)
533 def add_post_processor(self, pp):
534 """Add a PostProcessor object to the end of the chain."""
536 pp.set_downloader(self)
538 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
539 """Print message to stdout if not in quiet mode."""
541 if not self.params.get('quiet', False):
542 terminator = [u'\n', u''][skip_eol]
543 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
544 self._screen_file.flush()
545 except (UnicodeEncodeError), err:
546 if not ignore_encoding_errors:
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode explicitly with the locale's preferred encoding; writing raw
		# unicode to stderr can raise UnicodeEncodeError on some terminals.
		print >>sys.stderr, message.encode(preferredencoding())
553 def to_cons_title(self, message):
554 """Set console/terminal window title to message."""
555 if not self.params.get('consoletitle', False):
557 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
558 # c_wchar_p() might not be necessary if `message` is
559 # already of type unicode()
560 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
561 elif 'TERM' in os.environ:
562 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
	def fixed_template(self):
		"""Checks if the output template is fixed.

		A template is "fixed" when it contains no %(field)s placeholders,
		i.e. every download would be written to the same file name (used to
		detect the SameFileError condition for multi-URL downloads).
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
568 def trouble(self, message=None):
569 """Determine action to take when a download problem appears.
571 Depending on if the downloader has been configured to ignore
572 download errors or not, this method may throw an exception or
573 not when errors are found, after printing the message.
575 if message is not None:
576 self.to_stderr(message)
577 if not self.params.get('ignoreerrors', False):
578 raise DownloadError(message)
579 self._download_retcode = 1
581 def slow_down(self, start_time, byte_counter):
582 """Sleep if the download speed is over the rate limit."""
583 rate_limit = self.params.get('ratelimit', None)
584 if rate_limit is None or byte_counter == 0:
587 elapsed = now - start_time
590 speed = float(byte_counter) / elapsed
591 if speed > rate_limit:
592 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
594 def temp_name(self, filename):
595 """Returns a temporary filename for the given filename."""
596 if self.params.get('nopart', False) or filename == u'-' or \
597 (os.path.exists(filename) and not os.path.isfile(filename)):
599 return filename + u'.part'
601 def undo_temp_name(self, filename):
602 if filename.endswith(u'.part'):
603 return filename[:-len(u'.part')]
606 def try_rename(self, old_filename, new_filename):
608 if old_filename == new_filename:
610 os.rename(old_filename, new_filename)
611 except (IOError, OSError), err:
612 self.trouble(u'ERROR: unable to rename file')
614 def try_utime(self, filename, last_modified_hdr):
615 """Try to set the last-modified time of the given file."""
616 if last_modified_hdr is None:
618 if not os.path.isfile(filename):
620 timestr = last_modified_hdr
623 filetime = timeconvert(timestr)
627 os.utime(filename, (time.time(), filetime))
631 def report_writedescription(self, descfn):
632 """ Report that the description file is being written """
633 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
635 def report_writeinfojson(self, infofn):
636 """ Report that the metadata file has been written """
637 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
639 def report_destination(self, filename):
640 """Report destination filename."""
641 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
643 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
644 """Report download progress."""
645 if self.params.get('noprogress', False):
647 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
648 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
649 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
650 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
652 def report_resuming_byte(self, resume_len):
653 """Report attempt to resume at given byte."""
654 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
656 def report_retry(self, count, retries):
657 """Report retry in case of HTTP error 5xx"""
658 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
660 def report_file_already_downloaded(self, file_name):
661 """Report file has already been fully downloaded."""
663 self.to_screen(u'[download] %s has already been downloaded' % file_name)
664 except (UnicodeEncodeError), err:
665 self.to_screen(u'[download] The file has already been downloaded')
667 def report_unable_to_resume(self):
668 """Report it was impossible to resume download."""
669 self.to_screen(u'[download] Unable to resume')
671 def report_finish(self):
672 """Report download finished."""
673 if self.params.get('noprogress', False):
674 self.to_screen(u'[download] Download completed')
678 def increment_downloads(self):
679 """Increment the ordinal that assigns a number to each file."""
680 self._num_downloads += 1
682 def prepare_filename(self, info_dict):
683 """Generate the output filename."""
685 template_dict = dict(info_dict)
686 template_dict['epoch'] = unicode(long(time.time()))
687 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
688 filename = self.params['outtmpl'] % template_dict
690 except (ValueError, KeyError), err:
691 self.trouble(u'ERROR: invalid system charset or erroneous output template')
694 def process_info(self, info_dict):
695 """Process a single dictionary returned by an InfoExtractor."""
696 filename = self.prepare_filename(info_dict)
697 # Do nothing else if in simulate mode
698 if self.params.get('simulate', False):
700 if self.params.get('forcetitle', False):
701 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
702 if self.params.get('forceurl', False):
703 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
704 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
705 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
706 if self.params.get('forcedescription', False) and 'description' in info_dict:
707 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
708 if self.params.get('forcefilename', False) and filename is not None:
709 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
715 if self.params.get('nooverwrites', False) and os.path.exists(filename):
716 self.to_stderr(u'WARNING: file exists and will be skipped')
720 dn = os.path.dirname(filename)
721 if dn != '' and not os.path.exists(dn):
723 except (OSError, IOError), err:
724 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
727 if self.params.get('writedescription', False):
729 descfn = filename + '.description'
730 self.report_writedescription(descfn)
731 descfile = open(descfn, 'wb')
733 descfile.write(info_dict['description'].encode('utf-8'))
736 except (OSError, IOError):
737 self.trouble(u'ERROR: Cannot write description file ' + descfn)
740 if self.params.get('writeinfojson', False):
741 infofn = filename + '.info.json'
742 self.report_writeinfojson(infofn)
745 except (NameError,AttributeError):
746 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
749 infof = open(infofn, 'wb')
751 json.dump(info_dict, infof)
754 except (OSError, IOError):
755 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
759 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
760 except (OSError, IOError), err:
761 raise UnavailableVideoError
762 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
763 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
765 except (ContentTooShortError, ), err:
766 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
771 self.post_process(filename, info_dict)
772 except (PostProcessingError), err:
773 self.trouble(u'ERROR: postprocessing: %s' % str(err))
776 def download(self, url_list):
777 """Download a given list of URLs."""
778 if len(url_list) > 1 and self.fixed_template():
779 raise SameFileError(self.params['outtmpl'])
782 suitable_found = False
784 # Go to next InfoExtractor if not suitable
785 if not ie.suitable(url):
788 # Suitable InfoExtractor found
789 suitable_found = True
791 # Extract information from URL and process it
794 # Suitable InfoExtractor had been found; go to next URL
797 if not suitable_found:
798 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
800 return self._download_retcode
802 def post_process(self, filename, ie_info):
803 """Run the postprocessing chain on the given file."""
805 info['filepath'] = filename
811 def _download_with_rtmpdump(self, filename, url, player_url):
812 self.report_destination(filename)
813 tmpfilename = self.temp_name(filename)
815 # Check for rtmpdump first
817 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
818 except (OSError, IOError):
819 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
822 # Download using rtmpdump. rtmpdump returns exit code 2 when
823 # the connection was interrumpted and resuming appears to be
824 # possible. This is part of rtmpdump's normal usage, AFAIK.
825 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
826 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
827 while retval == 2 or retval == 1:
828 prevsize = os.path.getsize(tmpfilename)
829 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
830 time.sleep(5.0) # This seems to be needed
831 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
832 cursize = os.path.getsize(tmpfilename)
833 if prevsize == cursize and retval == 1:
835 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
836 if prevsize == cursize and retval == 2 and cursize > 1024:
837 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
841 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
842 self.try_rename(tmpfilename, filename)
845 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
848 def _do_download(self, filename, url, player_url):
849 # Check file already present
850 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
851 self.report_file_already_downloaded(filename)
854 # Attempt to download using rtmpdump
855 if url.startswith('rtmp'):
856 return self._download_with_rtmpdump(filename, url, player_url)
858 tmpfilename = self.temp_name(filename)
862 # Do not include the Accept-Encoding header
863 headers = {'Youtubedl-no-compression': 'True'}
864 basic_request = urllib2.Request(url, None, headers)
865 request = urllib2.Request(url, None, headers)
867 # Establish possible resume length
868 if os.path.isfile(tmpfilename):
869 resume_len = os.path.getsize(tmpfilename)
873 # Request parameters in case of being able to resume
874 if self.params.get('continuedl', False) and resume_len != 0:
875 self.report_resuming_byte(resume_len)
876 request.add_header('Range', 'bytes=%d-' % resume_len)
880 retries = self.params.get('retries', 0)
881 while count <= retries:
882 # Establish connection
884 data = urllib2.urlopen(request)
886 except (urllib2.HTTPError, ), err:
887 if (err.code < 500 or err.code >= 600) and err.code != 416:
888 # Unexpected HTTP error
890 elif err.code == 416:
891 # Unable to resume (requested range not satisfiable)
893 # Open the connection again without the range header
894 data = urllib2.urlopen(basic_request)
895 content_length = data.info()['Content-Length']
896 except (urllib2.HTTPError, ), err:
897 if err.code < 500 or err.code >= 600:
900 # Examine the reported length
901 if (content_length is not None and
902 (resume_len - 100 < long(content_length) < resume_len + 100)):
903 # The file had already been fully downloaded.
904 # Explanation to the above condition: in issue #175 it was revealed that
905 # YouTube sometimes adds or removes a few bytes from the end of the file,
906 # changing the file size slightly and causing problems for some users. So
907 # I decided to implement a suggested change and consider the file
908 # completely downloaded if the file size differs less than 100 bytes from
909 # the one in the hard drive.
910 self.report_file_already_downloaded(filename)
911 self.try_rename(tmpfilename, filename)
914 # The length does not match, we start the download over
915 self.report_unable_to_resume()
921 self.report_retry(count, retries)
924 self.trouble(u'ERROR: giving up after %s retries' % retries)
927 data_len = data.info().get('Content-length', None)
928 if data_len is not None:
929 data_len = long(data_len) + resume_len
930 data_len_str = self.format_bytes(data_len)
931 byte_counter = 0 + resume_len
937 data_block = data.read(block_size)
939 if len(data_block) == 0:
941 byte_counter += len(data_block)
943 # Open file just in time
946 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
947 assert stream is not None
948 filename = self.undo_temp_name(tmpfilename)
949 self.report_destination(filename)
950 except (OSError, IOError), err:
951 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
954 stream.write(data_block)
955 except (IOError, OSError), err:
956 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
958 block_size = self.best_block_size(after - before, len(data_block))
961 percent_str = self.calc_percent(byte_counter, data_len)
962 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
963 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
964 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
967 self.slow_down(start, byte_counter - resume_len)
970 self.trouble(u'\nERROR: Did not get any data blocks')
974 if data_len is not None and byte_counter != data_len:
975 raise ContentTooShortError(byte_counter, long(data_len))
976 self.try_rename(tmpfilename, filename)
978 # Update file modification time
979 if self.params.get('updatetime', True):
980 self.try_utime(filename, data.info().get('last-modified', None))
985 class InfoExtractor(object):
986 """Information Extractor class.
988 Information extractors are the classes that, given a URL, extract
989 information from the video (or videos) the URL refers to. This
990 information includes the real video URL, the video title and simplified
991 title, author and others. The information is stored in a dictionary
992 which is then passed to the FileDownloader. The FileDownloader
993 processes this information possibly downloading the video to the file
994 system, among other possible outcomes. The dictionaries must include
995 the following fields:
997 id: Video identifier.
998 url: Final video URL.
999 uploader: Nickname of the video uploader.
1000 title: Literal title.
1001 stitle: Simplified title.
1002 ext: Video filename extension.
1003 format: Video format.
1004 player_url: SWF Player URL (may be None).
1006 The following fields are optional. Their primary purpose is to allow
1007 youtube-dl to serve as the backend for a video search function, such
1008 as the one in youtube2mp3. They are only used when their respective
1009 forced printing functions are called:
1011 thumbnail: Full URL to a video thumbnail image.
1012 description: One-line video description.
1014 Subclasses of this one should re-define the _real_initialize() and
1015 _real_extract() methods, as well as the suitable() static method.
1016 Probably, they should also be instantiated and added to the main
1023 def __init__(self, downloader=None):
1024 """Constructor. Receives an optional downloader."""
1026 self.set_downloader(downloader)
1030 """Receives a URL and returns True if suitable for this IE."""
1033 def initialize(self):
1034 """Initializes an instance (authentication, etc)."""
1036 self._real_initialize()
1039 def extract(self, url):
1040 """Extracts URL information and returns it in list of dicts."""
1042 return self._real_extract(url)
	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		# Stored so the report_* helpers can emit messages and extractors can
		# read global options through the FileDownloader's params.
		self._downloader = downloader
1048 def _real_initialize(self):
1049 """Real initialization process. Redefine in subclasses."""
1052 def _real_extract(self, url):
1053 """Real extraction process. Redefine in subclasses."""
1057 class YoutubeIE(InfoExtractor):
1058 """Information extractor for youtube.com."""
1060 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1061 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1062 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1063 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1064 _NETRC_MACHINE = 'youtube'
1065 # Listed in order of quality
1066 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1067 _video_extensions = {
1073 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1080 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1082 def report_lang(self):
1083 """Report attempt to set language."""
1084 self._downloader.to_screen(u'[youtube] Setting language')
1086 def report_login(self):
1087 """Report attempt to log in."""
1088 self._downloader.to_screen(u'[youtube] Logging in')
1090 def report_age_confirmation(self):
1091 """Report attempt to confirm age."""
1092 self._downloader.to_screen(u'[youtube] Confirming age')
1094 def report_video_webpage_download(self, video_id):
1095 """Report attempt to download video webpage."""
1096 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1098 def report_video_info_webpage_download(self, video_id):
1099 """Report attempt to download video info webpage."""
1100 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1102 def report_information_extraction(self, video_id):
1103 """Report attempt to extract video information."""
1104 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1106 def report_unavailable_format(self, video_id, format):
1107 """Report extracted video URL."""
1108 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1110 def report_rtmp_download(self):
1111 """Indicate the download will use the RTMP protocol."""
1112 self._downloader.to_screen(u'[youtube] RTMP download detected')
def _real_initialize(self):
    """Set the YouTube interface language and, when credentials are
    available (CLI options or ~/.netrc), log in and confirm age.

    NOTE(review): several lines (guards, 'try:' openers, dict-literal
    braces, 'return's) are elided in this excerpt; the Python 2
    'except ..., err' clauses below belong to elided 'try:' blocks.
    """
    if self._downloader is None:
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        # Fall back to machine credentials stored in ~/.netrc.
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        if info is not None:
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            # .netrc problems are non-fatal: warn and continue anonymously.
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

    # Set language: a plain GET to _LANG_URL suffices; the body is discarded.
    request = urllib2.Request(self._LANG_URL)
    urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

    # No authentication to be performed
    if username is None:

    # Log in: POST the login form (dict literal partly elided in excerpt).
    'current_form': 'loginForm',
    'action_login': 'Log In',
    'username': username,
    'password': password,
    request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
    login_results = urllib2.urlopen(request).read()
    # If the login form is still present in the response, the
    # credentials were rejected.
    if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
        self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    # Confirm age: POST the age-gate form (dict literal partly elided).
    'action_confirm': 'Confirm',
    request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
    self.report_age_confirmation()
    age_results = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        # Unlike the warnings above, failing the age gate is fatal.
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
    """Download the watch page and get_video_info data for *url*, pick
    the format(s) to fetch, and hand each one to process_info().

    NOTE(review): guard lines ('if mobj is None:', 'try:', 'else:',
    'return') are elided throughout this excerpt; 'except ..., err'
    clauses belong to elided 'try:' blocks.
    """
    # Extract video id from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(2)

    # Fetch the watch page (gl/hl pin region+language; has_verified
    # bypasses the interstitial).
    self.report_video_webpage_download(video_id)
    request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
    video_webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Strip the JSON backslash-escaping from the matched URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

    # Try several 'el' variants of get_video_info until one returns a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        request = urllib2.Request(video_info_url)
        video_info_webpage = urllib2.urlopen(request).read()
        video_info = parse_qs(video_info_webpage)
        if 'token' in video_info:
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
    if 'token' not in video_info:
        if 'reason' in video_info:
            # YouTube supplied an explicit failure reason; surface it.
            self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = urllib.unquote_plus(video_info['author'][0])

    # title
    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = urllib.unquote_plus(video_info['title'][0])
    video_title = video_title.decode('utf-8')
    video_title = sanitize_title(video_title)
    # Collapse everything outside the filename-safe alphabet to '_'.
    simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
    simple_title = simple_title.strip(ur'_')

    # thumbnail
    if 'thumbnail_url' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        video_thumbnail = ''
    else: # don't panic if we can't find it
        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

    # upload date: scraped from the watch page, tried against several
    # human-readable formats and normalized to YYYYMMDD.
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

    # description: only extracted when the user actually asked for it.
    video_description = u'No description available.'
    if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

    # token (not used further in the visible portion of this method)
    video_token = urllib.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        # The stream map is a comma-separated list of querystring-encoded
        # per-format dicts; build itag -> URL from the valid entries.
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [parse_qs(uds) for uds in url_data_strs]
        url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
        url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

        # Honour -f/--max-quality by truncating the preference list.
        format_limit = self._downloader.params.get('format_limit', None)
        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

    for format_param, video_real_url in video_url_list:
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': player_url,
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # group(1): numeric video id; group(2): URL slug (used as simple title).
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Kept so 'yt-' prefixed ids can be delegated to the YouTube extractor.
        self._youtube_ie = youtube_ie

    # NOTE(review): the enclosing 'def suitable(url):' line is elided in
    # this excerpt; the return below is its body.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form to
        disable age filtering for the session.

        NOTE(review): 'try:' openers and parts of the form dict are elided
        in this excerpt.
        """
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age (form dict literal partly elided in excerpt).
        'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Metacafe watch page.

        NOTE(review): 'if mobj is None:'/'try:'/'return' lines are elided
        throughout this excerpt; 'except ..., err' clauses belong to
        elided 'try:' blocks.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: parse the flashvars blob for mediaData.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Unescape JSON-escaped slashes in the URL.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # group(1): video id; group(2): URL slug (used as simple title).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the enclosing 'def suitable(url):' line is elided in
    # this excerpt; the return below is its body.
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # (body elided in this excerpt)

    def _real_extract(self, url):
        """Extract the SD stream URL, title and uploader from a watch page.

        NOTE(review): 'if mobj is None:'/'try:'/'return' lines are elided
        throughout this excerpt; 'except ..., err' clauses belong to
        elided 'try:' blocks.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-gated pages still render.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Strip the JSON backslash-escaping from the URL.
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Accepts the many country TLD variants; group(1) is the docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the enclosing 'def suitable(url):' line is elided in
    # this excerpt; the return below is its body.
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # (body elided in this excerpt)

    def _real_extract(self, url):
        """Extract the media URL, title and description from a Google
        Video play page.

        NOTE(review): 'if mobj is None:'/'try:'/'return' lines are elided
        throughout this excerpt; 'except ..., err' clauses belong to
        elided 'try:' blocks.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # Preferred: direct download_url; fallback below is the hex-escaped
        # flash videoUrl (then the extension is flv, not mp4).
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the \x3d / \x26 escapes ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # NOTE(review): abs(int(video_id)) assumes a numeric docid —
            # TODO confirm against _VALID_URL, which allows non-digits.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # group(1): the .flv media path passed via the 'current' parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the enclosing 'def suitable(url):' line is elided in
    # this excerpt; the return below is its body.
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # (body elided in this excerpt)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Photobucket page.

        NOTE(review): 'if mobj is None:'/'try:'/'return' lines are elided
        throughout this excerpt; 'except ..., err' clauses belong to
        elided 'try:' blocks.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # The uploader is the second group of the same <title> match.
        video_uploader = mobj.group(2).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the enclosing 'def suitable(url):' line is elided in
    # this excerpt; the return below is its body.
        return (re.match(YahooIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # (body elided in this excerpt)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and the playlist media URL for a Yahoo video.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted recursively (new_video=False on the second pass).

        NOTE(review): 'if mobj is None:'/'try:'/'return' lines are elided
        throughout this excerpt; 'except ..., err' clauses belong to
        elided 'try:' blocks.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the (people|profile) alternative, not
        # the display name — group(2) looks intended; verify.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # Resolve HTML entities left in the URL.
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            # NOTE(review): 'thumbnail' appears twice in this dict literal;
            # the later entry wins — consider removing one.
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the enclosing 'def suitable(url):' line is elided in
    # this excerpt; the return below is its body.
        return (re.match(VimeoIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # (body elided in this excerpt)

    def _real_extract(self, url, new_video=True):
        """Extract metadata from the moogaloop clip XML and build the
        signed play URL.

        NOTE(review): 'if mobj is None:'/'try:'/'return' lines are elided
        throughout this excerpt; 'except ..., err' clauses belong to
        elided 'try:' blocks.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract title
        mobj = re.search(r'<caption>(.*?)</caption>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract uploader
        mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # # Extract video description
        # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
        # self._downloader.trouble(u'ERROR: unable to extract video description')
        # video_description = mobj.group(1).decode('utf-8')
        # if not video_description: video_description = 'No description available.'
        video_description = 'Foo.'

        # Vimeo specific: extract request signature
        mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract request signature')
        sig = mobj.group(1).decode('utf-8')

        # Vimeo specific: Extract request signature expiration
        mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
        sig_exp = mobj.group(1).decode('utf-8')

        video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            # NOTE(review): 'thumbnail' and 'description' each appear twice
            # in this dict literal; the later entries win — consider
            # removing the duplicates.
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # NOTE(review): the rest of __init__ and the following lines are
        # elided in this excerpt.
def report_download_webpage(self, video_id):
    """Warn about the generic-extractor fallback, then log the page fetch."""
    emit = self._downloader.to_screen
    emit(u'WARNING: Falling back on generic information extractor.')
    emit(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Log that generic information extraction for *video_id* has started."""
    message = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
def _real_initialize(self):
    # (body elided in this excerpt)

def _real_extract(self, url):
    """Last-resort extraction: fetch the page and look for a direct media
    URL in JW Player / SWFObject flashvars.

    NOTE(review): 'if mobj is None:'/'try:'/'return' lines are elided in
    this excerpt, and the method continues past the end of the visible
    chunk; 'except ..., err' clauses belong to elided 'try:' blocks.
    """
    # At this point we have a new video
    self._downloader.increment_downloads()

    video_id = url.split('/')[-1]
    request = urllib2.Request(url)
    self.report_download_webpage(video_id)
    webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
    except ValueError, err:
        # since this is the last-resort InfoExtractor, if
        # this error is thrown, it'll be thrown here
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    self.report_extraction(video_id)
    # Start with something easy: JW Player in SWFObject
    mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
    # Broaden the search a little bit
    mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    # It's possible that one of the regexes
    # matched, but returned an empty group:
    if mobj.group(1) is None:
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    video_url = urllib.unquote(mobj.group(1))
    video_id = os.path.basename(video_url)

    # here's a fun little line of code for you:
    video_extension = os.path.splitext(video_id)[1][1:]
    video_id = os.path.splitext(video_id)[0]

    # it's tempting to parse this further, but you would
    # have to take into account all the variations like
    # Video Title - Site Name
    # Site Name | Video Title
    # Video Title - Tagline | Site Name
    # and so on and so forth; it's just not practical
    mobj = re.search(r'<title>(.*)</title>', webpage)
    self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')
    video_title = sanitize_title(video_title)
    simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

    # video uploader is domain name
    mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
    # NOTE(review): the error text says 'title' but this guard concerns
    # the uploader/domain match above — likely a copy-paste message.
    self._downloader.trouble(u'ERROR: unable to extract title')
2134 video_uploader = mobj.group(1).decode('utf-8')
2137 # Process video information
2138 self._downloader.process_info({
2139 'id': video_id.decode('utf-8'),
2140 'url': video_url.decode('utf-8'),
2141 'uploader': video_uploader,
2142 'upload_date': u'NA',
2143 'title': video_title,
2144 'stitle': simple_title,
2145 'ext': video_extension.decode('utf-8'),
2149 except UnavailableVideoError, err:
2150 self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch[N|all]:terms' pseudo-URLs; delegates the actual
    per-video work to the plain YouTube extractor.
    """
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard cap on how many results one query may yield.
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        # Validate the "ytsearch[N|all]:terms" query shape.
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split into count prefix and the search terms proper.
        prefix, query = query.split(':')
            query = query.encode('utf-8')
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
                self._download_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Dedupe ids across result pages.
        already_seen = set()

            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # href="/watch?v=ID": split on '=' and drop the trailing quote.
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            # No "Next" link: this was the last results page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles 'gvsearch[N|all]:terms' pseudo-URLs; delegates per-video work
    to the Google Video extractor.
    """
    _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    # Hard cap on how many results one query may yield.
    _max_google_results = 1000

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        # Validate the "gvsearch[N|all]:terms" query shape.
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
            query = query.encode('utf-8')
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
                self._download_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Dedupe ids across result pages.
        already_seen = set()

            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

            # No "Next" link: this was the last results page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles 'yvsearch[N|all]:terms' pseudo-URLs; delegates per-video work
    to the Yahoo Video extractor.
    """
    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    # Hard cap on how many results one query may yield.
    _max_yahoo_results = 1000

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        # Validate the "yvsearch[N|all]:terms" query shape.
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
            query = query.encode('utf-8')
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
                self._download_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Dedupe ids across result pages.
        already_seen = set()

            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

            # No "Next" link: this was the last results page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the paginated playlist view, collects video ids, then delegates
    each video to the plain YouTube extractor.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Group 3 captures a single-video id inside a playlist URL; if present,
        # extract just that video.
        if mobj.group(3) is not None:
            self._youtube_ie.extract(mobj.group(3))

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # No "Next" link: last playlist page reached.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            pagenum = pagenum + 1

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend == -1 this slice silently
        # drops the LAST video; YoutubeUserIE special-cases -1 before slicing.
        # Likely the same handling is needed here -- confirm.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the GData uploads feed of a user, collects video ids and
    delegates each one to the plain YouTube extractor.
    """

    _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request (currently 50); used for paging below.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # -1 sentinel means "to the end"; a plain slice would drop the last id.
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(DepositFilesIE._VALID_URL, url) is not None)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click).
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Normalize whitespace in the site's restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials or .netrc), downloads the video page and
    scrapes metadata and per-format URLs out of its inline JavaScript.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; format selection below relies on this ordering.
    _available_formats = ['highqual', 'lowqual']
    _video_extensions = {

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(FacebookIE._VALID_URL, url) is not None)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes keyed by the metadata field they capture.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',

        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values sit in escaped-Unicode JS; decode then unquote.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Collect one URL per known format name.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Login is best-effort: warn and continue unauthenticated.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Collapse runs of characters outside simple_title_chars into '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Thumbnail is optional: warn and fall back to an empty string.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                # Restrict to formats at or below the requested quality.
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
                # At this point we have a new video
                self._downloader.increment_downloads()

                video_extension = self._video_extensions.get(format_param, 'mp4')

                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    # NOTE(review): upload_date is only assigned above when a
                    # parseable date was found -- confirm a default is set upstream.
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Fetches the site's JSON metadata endpoint for a post instead of
    scraping HTML.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the extension off the end of the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(BlipTVIE._VALID_URL, url) is not None)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

    def _simplify_title(self, title):
        """Collapse characters outside simple_title_chars into underscores."""
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Request JSON metadata; 'cchar' ('?' or '&') is chosen in code
        # not visible in this hunk.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
            json_code = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

        json_data = json.loads(json_code)
        if 'Post' in json_data:
            data = json_data['Post']

            # 'datestamp' is month-day-year with a 12-hour AM/PM time.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            self._downloader.increment_downloads()

                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'stitle': self._simplify_title(data['title']),
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
        except (ValueError,KeyError), err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(MyVideoIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): 'self._download' is almost certainly a typo for
            # 'self._downloader' (used everywhere else in this class) -- this
            # line would raise AttributeError if ever reached.
            self._download.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)
        # The URL slug doubles as a pre-simplified title.
        simple_title = mobj.group(2).decode('utf-8')
        # should actually not be necessary
        simple_title = sanitize_title(simple_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)

        request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        self.report_extraction(video_id)
        # The thumbnail link encodes the movie's base media URL; the flv
        # URL is derived from it below.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1)
        video_title = sanitize_title(video_title)

        self._downloader.process_info({
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Matches ':tds'/':cr'-style shortcuts as well as full-episode URLs on
    # thedailyshow.com / colbertnation.com.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'

        # NOTE(review): the enclosing 'def suitable(url):' header is missing here.
        return (re.match(ComedyCentralIE._VALID_URL, url) is not None)
3057 def report_extraction(self, episode_id):
3058 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3060 def report_config_download(self, episode_id):
3061 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3063 def report_index_download(self, episode_id):
3064 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3066 def report_player_url(self, episode_id):
3067 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
    def _simplify_title(self, title):
        """Collapse *title* to ASCII letters/digits, replacing other runs with '_'."""
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        # Drop leading/trailing underscores left by the substitution.
        res = res.strip(ur'_')
# NOTE(review): garbled excerpt — `return`/`try:`/`else:` lines and other
# statements are elided, and each line keeps a stray numeric prefix.
# Comments only; code text left byte-identical.
3074 def _real_extract(self, url):
# Validate the URL against _VALID_URL (error path visible below).
3075 mobj = re.match(self._VALID_URL, url)
3077 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname form (":tds", ":colbert", ...): rewrite to the show's
# full-episodes index page and re-match so the named groups are populated.
3080 if mobj.group('shortname'):
3081 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3082 url = 'http://www.thedailyshow.com/full-episodes/'
3084 url = 'http://www.colbertnation.com/full-episodes/'
3085 mobj = re.match(self._VALID_URL, url)
3086 assert mobj is not None
# No episode group means "download the newest episode".
3088 dlNewest = not mobj.group('episode')
3090 epTitle = mobj.group('showname')
3092 epTitle = mobj.group('episode')
# Fetch the episode page (network errors reported via trouble()).
3094 req = urllib2.Request(url)
3095 self.report_extraction(epTitle)
3097 htmlHandle = urllib2.urlopen(req)
3098 html = htmlHandle.read()
3099 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3100 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The site may have redirected us; the final URL must still match and
# must now carry a concrete episode component.
3103 url = htmlHandle.geturl()
3104 mobj = re.match(self._VALID_URL, url)
3106 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3108 if mobj.group('episode') == '':
3109 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3111 epTitle = mobj.group('episode')
# Scrape the Flash <param name="movie"> URL; group 0 is the full player
# URL, group 1 the mgid-style URI embedded in it.
3113 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3114 if len(mMovieParams) == 0:
3115 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirects.
3118 playerUrl_raw = mMovieParams[0][0]
3119 self.report_player_url(epTitle)
3121 urlHandle = urllib2.urlopen(playerUrl_raw)
3122 playerUrl = urlHandle.geturl()
3123 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3124 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Download the MRSS index feed listing the episode's video segments.
3127 uri = mMovieParams[0][1]
3128 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3129 self.report_index_download(epTitle)
3131 indexXml = urllib2.urlopen(indexUrl).read()
3132 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3133 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per segment; the <guid> is a colon-separated media id whose
# last component is the short id and second-to-last names the show.
3136 idoc = xml.etree.ElementTree.fromstring(indexXml)
3137 itemEls = idoc.findall('.//item')
3138 for itemEl in itemEls:
3139 mediaId = itemEl.findall('./guid')[0].text
3140 shortMediaId = mediaId.split(':')[-1]
3141 showId = mediaId.split(':')[-2].replace('.com', '')
3142 officialTitle = itemEl.findall('./title')[0].text
3143 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment configuration feed gives the available renditions.
3145 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3146 urllib.urlencode({'uri': mediaId}))
3147 configReq = urllib2.Request(configUrl)
3148 self.report_config_download(epTitle)
3150 configXml = urllib2.urlopen(configReq).read()
3151 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3152 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Collect (bitrate, src) pairs; the accumulation into `turls` and the
# empty check are partly elided in this excerpt.
3155 cdoc = xml.etree.ElementTree.fromstring(configXml)
3157 for rendition in cdoc.findall('.//rendition'):
3158 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3162 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3165 # For now, just pick the highest bitrate
3166 format,video_url = turls[-1]
# Hand the assembled info dict to the downloader (several keys elided).
3168 self._downloader.increment_downloads()
3170 effTitle = showId + '-' + epTitle
3175 'upload_date': officialDate,
3177 'stitle': self._simplify_title(effTitle),
3181 'description': officialTitle,
3182 'player_url': playerUrl
3186 self._downloader.process_info(info)
3187 except UnavailableVideoError, err:
3188 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class PostProcessor(object):
	"""Base class for post-download processing steps.

	A PostProcessor registers with a downloader through the downloader's
	add_post_processor() method — the same "mutual registration" scheme
	used by InfoExtractor objects. After each successful download the
	downloader walks its chain of PostProcessors, calling run() on each,
	feeding every one the dictionary returned by its predecessor. The
	chain stops as soon as a run() returns None or the last processor
	has executed. run() may also raise a PostProcessingError, which the
	downloader takes into account.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this PP reports to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		*information* is an InfoExtractor-style dictionary carrying one
		extra key, 'filepath', which points at the downloaded file.
		Returning None stops the postprocessing chain; returning a
		dictionary (possibly with some fields changed) passes it on to
		the next PostProcessor. The default implementation does nothing
		and hands the dictionary through unchanged.
		"""
		return information
# NOTE(review): garbled excerpt — `try:`/`return` lines, `audio_codec = None`
# initialisation and any decorators on the two helpers are elided, and each
# line keeps a stray numeric prefix. Comments only; code text untouched.
3239 class FFmpegExtractAudioPP(PostProcessor):
3241 def __init__(self, downloader=None, preferredcodec=None):
3242 PostProcessor.__init__(self, downloader)
# 'best' means: keep the stream losslessly when it is aac/mp3, else transcode.
3243 if preferredcodec is None:
3244 preferredcodec = 'best'
3245 self._preferredcodec = preferredcodec
# Probe the file with ffprobe and parse its key=value stream dump to find
# the audio stream's codec name (error paths elided here).
3248 def get_audio_codec(path):
3250 cmd = ['ffprobe', '-show_streams', '--', path]
3251 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3252 output = handle.communicate()[0]
3253 if handle.wait() != 0:
3255 except (IOError, OSError):
# codec_name precedes codec_type in ffprobe's per-stream output, hence the
# "remember the last codec_name seen" approach.
3258 for line in output.split('\n'):
3259 if line.startswith('codec_name='):
3260 audio_codec = line.split('=')[1].strip()
3261 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to extract/convert audio; stdout/stderr are discarded.
3266 def run_ffmpeg(path, out_path, codec, more_opts):
3268 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3269 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3271 except (IOError, OSError):
# PostProcessor entry point: replace the downloaded video file with an
# audio-only file and update information['filepath'].
3274 def run(self, information):
3275 path = information['filepath']
3277 filecodec = self.get_audio_codec(path)
3278 if filecodec is None:
3279 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Lossless copy path: source already aac/mp3 and acceptable to the user.
3283 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3284 if filecodec == 'aac' or filecodec == 'mp3':
3285 # Lossless if possible
3287 extension = filecodec
3288 if filecodec == 'aac':
3289 more_opts = ['-f', 'adts']
3292 acodec = 'libmp3lame'
3294 more_opts = ['-ab', '128k']
3296 # We convert the audio (lossy)
3297 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3298 extension = self._preferredcodec
3299 more_opts = ['-ab', '128k']
3300 if self._preferredcodec == 'aac':
3301 more_opts += ['-f', 'adts']
# Write alongside the source, swapping the extension; then (elided) remove
# the original video file — failure to remove is only a warning.
3303 (prefix, ext) = os.path.splitext(path)
3304 new_path = prefix + '.' + extension
3305 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3306 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3309 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3314 except (IOError, OSError):
3315 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3318 information['filepath'] = new_path
# NOTE(review): garbled excerpt — the `try:`/close lines around the two I/O
# sections are elided. Comments only; code text untouched.
3322 def updateSelf(downloader, filename):
3323 ''' Update the program file with the latest version from the repository '''
3324 # Note: downloader only used for options
# Refuse to proceed if we cannot rewrite our own file.
3325 if not os.access(filename, os.W_OK):
3326 sys.exit('ERROR: no write permissions on %s' % filename)
3328 downloader.to_screen('Updating to latest version...')
# Fetch the current script from UPDATE_URL (module-level constant).
3332 urlh = urllib.urlopen(UPDATE_URL)
3333 newcontent = urlh.read()
3336 except (IOError, OSError), err:
3337 sys.exit('ERROR: unable to download latest version')
# Overwrite this script in binary mode with the downloaded content.
3340 outf = open(filename, 'wb')
3342 outf.write(newcontent)
3345 except (IOError, OSError), err:
3346 sys.exit('ERROR: unable to overwrite current version')
3348 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Nested helper (the enclosing parseOpts() def line is elided in this
# excerpt): custom optparse option formatter producing "-o, --option METAVAR".
3355 def _format_option_string(option):
3356 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Collect short then long form; separate with ", " only if both exist.
3360 if option._short_opts: opts.append(option._short_opts[0])
3361 if option._long_opts: opts.append(option._long_opts[0])
3362 if len(opts) > 1: opts.insert(1, ', ')
3364 if option.takes_value(): opts.append(' %s' % option.metavar)
3366 return "".join(opts)
# Nested helper: best-effort terminal width — $COLUMNS first, then
# `stty size` (the surrounding try/except and fallback return are elided).
3368 def _find_term_columns():
3369 columns = os.environ.get('COLUMNS', None)
3374 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3375 out,err = sp.communicate()
# `stty size` prints "rows cols"; take the second field.
3376 return int(out.split()[1])
# Body of parseOpts() (def line elided): build the optparse parser, declare
# all option groups, parse sys.argv, and return (parser, opts, args).
# NOTE(review): garbled excerpt with stray numeric line prefixes; comments
# only, code text untouched.
3382 max_help_position = 80
3384 # No need to wrap help messages if we're on a wide console
3385 columns = _find_term_columns()
3386 if columns: max_width = columns
# Plug the custom option-string formatter into the help formatter.
3388 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3389 fmt.format_option_strings = _format_option_string
3392 'version' : __version__,
3394 'usage' : '%prog [options] url [url...]',
3395 'conflict_handler' : 'resolve',
3398 parser = optparse.OptionParser(**kw)
# Option groups, mirrored in the add_option_group() calls at the bottom.
3401 general = optparse.OptionGroup(parser, 'General Options')
3402 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3403 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3404 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3405 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3406 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3408 general.add_option('-h', '--help',
3409 action='help', help='print this help text and exit')
3410 general.add_option('-v', '--version',
3411 action='version', help='print program version and exit')
3412 general.add_option('-U', '--update',
3413 action='store_true', dest='update_self', help='update this program to latest version')
3414 general.add_option('-i', '--ignore-errors',
3415 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3416 general.add_option('-r', '--rate-limit',
3417 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3418 general.add_option('-R', '--retries',
3419 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3420 general.add_option('--playlist-start',
3421 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3422 general.add_option('--playlist-end',
3423 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3424 general.add_option('--dump-user-agent',
3425 action='store_true', dest='dump_user_agent',
3426 help='display the current browser identification', default=False)
3428 authentication.add_option('-u', '--username',
3429 dest='username', metavar='USERNAME', help='account username')
3430 authentication.add_option('-p', '--password',
3431 dest='password', metavar='PASSWORD', help='account password')
3432 authentication.add_option('-n', '--netrc',
3433 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3436 video_format.add_option('-f', '--format',
3437 action='store', dest='format', metavar='FORMAT', help='video format code')
# '-1' is the sentinel for "all formats"; the main body keys off it when
# choosing an output template.
3438 video_format.add_option('--all-formats',
3439 action='store_const', dest='format', help='download all available video formats', const='-1')
3440 video_format.add_option('--max-quality',
3441 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3444 verbosity.add_option('-q', '--quiet',
3445 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3446 verbosity.add_option('-s', '--simulate',
3447 action='store_true', dest='simulate', help='do not download video', default=False)
# The get-* options imply quiet+simulate in the FileDownloader config below.
3448 verbosity.add_option('-g', '--get-url',
3449 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3450 verbosity.add_option('-e', '--get-title',
3451 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3452 verbosity.add_option('--get-thumbnail',
3453 action='store_true', dest='getthumbnail',
3454 help='simulate, quiet but print thumbnail URL', default=False)
3455 verbosity.add_option('--get-description',
3456 action='store_true', dest='getdescription',
3457 help='simulate, quiet but print video description', default=False)
3458 verbosity.add_option('--get-filename',
3459 action='store_true', dest='getfilename',
3460 help='simulate, quiet but print output filename', default=False)
3461 verbosity.add_option('--no-progress',
3462 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3463 verbosity.add_option('--console-title',
3464 action='store_true', dest='consoletitle',
3465 help='display progress in console titlebar', default=False)
3468 filesystem.add_option('-t', '--title',
3469 action='store_true', dest='usetitle', help='use title in file name', default=False)
3470 filesystem.add_option('-l', '--literal',
3471 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3472 filesystem.add_option('-A', '--auto-number',
3473 action='store_true', dest='autonumber',
3474 help='number downloaded files starting from 00000', default=False)
3475 filesystem.add_option('-o', '--output',
3476 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3477 filesystem.add_option('-a', '--batch-file',
3478 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3479 filesystem.add_option('-w', '--no-overwrites',
3480 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3481 filesystem.add_option('-c', '--continue',
3482 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3483 filesystem.add_option('--cookies',
3484 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3485 filesystem.add_option('--no-part',
3486 action='store_true', dest='nopart', help='do not use .part files', default=False)
# store_false + default=True: mtime updating is on unless --no-mtime given.
3487 filesystem.add_option('--no-mtime',
3488 action='store_false', dest='updatetime',
3489 help='do not use the Last-modified header to set the file modification time', default=True)
3490 filesystem.add_option('--write-description',
3491 action='store_true', dest='writedescription',
3492 help='write video description to a .description file', default=False)
3493 filesystem.add_option('--write-info-json',
3494 action='store_true', dest='writeinfojson',
3495 help='write video metadata to a .info.json file', default=False)
3498 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3499 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3500 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3501 help='"best", "aac" or "mp3"; best by default')
3504 parser.add_option_group(general)
3505 parser.add_option_group(filesystem)
3506 parser.add_option_group(verbosity)
3507 parser.add_option_group(video_format)
3508 parser.add_option_group(authentication)
3509 parser.add_option_group(postproc)
3511 opts, args = parser.parse_args()
3513 return parser, opts, args
# Main program body (its enclosing def line is elided in this excerpt):
# parse options, validate them, wire up all InfoExtractors and the
# FileDownloader, run the downloads, and persist cookies.
# NOTE(review): garbled excerpt with stray numeric line prefixes and elided
# `try:`/`else:`/`sys.exit` lines; comments only, code text untouched.
3516 parser, opts, args = parseOpts()
3518 # Open appropriate CookieJar
3519 if opts.cookiefile is None:
3520 jar = cookielib.CookieJar()
# With --cookies, load the Mozilla-format jar if it already exists and is
# readable (the load call and error exit are partly elided).
3523 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3524 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3526 except (IOError, OSError), err:
3527 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent prints the UA from the module-level std_headers dict.
3530 if opts.dump_user_agent:
3531 print std_headers['User-Agent']
3534 # General configuration
3535 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3536 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3537 urllib2.install_opener(opener)
3538 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3540 # Batch file verification
3542 if opts.batchfile is not None:
3544 if opts.batchfile == '-':
3547 batchfd = open(opts.batchfile, 'r')
3548 batchurls = batchfd.readlines()
3549 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and #, /, ; comment lines in the batch file.
3550 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3552 sys.exit(u'ERROR: batch file could not be read')
3553 all_urls = batchurls + args
3555 # Conflicting, missing and erroneous options
3556 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3557 parser.error(u'using .netrc conflicts with giving username/password')
3558 if opts.password is not None and opts.username is None:
3559 parser.error(u'account username missing')
3560 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3561 parser.error(u'using output template conflicts with using title, literal title or auto number')
3562 if opts.usetitle and opts.useliteral:
3563 parser.error(u'using title conflicts with using literal title')
# Prompt interactively when only a username was supplied.
3564 if opts.username is not None and opts.password is None:
3565 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalise string options into numbers, rejecting bad values early.
3566 if opts.ratelimit is not None:
3567 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3568 if numeric_limit is None:
3569 parser.error(u'invalid rate limit specified')
3570 opts.ratelimit = numeric_limit
3571 if opts.retries is not None:
3573 opts.retries = long(opts.retries)
3574 except (TypeError, ValueError), err:
3575 parser.error(u'invalid retry count specified')
3577 opts.playliststart = int(opts.playliststart)
3578 if opts.playliststart <= 0:
3579 raise ValueError(u'Playlist start must be positive')
3580 except (TypeError, ValueError), err:
3581 parser.error(u'invalid playlist start number specified')
3583 opts.playlistend = int(opts.playlistend)
# -1 means "until the last video"; otherwise the end must follow the start.
3584 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3585 raise ValueError(u'Playlist end must be greater than playlist start')
3586 except (TypeError, ValueError), err:
3587 parser.error(u'invalid playlist end number specified')
3588 if opts.extractaudio:
3589 if opts.audioformat not in ['best', 'aac', 'mp3']:
3590 parser.error(u'invalid audio format specified')
3592 # Information extractors
# Several IEs share the YouTube/Google/Yahoo IE instance they delegate to.
3593 youtube_ie = YoutubeIE()
3594 metacafe_ie = MetacafeIE(youtube_ie)
3595 dailymotion_ie = DailymotionIE()
3596 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3597 youtube_user_ie = YoutubeUserIE(youtube_ie)
3598 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3599 google_ie = GoogleIE()
3600 google_search_ie = GoogleSearchIE(google_ie)
3601 photobucket_ie = PhotobucketIE()
3602 yahoo_ie = YahooIE()
3603 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3604 deposit_files_ie = DepositFilesIE()
3605 facebook_ie = FacebookIE()
3606 bliptv_ie = BlipTVIE()
3607 vimeo_ie = VimeoIE()
3608 myvideo_ie = MyVideoIE()
3609 comedycentral_ie = ComedyCentralIE()
3611 generic_ie = GenericIE()
# Build the downloader from the validated options.
3614 fd = FileDownloader({
3615 'usenetrc': opts.usenetrc,
3616 'username': opts.username,
3617 'password': opts.password,
# Any of the get-* options forces quiet + simulate.
3618 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3619 'forceurl': opts.geturl,
3620 'forcetitle': opts.gettitle,
3621 'forcethumbnail': opts.getthumbnail,
3622 'forcedescription': opts.getdescription,
3623 'forcefilename': opts.getfilename,
3624 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3625 'format': opts.format,
3626 'format_limit': opts.format_limit,
# First truthy template wins: explicit -o, then the --all-formats ('-1')
# variants, then title/literal/autonumber combinations, then plain id.
3627 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3628 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3629 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3630 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3631 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3632 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3633 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3634 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3635 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3636 or u'%(id)s.%(ext)s'),
3637 'ignoreerrors': opts.ignoreerrors,
3638 'ratelimit': opts.ratelimit,
3639 'nooverwrites': opts.nooverwrites,
3640 'retries': opts.retries,
3641 'continuedl': opts.continue_dl,
3642 'noprogress': opts.noprogress,
3643 'playliststart': opts.playliststart,
3644 'playlistend': opts.playlistend,
# Writing the video to stdout (-o -) forces logging to stderr.
3645 'logtostderr': opts.outtmpl == '-',
3646 'consoletitle': opts.consoletitle,
3647 'nopart': opts.nopart,
3648 'updatetime': opts.updatetime,
3649 'writedescription': opts.writedescription,
3650 'writeinfojson': opts.writeinfojson,
# Registration order matters: more specific IEs (search, playlist, user)
# come before the plain ones they would otherwise shadow.
3652 fd.add_info_extractor(youtube_search_ie)
3653 fd.add_info_extractor(youtube_pl_ie)
3654 fd.add_info_extractor(youtube_user_ie)
3655 fd.add_info_extractor(metacafe_ie)
3656 fd.add_info_extractor(dailymotion_ie)
3657 fd.add_info_extractor(youtube_ie)
3658 fd.add_info_extractor(google_ie)
3659 fd.add_info_extractor(google_search_ie)
3660 fd.add_info_extractor(photobucket_ie)
3661 fd.add_info_extractor(yahoo_ie)
3662 fd.add_info_extractor(yahoo_search_ie)
3663 fd.add_info_extractor(deposit_files_ie)
3664 fd.add_info_extractor(facebook_ie)
3665 fd.add_info_extractor(bliptv_ie)
3666 fd.add_info_extractor(vimeo_ie)
3667 fd.add_info_extractor(myvideo_ie)
3668 fd.add_info_extractor(comedycentral_ie)
3670 # This must come last since it's the
3671 # fallback if none of the others work
3672 fd.add_info_extractor(generic_ie)
3675 if opts.extractaudio:
3676 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# -U replaces this very script (sys.argv[0]) with the latest release.
3679 if opts.update_self:
3680 updateSelf(fd, sys.argv[0])
3683 if len(all_urls) < 1:
3684 if not opts.update_self:
3685 parser.error(u'you must provide at least one URL')
3688 retcode = fd.download(all_urls)
3690 # Dump cookie jar if requested
3691 if opts.cookiefile is not None:
3694 except (IOError, OSError), err:
3695 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run the main body and translate the known exceptions
# into exit messages (the `try:` line and the main-body call are elided in
# this excerpt; code text untouched).
3700 if __name__ == '__main__':
# DownloadError was already reported by the downloader; exit quietly (the
# handler body is elided here).
3703 except DownloadError:
3705 except SameFileError:
3706 sys.exit(u'ERROR: fixed output name but more than one file to download')
3707 except KeyboardInterrupt:
3708 sys.exit(u'\nERROR: Interrupted by user')
3710 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: