youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 __author__  = (
   5         'Ricardo Garcia Gonzalez',
   6         'Danny Colligan',
   7         'Benjamin Johnson',
   8         'Vasyl\' Vavrychuk',
   9         'Witold Baryluk',
  10         'Paweł Paprota',
  11         'Gergely Imreh',
  12         'Rogério Brito',
  13         'Philipp Hagemeister',
  14         'Sören Schulze',
  15         )
  16
  17 __license__ = 'Public Domain'
  18 __version__ = '2011.09.18'
  19
  20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
  21
  22 import cookielib
  23 import datetime
  24 import gzip
  25 import htmlentitydefs
  26 import HTMLParser
  27 import httplib
  28 import locale
  29 import math
  30 import netrc
  31 import os
  32 import os.path
  33 import re
  34 import socket
  35 import string
  36 import subprocess
  37 import sys
  38 import time
  39 import urllib
  40 import urllib2
  41 import warnings
  42 import zlib
  43
  44 if os.name == 'nt':
  45         import ctypes
  46
  47 try:
  48         import email.utils
  49 except ImportError: # Python 2.4
  50         import email.Utils
  51 try:
  52         import cStringIO as StringIO
  53 except ImportError:
  54         import StringIO
  55
  56 # parse_qs was moved from the cgi module to the urlparse module recently.
  57 try:
  58         from urlparse import parse_qs
  59 except ImportError:
  60         from cgi import parse_qs
  61
  62 try:
  63         import lxml.etree
  64 except ImportError:
  65         pass # Handled below
  66
  67 try:
  68         import xml.etree.ElementTree
  69 except ImportError: # Python<2.5: Not officially supported, but let it slip
  70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
  71
  72 std_headers = {
  73         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
  74         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  75         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  76         'Accept-Encoding': 'gzip, deflate',
  77         'Accept-Language': 'en-us,en;q=0.5',
  78 }
  79
  80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  81
  82 try:
  83         import json
  84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
  85         import re
  86         class json(object):
  87                 @staticmethod
  88                 def loads(s):
  89                         s = s.decode('UTF-8')
  90                         def raiseError(msg, i):
  91                                 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
  92                         def skipSpace(i, expectMore=True):
  93                                 while i < len(s) and s[i] in ' \t\r\n':
  94                                         i += 1
  95                                 if expectMore:
  96                                         if i >= len(s):
  97                                                 raiseError('Premature end', i)
  98                                 return i
  99                         def decodeEscape(match):
 100                                 esc = match.group(1)
 101                                 _STATIC = {
 102                                         '"': '"',
 103                                         '\\': '\\',
 104                                         '/': '/',
 105                                         'b': unichr(0x8),
 106                                         'f': unichr(0xc),
 107                                         'n': '\n',
 108                                         'r': '\r',
 109                                         't': '\t',
 110                                 }
 111                                 if esc in _STATIC:
 112                                         return _STATIC[esc]
 113                                 if esc[0] == 'u':
 114                                         if len(esc) == 1+4:
 115                                                 return unichr(int(esc[1:5], 16))
 116                                         if len(esc) == 5+6 and esc[5:7] == '\\u':
 117                                                 hi = int(esc[1:5], 16)
 118                                                 low = int(esc[7:11], 16)
 119                                                 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
 120                                 raise ValueError('Unknown escape ' + str(esc))
 121                         def parseString(i):
 122                                 i += 1
 123                                 e = i
 124                                 while True:
 125                                         e = s.index('"', e)
 126                                         bslashes = 0
 127                                         while s[e-bslashes-1] == '\\':
 128                                                 bslashes += 1
 129                                         if bslashes % 2 == 1:
 130                                                 e += 1
 131                                                 continue
 132                                         break
 133                                 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
 134                                 stri = rexp.sub(decodeEscape, s[i:e])
 135                                 return (e+1,stri)
 136                         def parseObj(i):
 137                                 i += 1
 138                                 res = {}
 139                                 i = skipSpace(i)
 140                                 if s[i] == '}': # Empty dictionary
 141                                         return (i+1,res)
 142                                 while True:
 143                                         if s[i] != '"':
 144                                                 raiseError('Expected a string object key', i)
 145                                         i,key = parseString(i)
 146                                         i = skipSpace(i)
 147                                         if i >= len(s) or s[i] != ':':
 148                                                 raiseError('Expected a colon', i)
 149                                         i,val = parse(i+1)
 150                                         res[key] = val
 151                                         i = skipSpace(i)
 152                                         if s[i] == '}':
 153                                                 return (i+1, res)
 154                                         if s[i] != ',':
 155                                                 raiseError('Expected comma or closing curly brace', i)
 156                                         i = skipSpace(i+1)
 157                         def parseArray(i):
 158                                 res = []
 159                                 i = skipSpace(i+1)
 160                                 if s[i] == ']': # Empty array
 161                                         return (i+1,res)
 162                                 while True:
 163                                         i,val = parse(i)
 164                                         res.append(val)
 165                                         i = skipSpace(i) # Raise exception if premature end
 166                                         if s[i] == ']':
 167                                                 return (i+1, res)
 168                                         if s[i] != ',':
 169                                                 raiseError('Expected a comma or closing bracket', i)
 170                                         i = skipSpace(i+1)
 171                         def parseDiscrete(i):
 172                                 for k,v in {'true': True, 'false': False, 'null': None}.items():
 173                                         if s.startswith(k, i):
 174                                                 return (i+len(k), v)
 175                                 raiseError('Not a boolean (or null)', i)
 176                         def parseNumber(i):
 177                                 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
 178                                 if mobj is None:
 179                                         raiseError('Not a number', i)
 180                                 nums = mobj.group(1)
 181                                 if '.' in nums or 'e' in nums or 'E' in nums:
 182                                         return (i+len(nums), float(nums))
 183                                 return (i+len(nums), int(nums))
 184                         CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
 185                         def parse(i):
 186                                 i = skipSpace(i)
 187                                 i,res = CHARMAP.get(s[i], parseNumber)(i)
 188                                 i = skipSpace(i, False)
 189                                 return (i,res)
 190                         i,res = parse(0)
 191                         if i < len(s):
 192                                 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 193                         return res
 194
 195 def preferredencoding():
 196         """Get preferred encoding.
 197
 198         Returns the best encoding scheme for the system, based on
 199         locale.getpreferredencoding() and some further tweaks.
 200         """
 201         def yield_preferredencoding():
 202                 try:
 203                         pref = locale.getpreferredencoding()
 204                         u'TEST'.encode(pref)
 205                 except:
 206                         pref = 'UTF-8'
 207                 while True:
 208                         yield pref
 209         return yield_preferredencoding().next()
 210
 211
 212 def htmlentity_transform(matchobj):
 213         """Transforms an HTML entity to a Unicode character.
 214
 215         This function receives a match object and is intended to be used with
 216         the re.sub() function.
 217         """
 218         entity = matchobj.group(1)
 219
 220         # Known non-numeric HTML entity
 221         if entity in htmlentitydefs.name2codepoint:
 222                 return unichr(htmlentitydefs.name2codepoint[entity])
 223
 224         # Unicode character
 225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 226         if mobj is not None:
 227                 numstr = mobj.group(1)
 228                 if numstr.startswith(u'x'):
 229                         base = 16
 230                         numstr = u'0%s' % numstr
 231                 else:
 232                         base = 10
 233                 return unichr(long(numstr, base))
 234
 235         # Unknown entity in name, return its literal representation
 236         return (u'&%s;' % entity)
 237
 238
 239 def sanitize_title(utitle):
 240         """Sanitizes a video title so it could be used as part of a filename."""
 241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 242         return utitle.replace(unicode(os.sep), u'%')
 243
 244
 245 def sanitize_open(filename, open_mode):
 246         """Try to open the given filename, and slightly tweak it if this fails.
 247
 248         Attempts to open the given filename. If this fails, it tries to change
 249         the filename slightly, step by step, until it's either able to open it
 250         or it fails and raises a final exception, like the standard open()
 251         function.
 252
 253         It returns the tuple (stream, definitive_file_name).
 254         """
 255         try:
 256                 if filename == u'-':
 257                         if sys.platform == 'win32':
 258                                 import msvcrt
 259                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 260                         return (sys.stdout, filename)
 261                 stream = open(filename, open_mode)
 262                 return (stream, filename)
 263         except (IOError, OSError), err:
 264                 # In case of error, try to remove win32 forbidden chars
 265                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 266
 267                 # An exception here should be caught in the caller
 268                 stream = open(filename, open_mode)
 269                 return (stream, filename)
 270
 271
 272 def timeconvert(timestr):
 273         """Convert RFC 2822 defined time string into system timestamp"""
 274         timestamp = None
 275         timetuple = email.utils.parsedate_tz(timestr)
 276         if timetuple is not None:
 277                 timestamp = email.utils.mktime_tz(timetuple)
 278         return timestamp
 279
 280
 281 class DownloadError(Exception):
 282         """Download Error exception.
 283
 284         This exception may be thrown by FileDownloader objects if they are not
 285         configured to continue on errors. They will contain the appropriate
 286         error message.
 287         """
 288         pass
 289
 290
 291 class SameFileError(Exception):
 292         """Same File exception.
 293
 294         This exception will be thrown by FileDownloader objects if they detect
 295         multiple files would have to be downloaded to the same file on disk.
 296         """
 297         pass
 298
 299
 300 class PostProcessingError(Exception):
 301         """Post Processing exception.
 302
 303         This exception may be raised by PostProcessor's .run() method to
 304         indicate an error in the postprocessing task.
 305         """
 306         pass
 307
 308
 309 class UnavailableVideoError(Exception):
 310         """Unavailable Format exception.
 311
 312         This exception will be thrown when a video is requested
 313         in a format that is not available for that video.
 314         """
 315         pass
 316
 317
 318 class ContentTooShortError(Exception):
 319         """Content Too Short exception.
 320
 321         This exception may be raised by FileDownloader objects when a file they
 322         download is too small for what the server announced first, indicating
 323         the connection was probably interrupted.
 324         """
 325         # Both in bytes
 326         downloaded = None
 327         expected = None
 328
 329         def __init__(self, downloaded, expected):
 330                 self.downloaded = downloaded
 331                 self.expected = expected
 332
 333
 334 class YoutubeDLHandler(urllib2.HTTPHandler):
 335         """Handler for HTTP requests and responses.
 336
 337         This class, when installed with an OpenerDirector, automatically adds
 338         the standard headers to every HTTP request and handles gzipped and
 339         deflated responses from web servers. If compression is to be avoided in
 340         a particular request, the original request in the program code only has
 341         to include the HTTP header "Youtubedl-No-Compression", which will be
 342         removed before making the real request.
 343
 344         Part of this code was copied from:
 345
 346         http://techknack.net/python-urllib2-handlers/
 347
 348         Andrew Rowls, the author of that code, agreed to release it to the
 349         public domain.
 350         """
 351
 352         @staticmethod
 353         def deflate(data):
 354                 try:
 355                         return zlib.decompress(data, -zlib.MAX_WBITS)
 356                 except zlib.error:
 357                         return zlib.decompress(data)
 358
 359         @staticmethod
 360         def addinfourl_wrapper(stream, headers, url, code):
 361                 if hasattr(urllib2.addinfourl, 'getcode'):
 362                         return urllib2.addinfourl(stream, headers, url, code)
 363                 ret = urllib2.addinfourl(stream, headers, url)
 364                 ret.code = code
 365                 return ret
 366
 367         def http_request(self, req):
 368                 for h in std_headers:
 369                         if h in req.headers:
 370                                 del req.headers[h]
 371                         req.add_header(h, std_headers[h])
 372                 if 'Youtubedl-no-compression' in req.headers:
 373                         if 'Accept-encoding' in req.headers:
 374                                 del req.headers['Accept-encoding']
 375                         del req.headers['Youtubedl-no-compression']
 376                 return req
 377
 378         def http_response(self, req, resp):
 379                 old_resp = resp
 380                 # gzip
 381                 if resp.headers.get('Content-encoding', '') == 'gzip':
 382                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 383                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 384                         resp.msg = old_resp.msg
 385                 # deflate
 386                 if resp.headers.get('Content-encoding', '') == 'deflate':
 387                         gz = StringIO.StringIO(self.deflate(resp.read()))
 388                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 389                         resp.msg = old_resp.msg
 390                 return resp
 391
 392
 393 class FileDownloader(object):
 394         """File Downloader class.
 395
 396         File downloader objects are the ones responsible of downloading the
 397         actual video file and writing it to disk if the user has requested
 398         it, among some other tasks. In most cases there should be one per
 399         program. As, given a video URL, the downloader doesn't know how to
 400         extract all the needed information, task that InfoExtractors do, it
 401         has to pass the URL to one of them.
 402
 403         For this, file downloader objects have a method that allows
 404         InfoExtractors to be registered in a given order. When it is passed
 405         a URL, the file downloader handles it to the first InfoExtractor it
 406         finds that reports being able to handle it. The InfoExtractor extracts
 407         all the information about the video or videos the URL refers to, and
 408         asks the FileDownloader to process the video information, possibly
 409         downloading the video.
 410
 411         File downloaders accept a lot of parameters. In order not to saturate
 412         the object constructor with arguments, it receives a dictionary of
 413         options instead. These options are available through the params
 414         attribute for the InfoExtractors to use. The FileDownloader also
 415         registers itself as the downloader in charge for the InfoExtractors
 416         that are added to it, so this is a "mutual registration".
 417
 418         Available options:
 419
 420         username:         Username for authentication purposes.
 421         password:         Password for authentication purposes.
 422         usenetrc:         Use netrc for authentication instead.
 423         quiet:            Do not print messages to stdout.
 424         forceurl:         Force printing final URL.
 425         forcetitle:       Force printing title.
 426         forcethumbnail:   Force printing thumbnail URL.
 427         forcedescription: Force printing description.
 428         forcefilename:    Force printing final filename.
 429         simulate:         Do not download the video files.
 430         format:           Video format code.
 431         format_limit:     Highest quality format to try.
 432         outtmpl:          Template for output names.
 433         ignoreerrors:     Do not stop on download errors.
 434         ratelimit:        Download speed limit, in bytes/sec.
 435         nooverwrites:     Prevent overwriting files.
 436         retries:          Number of times to retry for HTTP error 5xx
 437         continuedl:       Try to continue downloads if possible.
 438         noprogress:       Do not print the progress bar.
 439         playliststart:    Playlist item to start at.
 440         playlistend:      Playlist item to end at.
 441         matchtitle:       Download only matching titles.
 442         rejecttitle:      Reject downloads for matching titles.
 443         logtostderr:      Log messages to stderr instead of stdout.
 444         consoletitle:     Display progress in console window's titlebar.
 445         nopart:           Do not use temporary .part files.
 446         updatetime:       Use the Last-modified header to set output file timestamps.
 447         writedescription: Write the video description to a .description file
 448         writeinfojson:    Write the video description to a .info.json file
 449         """
 450
 451         params = None
 452         _ies = []
 453         _pps = []
 454         _download_retcode = None
 455         _num_downloads = None
 456         _screen_file = None
 457
 458         def __init__(self, params):
 459                 """Create a FileDownloader object with the given options."""
 460                 self._ies = []
 461                 self._pps = []
 462                 self._download_retcode = 0
 463                 self._num_downloads = 0
 464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 465                 self.params = params
 466
 467         @staticmethod
 468         def format_bytes(bytes):
 469                 if bytes is None:
 470                         return 'N/A'
 471                 if type(bytes) is str:
 472                         bytes = float(bytes)
 473                 if bytes == 0.0:
 474                         exponent = 0
 475                 else:
 476                         exponent = long(math.log(bytes, 1024.0))
 477                 suffix = 'bkMGTPEZY'[exponent]
 478                 converted = float(bytes) / float(1024 ** exponent)
 479                 return '%.2f%s' % (converted, suffix)
 480
 481         @staticmethod
 482         def calc_percent(byte_counter, data_len):
 483                 if data_len is None:
 484                         return '---.-%'
 485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 486
 487         @staticmethod
 488         def calc_eta(start, now, total, current):
 489                 if total is None:
 490                         return '--:--'
 491                 dif = now - start
 492                 if current == 0 or dif < 0.001: # One millisecond
 493                         return '--:--'
 494                 rate = float(current) / dif
 495                 eta = long((float(total) - float(current)) / rate)
 496                 (eta_mins, eta_secs) = divmod(eta, 60)
 497                 if eta_mins > 99:
 498                         return '--:--'
 499                 return '%02d:%02d' % (eta_mins, eta_secs)
 500
 501         @staticmethod
 502         def calc_speed(start, now, bytes):
 503                 dif = now - start
 504                 if bytes == 0 or dif < 0.001: # One millisecond
 505                         return '%10s' % '---b/s'
 506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 507
 508         @staticmethod
 509         def best_block_size(elapsed_time, bytes):
 510                 new_min = max(bytes / 2.0, 1.0)
 511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 512                 if elapsed_time < 0.001:
 513                         return long(new_max)
 514                 rate = bytes / elapsed_time
 515                 if rate > new_max:
 516                         return long(new_max)
 517                 if rate < new_min:
 518                         return long(new_min)
 519                 return long(rate)
 520
 521         @staticmethod
 522         def parse_bytes(bytestr):
 523                 """Parse a string indicating a byte quantity into a long integer."""
 524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 525                 if matchobj is None:
 526                         return None
 527                 number = float(matchobj.group(1))
 528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 529                 return long(round(number * multiplier))
 530
 531         def add_info_extractor(self, ie):
 532                 """Add an InfoExtractor object to the end of the list."""
 533                 self._ies.append(ie)
 534                 ie.set_downloader(self)
 535
 536         def add_post_processor(self, pp):
 537                 """Add a PostProcessor object to the end of the chain."""
 538                 self._pps.append(pp)
 539                 pp.set_downloader(self)
 540
 541         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 542                 """Print message to stdout if not in quiet mode."""
 543                 try:
 544                         if not self.params.get('quiet', False):
 545                                 terminator = [u'\n', u''][skip_eol]
 546                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 547                         self._screen_file.flush()
 548                 except (UnicodeEncodeError), err:
 549                         if not ignore_encoding_errors:
 550                                 raise
 551
 552         def to_stderr(self, message):
 553                 """Print message to stderr."""
 554                 print >>sys.stderr, message.encode(preferredencoding())
 555
 556         def to_cons_title(self, message):
 557                 """Set console/terminal window title to message."""
 558                 if not self.params.get('consoletitle', False):
 559                         return
 560                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 561                         # c_wchar_p() might not be necessary if `message` is
 562                         # already of type unicode()
 563                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 564                 elif 'TERM' in os.environ:
 565                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 566
 567         def fixed_template(self):
 568                 """Checks if the output template is fixed."""
 569                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 570
 571         def trouble(self, message=None):
 572                 """Determine action to take when a download problem appears.
 573
 574                 Depending on if the downloader has been configured to ignore
 575                 download errors or not, this method may throw an exception or
 576                 not when errors are found, after printing the message.
 577                 """
 578                 if message is not None:
 579                         self.to_stderr(message)
 580                 if not self.params.get('ignoreerrors', False):
 581                         raise DownloadError(message)
 582                 self._download_retcode = 1
 583
 584         def slow_down(self, start_time, byte_counter):
 585                 """Sleep if the download speed is over the rate limit."""
 586                 rate_limit = self.params.get('ratelimit', None)
 587                 if rate_limit is None or byte_counter == 0:
 588                         return
 589                 now = time.time()
 590                 elapsed = now - start_time
 591                 if elapsed <= 0.0:
 592                         return
 593                 speed = float(byte_counter) / elapsed
 594                 if speed > rate_limit:
 595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 596
 597         def temp_name(self, filename):
 598                 """Returns a temporary filename for the given filename."""
 599                 if self.params.get('nopart', False) or filename == u'-' or \
 600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 601                         return filename
 602                 return filename + u'.part'
 603
 604         def undo_temp_name(self, filename):
 605                 if filename.endswith(u'.part'):
 606                         return filename[:-len(u'.part')]
 607                 return filename
 608
 609         def try_rename(self, old_filename, new_filename):
 610                 try:
 611                         if old_filename == new_filename:
 612                                 return
 613                         os.rename(old_filename, new_filename)
 614                 except (IOError, OSError), err:
 615                         self.trouble(u'ERROR: unable to rename file')
 616
 617         def try_utime(self, filename, last_modified_hdr):
 618                 """Try to set the last-modified time of the given file."""
 619                 if last_modified_hdr is None:
 620                         return
 621                 if not os.path.isfile(filename):
 622                         return
 623                 timestr = last_modified_hdr
 624                 if timestr is None:
 625                         return
 626                 filetime = timeconvert(timestr)
 627                 if filetime is None:
 628                         return filetime
 629                 try:
 630                         os.utime(filename, (time.time(), filetime))
 631                 except:
 632                         pass
 633                 return filetime
 634
 635         def report_writedescription(self, descfn):
 636                 """ Report that the description file is being written """
 637                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
 638
 639         def report_writeinfojson(self, infofn):
 640                 """ Report that the metadata file has been written """
 641                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
 642
 643         def report_destination(self, filename):
 644                 """Report destination filename."""
 645                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 646
 647         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 648                 """Report download progress."""
 649                 if self.params.get('noprogress', False):
 650                         return
 651                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 652                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 653                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 654                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 655
 656         def report_resuming_byte(self, resume_len):
 657                 """Report attempt to resume at given byte."""
 658                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 659
 660         def report_retry(self, count, retries):
 661                 """Report retry in case of HTTP error 5xx"""
 662                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 663
 664         def report_file_already_downloaded(self, file_name):
 665                 """Report file has already been fully downloaded."""
 666                 try:
 667                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 668                 except (UnicodeEncodeError), err:
 669                         self.to_screen(u'[download] The file has already been downloaded')
 670
 671         def report_unable_to_resume(self):
 672                 """Report it was impossible to resume download."""
 673                 self.to_screen(u'[download] Unable to resume')
 674
 675         def report_finish(self):
 676                 """Report download finished."""
 677                 if self.params.get('noprogress', False):
 678                         self.to_screen(u'[download] Download completed')
 679                 else:
 680                         self.to_screen(u'')
 681
 682         def increment_downloads(self):
 683                 """Increment the ordinal that assigns a number to each file."""
 684                 self._num_downloads += 1
 685
 686         def prepare_filename(self, info_dict):
 687                 """Generate the output filename."""
 688                 try:
 689                         template_dict = dict(info_dict)
 690                         template_dict['epoch'] = unicode(long(time.time()))
 691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 692                         filename = self.params['outtmpl'] % template_dict
 693                         return filename
 694                 except (ValueError, KeyError), err:
 695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 696                         return None
 697
 698         def process_info(self, info_dict):
 699                 """Process a single dictionary returned by an InfoExtractor."""
 700                 filename = self.prepare_filename(info_dict)
 701
 702                 # Forced printings
 703                 if self.params.get('forcetitle', False):
 704                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 705                 if self.params.get('forceurl', False):
 706                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 707                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 708                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 709                 if self.params.get('forcedescription', False) and 'description' in info_dict:
 710                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 711                 if self.params.get('forcefilename', False) and filename is not None:
 712                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 713                 if self.params.get('forceformat', False):
 714                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
 715
 716                 # Do nothing else if in simulate mode
 717                 if self.params.get('simulate', False):
 718                         return
 719
 720                 if filename is None:
 721                         return
 722
 723                 matchtitle=self.params.get('matchtitle',False)
 724                 rejecttitle=self.params.get('rejecttitle',False)
 725                 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 726                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
 727                         self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
 728                         return
 729                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
 730                         self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
 731                         return
 732
 733                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 734                         self.to_stderr(u'WARNING: file exists and will be skipped')
 735                         return
 736
 737                 try:
 738                         dn = os.path.dirname(filename)
 739                         if dn != '' and not os.path.exists(dn):
 740                                 os.makedirs(dn)
 741                 except (OSError, IOError), err:
 742                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
 743                         return
 744
 745                 if self.params.get('writedescription', False):
 746                         try:
 747                                 descfn = filename + '.description'
 748                                 self.report_writedescription(descfn)
 749                                 descfile = open(descfn, 'wb')
 750                                 try:
 751                                         descfile.write(info_dict['description'].encode('utf-8'))
 752                                 finally:
 753                                         descfile.close()
 754                         except (OSError, IOError):
 755                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
 756                                 return
 757
 758                 if self.params.get('writeinfojson', False):
 759                         infofn = filename + '.info.json'
 760                         self.report_writeinfojson(infofn)
 761                         try:
 762                                 json.dump
 763                         except (NameError,AttributeError):
 764                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 765                                 return
 766                         try:
 767                                 infof = open(infofn, 'wb')
 768                                 try:
 769                                         json.dump(info_dict, infof)
 770                                 finally:
 771                                         infof.close()
 772                         except (OSError, IOError):
 773                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
 774                                 return
 775
 776                 if not self.params.get('skip_download', False):
 777                         try:
 778                                 success = self._do_download(filename, info_dict)
 779                         except (OSError, IOError), err:
 780                                 raise UnavailableVideoError
 781                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 782                                 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 783                                 return
 784                         except (ContentTooShortError, ), err:
 785                                 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 786                                 return
 787
 788                         if success:
 789                                 try:
 790                                         self.post_process(filename, info_dict)
 791                                 except (PostProcessingError), err:
 792                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
 793                                         return
 794
 795         def download(self, url_list):
 796                 """Download a given list of URLs."""
 797                 if len(url_list) > 1 and self.fixed_template():
 798                         raise SameFileError(self.params['outtmpl'])
 799
 800                 for url in url_list:
 801                         suitable_found = False
 802                         for ie in self._ies:
 803                                 # Go to next InfoExtractor if not suitable
 804                                 if not ie.suitable(url):
 805                                         continue
 806
 807                                 # Suitable InfoExtractor found
 808                                 suitable_found = True
 809
 810                                 # Extract information from URL and process it
 811                                 ie.extract(url)
 812
 813                                 # Suitable InfoExtractor had been found; go to next URL
 814                                 break
 815
 816                         if not suitable_found:
 817                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 818
 819                 return self._download_retcode
 820
 821         def post_process(self, filename, ie_info):
 822                 """Run the postprocessing chain on the given file."""
 823                 info = dict(ie_info)
 824                 info['filepath'] = filename
 825                 for pp in self._pps:
 826                         info = pp.run(info)
 827                         if info is None:
 828                                 break
 829
 830         def _download_with_rtmpdump(self, filename, url, player_url):
 831                 self.report_destination(filename)
 832                 tmpfilename = self.temp_name(filename)
 833
 834                 # Check for rtmpdump first
 835                 try:
 836                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 837                 except (OSError, IOError):
 838                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 839                         return False
 840
 841                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 842                 # the connection was interrumpted and resuming appears to be
 843                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 844                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 845                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 846                 while retval == 2 or retval == 1:
 847                         prevsize = os.path.getsize(tmpfilename)
 848                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 849                         time.sleep(5.0) # This seems to be needed
 850                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 851                         cursize = os.path.getsize(tmpfilename)
 852                         if prevsize == cursize and retval == 1:
 853                                 break
 854                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
 855                         if prevsize == cursize and retval == 2 and cursize > 1024:
 856                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
 857                                 retval = 0
 858                                 break
 859                 if retval == 0:
 860                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 861                         self.try_rename(tmpfilename, filename)
 862                         return True
 863                 else:
 864                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 865                         return False
 866
 867         def _do_download(self, filename, info_dict):
 868                 url = info_dict['url']
 869                 player_url = info_dict.get('player_url', None)
 870
 871                 # Check file already present
 872                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 873                         self.report_file_already_downloaded(filename)
 874                         return True
 875
 876                 # Attempt to download using rtmpdump
 877                 if url.startswith('rtmp'):
 878                         return self._download_with_rtmpdump(filename, url, player_url)
 879
 880                 tmpfilename = self.temp_name(filename)
 881                 stream = None
 882
 883                 # Do not include the Accept-Encoding header
 884                 headers = {'Youtubedl-no-compression': 'True'}
 885                 basic_request = urllib2.Request(url, None, headers)
 886                 request = urllib2.Request(url, None, headers)
 887
 888                 # Establish possible resume length
 889                 if os.path.isfile(tmpfilename):
 890                         resume_len = os.path.getsize(tmpfilename)
 891                 else:
 892                         resume_len = 0
 893
 894                 open_mode = 'wb'
 895                 if resume_len != 0:
 896                         if self.params.get('continuedl', False):
 897                                 self.report_resuming_byte(resume_len)
 898                                 request.add_header('Range','bytes=%d-' % resume_len)
 899                                 open_mode = 'ab'
 900                         else:
 901                                 resume_len = 0
 902
 903                 count = 0
 904                 retries = self.params.get('retries', 0)
 905                 while count <= retries:
 906                         # Establish connection
 907                         try:
 908                                 data = urllib2.urlopen(request)
 909                                 break
 910                         except (urllib2.HTTPError, ), err:
 911                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 912                                         # Unexpected HTTP error
 913                                         raise
 914                                 elif err.code == 416:
 915                                         # Unable to resume (requested range not satisfiable)
 916                                         try:
 917                                                 # Open the connection again without the range header
 918                                                 data = urllib2.urlopen(basic_request)
 919                                                 content_length = data.info()['Content-Length']
 920                                         except (urllib2.HTTPError, ), err:
 921                                                 if err.code < 500 or err.code >= 600:
 922                                                         raise
 923                                         else:
 924                                                 # Examine the reported length
 925                                                 if (content_length is not None and
 926                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
 927                                                         # The file had already been fully downloaded.
 928                                                         # Explanation to the above condition: in issue #175 it was revealed that
 929                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 930                                                         # changing the file size slightly and causing problems for some users. So
 931                                                         # I decided to implement a suggested change and consider the file
 932                                                         # completely downloaded if the file size differs less than 100 bytes from
 933                                                         # the one in the hard drive.
 934                                                         self.report_file_already_downloaded(filename)
 935                                                         self.try_rename(tmpfilename, filename)
 936                                                         return True
 937                                                 else:
 938                                                         # The length does not match, we start the download over
 939                                                         self.report_unable_to_resume()
 940                                                         open_mode = 'wb'
 941                                                         break
 942                         # Retry
 943                         count += 1
 944                         if count <= retries:
 945                                 self.report_retry(count, retries)
 946
 947                 if count > retries:
 948                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 949                         return False
 950
 951                 data_len = data.info().get('Content-length', None)
 952                 if data_len is not None:
 953                         data_len = long(data_len) + resume_len
 954                 data_len_str = self.format_bytes(data_len)
 955                 byte_counter = 0 + resume_len
 956                 block_size = 1024
 957                 start = time.time()
 958                 while True:
 959                         # Download and write
 960                         before = time.time()
 961                         data_block = data.read(block_size)
 962                         after = time.time()
 963                         if len(data_block) == 0:
 964                                 break
 965                         byte_counter += len(data_block)
 966
 967                         # Open file just in time
 968                         if stream is None:
 969                                 try:
 970                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 971                                         assert stream is not None
 972                                         filename = self.undo_temp_name(tmpfilename)
 973                                         self.report_destination(filename)
 974                                 except (OSError, IOError), err:
 975                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 976                                         return False
 977                         try:
 978                                 stream.write(data_block)
 979                         except (IOError, OSError), err:
 980                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 981                                 return False
 982                         block_size = self.best_block_size(after - before, len(data_block))
 983
 984                         # Progress message
 985                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 986                         if data_len is None:
 987                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
 988                         else:
 989                                 percent_str = self.calc_percent(byte_counter, data_len)
 990                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 991                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 992
 993                         # Apply rate limit
 994                         self.slow_down(start, byte_counter - resume_len)
 995
 996                 if stream is None:
 997                         self.trouble(u'\nERROR: Did not get any data blocks')
 998                         return False
 999                 stream.close()
1000                 self.report_finish()
1001                 if data_len is not None and byte_counter != data_len:
1002                         raise ContentTooShortError(byte_counter, long(data_len))
1003                 self.try_rename(tmpfilename, filename)
1004
1005                 # Update file modification time
1006                 if self.params.get('updatetime', True):
1007                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1008
1009                 return True
1010
1011
1012 class InfoExtractor(object):
1013         """Information Extractor class.
1014
1015         Information extractors are the classes that, given a URL, extract
1016         information from the video (or videos) the URL refers to. This
1017         information includes the real video URL, the video title and simplified
1018         title, author and others. The information is stored in a dictionary
1019         which is then passed to the FileDownloader. The FileDownloader
1020         processes this information possibly downloading the video to the file
1021         system, among other possible outcomes. The dictionaries must include
1022         the following fields:
1023
1024         id:             Video identifier.
1025         url:            Final video URL.
1026         uploader:       Nickname of the video uploader.
1027         title:          Literal title.
1028         stitle:         Simplified title.
1029         ext:            Video filename extension.
1030         format:         Video format.
1031         player_url:     SWF Player URL (may be None).
1032
1033         The following fields are optional. Their primary purpose is to allow
1034         youtube-dl to serve as the backend for a video search function, such
1035         as the one in youtube2mp3.  They are only used when their respective
1036         forced printing functions are called:
1037
1038         thumbnail:      Full URL to a video thumbnail image.
1039         description:    One-line video description.
1040
1041         Subclasses of this one should re-define the _real_initialize() and
1042         _real_extract() methods and define a _VALID_URL regexp.
1043         Probably, they should also be added to the list of extractors.
1044         """
1045
1046         _ready = False
1047         _downloader = None
1048
1049         def __init__(self, downloader=None):
1050                 """Constructor. Receives an optional downloader."""
1051                 self._ready = False
1052                 self.set_downloader(downloader)
1053
1054         def suitable(self, url):
1055                 """Receives a URL and returns True if suitable for this IE."""
1056                 return re.match(self._VALID_URL, url) is not None
1057
1058         def initialize(self):
1059                 """Initializes an instance (authentication, etc)."""
1060                 if not self._ready:
1061                         self._real_initialize()
1062                         self._ready = True
1063
1064         def extract(self, url):
1065                 """Extracts URL information and returns it in list of dicts."""
1066                 self.initialize()
1067                 return self._real_extract(url)
1068
1069         def set_downloader(self, downloader):
1070                 """Sets the downloader for this IE."""
1071                 self._downloader = downloader
1072
1073         def _real_initialize(self):
1074                 """Real initialization process. Redefine in subclasses."""
1075                 pass
1076
1077         def _real_extract(self, url):
1078                 """Real extraction process. Redefine in subclasses."""
1079                 pass
1080
1081
1082 class YoutubeIE(InfoExtractor):
1083         """Information extractor for youtube.com."""
1084
1085         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1086         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1087         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1088         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1089         _NETRC_MACHINE = 'youtube'
1090         # Listed in order of quality
1091         _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1092         _video_extensions = {
1093                 '13': '3gp',
1094                 '17': 'mp4',
1095                 '18': 'mp4',
1096                 '22': 'mp4',
1097                 '37': 'mp4',
1098                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1099                 '43': 'webm',
1100                 '45': 'webm',
1101         }
1102         IE_NAME = u'youtube'
1103
1104         def report_lang(self):
1105                 """Report attempt to set language."""
1106                 self._downloader.to_screen(u'[youtube] Setting language')
1107
1108         def report_login(self):
1109                 """Report attempt to log in."""
1110                 self._downloader.to_screen(u'[youtube] Logging in')
1111
1112         def report_age_confirmation(self):
1113                 """Report attempt to confirm age."""
1114                 self._downloader.to_screen(u'[youtube] Confirming age')
1115
1116         def report_video_webpage_download(self, video_id):
1117                 """Report attempt to download video webpage."""
1118                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1119
1120         def report_video_info_webpage_download(self, video_id):
1121                 """Report attempt to download video info webpage."""
1122                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1123
1124         def report_information_extraction(self, video_id):
1125                 """Report attempt to extract video information."""
1126                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1127
1128         def report_unavailable_format(self, video_id, format):
1129                 """Report extracted video URL."""
1130                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1131
1132         def report_rtmp_download(self):
1133                 """Indicate the download will use the RTMP protocol."""
1134                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1135
1136         def _real_initialize(self):
1137                 if self._downloader is None:
1138                         return
1139
1140                 username = None
1141                 password = None
1142                 downloader_params = self._downloader.params
1143
1144                 # Attempt to use provided username and password or .netrc data
1145                 if downloader_params.get('username', None) is not None:
1146                         username = downloader_params['username']
1147                         password = downloader_params['password']
1148                 elif downloader_params.get('usenetrc', False):
1149                         try:
1150                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1151                                 if info is not None:
1152                                         username = info[0]
1153                                         password = info[2]
1154                                 else:
1155                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1156                         except (IOError, netrc.NetrcParseError), err:
1157                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1158                                 return
1159
1160                 # Set language
1161                 request = urllib2.Request(self._LANG_URL)
1162                 try:
1163                         self.report_lang()
1164                         urllib2.urlopen(request).read()
1165                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1166                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1167                         return
1168
1169                 # No authentication to be performed
1170                 if username is None:
1171                         return
1172
1173                 # Log in
1174                 login_form = {
1175                                 'current_form': 'loginForm',
1176                                 'next':         '/',
1177                                 'action_login': 'Log In',
1178                                 'username':     username,
1179                                 'password':     password,
1180                                 }
1181                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1182                 try:
1183                         self.report_login()
1184                         login_results = urllib2.urlopen(request).read()
1185                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1186                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1187                                 return
1188                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1189                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1190                         return
1191
1192                 # Confirm age
1193                 age_form = {
1194                                 'next_url':             '/',
1195                                 'action_confirm':       'Confirm',
1196                                 }
1197                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1198                 try:
1199                         self.report_age_confirmation()
1200                         age_results = urllib2.urlopen(request).read()
1201                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1202                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1203                         return
1204
1205         def _real_extract(self, url):
1206                 # Extract video id from URL
1207                 mobj = re.match(self._VALID_URL, url)
1208                 if mobj is None:
1209                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1210                         return
1211                 video_id = mobj.group(2)
1212
1213                 # Get video webpage
1214                 self.report_video_webpage_download(video_id)
1215                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1216                 try:
1217                         video_webpage = urllib2.urlopen(request).read()
1218                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1219                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1220                         return
1221
1222                 # Attempt to extract SWF player URL
1223                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1224                 if mobj is not None:
1225                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1226                 else:
1227                         player_url = None
1228
1229                 # Get video info
1230                 self.report_video_info_webpage_download(video_id)
1231                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1232                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1233                                         % (video_id, el_type))
1234                         request = urllib2.Request(video_info_url)
1235                         try:
1236                                 video_info_webpage = urllib2.urlopen(request).read()
1237                                 video_info = parse_qs(video_info_webpage)
1238                                 if 'token' in video_info:
1239                                         break
1240                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1241                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1242                                 return
1243                 if 'token' not in video_info:
1244                         if 'reason' in video_info:
1245                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1246                         else:
1247                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1248                         return
1249
1250                 # Start extracting information
1251                 self.report_information_extraction(video_id)
1252
1253                 # uploader
1254                 if 'author' not in video_info:
1255                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1256                         return
1257                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1258
1259                 # title
1260                 if 'title' not in video_info:
1261                         self._downloader.trouble(u'ERROR: unable to extract video title')
1262                         return
1263                 video_title = urllib.unquote_plus(video_info['title'][0])
1264                 video_title = video_title.decode('utf-8')
1265                 video_title = sanitize_title(video_title)
1266
1267                 # simplified title
1268                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1269                 simple_title = simple_title.strip(ur'_')
1270
1271                 # thumbnail image
1272                 if 'thumbnail_url' not in video_info:
1273                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1274                         video_thumbnail = ''
1275                 else:   # don't panic if we can't find it
1276                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1277
1278                 # upload date
1279                 upload_date = u'NA'
1280                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1281                 if mobj is not None:
1282                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1283                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1284                         for expression in format_expressions:
1285                                 try:
1286                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1287                                 except:
1288                                         pass
1289
1290                 # description
1291                 try:
1292                         lxml.etree
1293                 except NameError:
1294                         video_description = u'No description available.'
1295                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1296                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1297                                 if mobj is not None:
1298                                         video_description = mobj.group(1).decode('utf-8')
1299                 else:
1300                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1301                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1302                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1303                         # TODO use another parser
1304
1305                 # token
1306                 video_token = urllib.unquote_plus(video_info['token'][0])
1307
1308                 # Decide which formats to download
1309                 req_format = self._downloader.params.get('format', None)
1310
1311                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1312                         self.report_rtmp_download()
1313                         video_url_list = [(None, video_info['conn'][0])]
1314                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1315                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1316                         url_data = [parse_qs(uds) for uds in url_data_strs]
1317                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1318                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1319
1320                         format_limit = self._downloader.params.get('format_limit', None)
1321                         if format_limit is not None and format_limit in self._available_formats:
1322                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1323                         else:
1324                                 format_list = self._available_formats
1325                         existing_formats = [x for x in format_list if x in url_map]
1326                         if len(existing_formats) == 0:
1327                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1328                                 return
1329                         if req_format is None or req_format == 'best':
1330                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1331                         elif req_format == 'worst':
1332                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1333                         elif req_format in ('-1', 'all'):
1334                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1335                         else:
1336                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1337                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1338                                 req_formats = req_format.split('/')
1339                                 video_url_list = None
1340                                 for rf in req_formats:
1341                                         if rf in url_map:
1342                                                 video_url_list = [(rf, url_map[rf])]
1343                                                 break
1344                                 if video_url_list is None:
1345                                         self._downloader.trouble(u'ERROR: requested format not available')
1346                                         return
1347                 else:
1348                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1349                         return
1350
1351                 for format_param, video_real_url in video_url_list:
1352                         # At this point we have a new video
1353                         self._downloader.increment_downloads()
1354
1355                         # Extension
1356                         video_extension = self._video_extensions.get(format_param, 'flv')
1357
1358                         try:
1359                                 # Process video information
1360                                 self._downloader.process_info({
1361                                         'id':           video_id.decode('utf-8'),
1362                                         'url':          video_real_url.decode('utf-8'),
1363                                         'uploader':     video_uploader.decode('utf-8'),
1364                                         'upload_date':  upload_date,
1365                                         'title':        video_title,
1366                                         'stitle':       simple_title,
1367                                         'ext':          video_extension.decode('utf-8'),
1368                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1369                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1370                                         'description':  video_description,
1371                                         'player_url':   player_url,
1372                                 })
1373                         except UnavailableVideoError, err:
1374                                 self._downloader.trouble(u'\nERROR: unable to download video')
1375
1376
1377 class MetacafeIE(InfoExtractor):
1378         """Information Extractor for metacafe.com."""
1379
1380         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1381         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1382         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1383         _youtube_ie = None
1384         IE_NAME = u'metacafe'
1385
1386         def __init__(self, youtube_ie, downloader=None):
1387                 InfoExtractor.__init__(self, downloader)
1388                 self._youtube_ie = youtube_ie
1389
1390         def report_disclaimer(self):
1391                 """Report disclaimer retrieval."""
1392                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1393
1394         def report_age_confirmation(self):
1395                 """Report attempt to confirm age."""
1396                 self._downloader.to_screen(u'[metacafe] Confirming age')
1397
1398         def report_download_webpage(self, video_id):
1399                 """Report webpage download."""
1400                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1401
1402         def report_extraction(self, video_id):
1403                 """Report information extraction."""
1404                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1405
1406         def _real_initialize(self):
1407                 # Retrieve disclaimer
1408                 request = urllib2.Request(self._DISCLAIMER)
1409                 try:
1410                         self.report_disclaimer()
1411                         disclaimer = urllib2.urlopen(request).read()
1412                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1413                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1414                         return
1415
1416                 # Confirm age
1417                 disclaimer_form = {
1418                         'filters': '0',
1419                         'submit': "Continue - I'm over 18",
1420                         }
1421                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1422                 try:
1423                         self.report_age_confirmation()
1424                         disclaimer = urllib2.urlopen(request).read()
1425                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1426                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1427                         return
1428
1429         def _real_extract(self, url):
1430                 # Extract id and simplified title from URL
1431                 mobj = re.match(self._VALID_URL, url)
1432                 if mobj is None:
1433                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1434                         return
1435
1436                 video_id = mobj.group(1)
1437
1438                 # Check if video comes from YouTube
1439                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1440                 if mobj2 is not None:
1441                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1442                         return
1443
1444                 # At this point we have a new video
1445                 self._downloader.increment_downloads()
1446
1447                 simple_title = mobj.group(2).decode('utf-8')
1448
1449                 # Retrieve video webpage to extract further information
1450                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1451                 try:
1452                         self.report_download_webpage(video_id)
1453                         webpage = urllib2.urlopen(request).read()
1454                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1455                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1456                         return
1457
1458                 # Extract URL, uploader and title from webpage
1459                 self.report_extraction(video_id)
1460                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1461                 if mobj is not None:
1462                         mediaURL = urllib.unquote(mobj.group(1))
1463                         video_extension = mediaURL[-3:]
1464
1465                         # Extract gdaKey if available
1466                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1467                         if mobj is None:
1468                                 video_url = mediaURL
1469                         else:
1470                                 gdaKey = mobj.group(1)
1471                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1472                 else:
1473                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1474                         if mobj is None:
1475                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1476                                 return
1477                         vardict = parse_qs(mobj.group(1))
1478                         if 'mediaData' not in vardict:
1479                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1480                                 return
1481                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1482                         if mobj is None:
1483                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1484                                 return
1485                         mediaURL = mobj.group(1).replace('\\/', '/')
1486                         video_extension = mediaURL[-3:]
1487                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1488
1489                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1490                 if mobj is None:
1491                         self._downloader.trouble(u'ERROR: unable to extract title')
1492                         return
1493                 video_title = mobj.group(1).decode('utf-8')
1494                 video_title = sanitize_title(video_title)
1495
1496                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1497                 if mobj is None:
1498                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1499                         return
1500                 video_uploader = mobj.group(1)
1501
1502                 try:
1503                         # Process video information
1504                         self._downloader.process_info({
1505                                 'id':           video_id.decode('utf-8'),
1506                                 'url':          video_url.decode('utf-8'),
1507                                 'uploader':     video_uploader.decode('utf-8'),
1508                                 'upload_date':  u'NA',
1509                                 'title':        video_title,
1510                                 'stitle':       simple_title,
1511                                 'ext':          video_extension.decode('utf-8'),
1512                                 'format':       u'NA',
1513                                 'player_url':   None,
1514                         })
1515                 except UnavailableVideoError:
1516                         self._downloader.trouble(u'\nERROR: unable to download video')
1517
1518
1519 class DailymotionIE(InfoExtractor):
1520         """Information Extractor for Dailymotion"""
1521
1522         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1523         IE_NAME = u'dailymotion'
1524
1525         def __init__(self, downloader=None):
1526                 InfoExtractor.__init__(self, downloader)
1527
1528         def report_download_webpage(self, video_id):
1529                 """Report webpage download."""
1530                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1531
1532         def report_extraction(self, video_id):
1533                 """Report information extraction."""
1534                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1535
1536         def _real_initialize(self):
1537                 return
1538
1539         def _real_extract(self, url):
1540                 # Extract id and simplified title from URL
1541                 mobj = re.match(self._VALID_URL, url)
1542                 if mobj is None:
1543                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1544                         return
1545
1546                 # At this point we have a new video
1547                 self._downloader.increment_downloads()
1548                 video_id = mobj.group(1)
1549
1550                 simple_title = mobj.group(2).decode('utf-8')
1551                 video_extension = 'flv'
1552
1553                 # Retrieve video webpage to extract further information
1554                 request = urllib2.Request(url)
1555                 request.add_header('Cookie', 'family_filter=off')
1556                 try:
1557                         self.report_download_webpage(video_id)
1558                         webpage = urllib2.urlopen(request).read()
1559                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1560                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1561                         return
1562
1563                 # Extract URL, uploader and title from webpage
1564                 self.report_extraction(video_id)
1565                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1566                 if mobj is None:
1567                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1568                         return
1569                 sequence = urllib.unquote(mobj.group(1))
1570                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1571                 if mobj is None:
1572                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1573                         return
1574                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1575
1576                 # if needed add http://www.dailymotion.com/ if relative URL
1577
1578                 video_url = mediaURL
1579
1580                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1581                 if mobj is None:
1582                         self._downloader.trouble(u'ERROR: unable to extract title')
1583                         return
1584                 video_title = mobj.group(1).decode('utf-8')
1585                 video_title = sanitize_title(video_title)
1586
1587                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1588                 if mobj is None:
1589                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1590                         return
1591                 video_uploader = mobj.group(1)
1592
1593                 try:
1594                         # Process video information
1595                         self._downloader.process_info({
1596                                 'id':           video_id.decode('utf-8'),
1597                                 'url':          video_url.decode('utf-8'),
1598                                 'uploader':     video_uploader.decode('utf-8'),
1599                                 'upload_date':  u'NA',
1600                                 'title':        video_title,
1601                                 'stitle':       simple_title,
1602                                 'ext':          video_extension.decode('utf-8'),
1603                                 'format':       u'NA',
1604                                 'player_url':   None,
1605                         })
1606                 except UnavailableVideoError:
1607                         self._downloader.trouble(u'\nERROR: unable to download video')
1608
1609
1610 class GoogleIE(InfoExtractor):
1611         """Information extractor for video.google.com."""
1612
1613         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1614         IE_NAME = u'video.google'
1615
1616         def __init__(self, downloader=None):
1617                 InfoExtractor.__init__(self, downloader)
1618
1619         def report_download_webpage(self, video_id):
1620                 """Report webpage download."""
1621                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1622
1623         def report_extraction(self, video_id):
1624                 """Report information extraction."""
1625                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1626
1627         def _real_initialize(self):
1628                 return
1629
1630         def _real_extract(self, url):
1631                 # Extract id from URL
1632                 mobj = re.match(self._VALID_URL, url)
1633                 if mobj is None:
1634                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1635                         return
1636
1637                 # At this point we have a new video
1638                 self._downloader.increment_downloads()
1639                 video_id = mobj.group(1)
1640
1641                 video_extension = 'mp4'
1642
1643                 # Retrieve video webpage to extract further information
1644                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1645                 try:
1646                         self.report_download_webpage(video_id)
1647                         webpage = urllib2.urlopen(request).read()
1648                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1649                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1650                         return
1651
1652                 # Extract URL, uploader, and title from webpage
1653                 self.report_extraction(video_id)
1654                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1655                 if mobj is None:
1656                         video_extension = 'flv'
1657                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1658                 if mobj is None:
1659                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1660                         return
1661                 mediaURL = urllib.unquote(mobj.group(1))
1662                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1663                 mediaURL = mediaURL.replace('\\x26', '\x26')
1664
1665                 video_url = mediaURL
1666
1667                 mobj = re.search(r'<title>(.*)</title>', webpage)
1668                 if mobj is None:
1669                         self._downloader.trouble(u'ERROR: unable to extract title')
1670                         return
1671                 video_title = mobj.group(1).decode('utf-8')
1672                 video_title = sanitize_title(video_title)
1673                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1674
1675                 # Extract video description
1676                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1677                 if mobj is None:
1678                         self._downloader.trouble(u'ERROR: unable to extract video description')
1679                         return
1680                 video_description = mobj.group(1).decode('utf-8')
1681                 if not video_description:
1682                         video_description = 'No description available.'
1683
1684                 # Extract video thumbnail
1685                 if self._downloader.params.get('forcethumbnail', False):
1686                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1687                         try:
1688                                 webpage = urllib2.urlopen(request).read()
1689                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1690                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1691                                 return
1692                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1693                         if mobj is None:
1694                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1695                                 return
1696                         video_thumbnail = mobj.group(1)
1697                 else:   # we need something to pass to process_info
1698                         video_thumbnail = ''
1699
1700                 try:
1701                         # Process video information
1702                         self._downloader.process_info({
1703                                 'id':           video_id.decode('utf-8'),
1704                                 'url':          video_url.decode('utf-8'),
1705                                 'uploader':     u'NA',
1706                                 'upload_date':  u'NA',
1707                                 'title':        video_title,
1708                                 'stitle':       simple_title,
1709                                 'ext':          video_extension.decode('utf-8'),
1710                                 'format':       u'NA',
1711                                 'player_url':   None,
1712                         })
1713                 except UnavailableVideoError:
1714                         self._downloader.trouble(u'\nERROR: unable to download video')
1715
1716
1717 class PhotobucketIE(InfoExtractor):
1718         """Information extractor for photobucket.com."""
1719
1720         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1721         IE_NAME = u'photobucket'
1722
1723         def __init__(self, downloader=None):
1724                 InfoExtractor.__init__(self, downloader)
1725
1726         def report_download_webpage(self, video_id):
1727                 """Report webpage download."""
1728                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1729
1730         def report_extraction(self, video_id):
1731                 """Report information extraction."""
1732                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1733
1734         def _real_initialize(self):
1735                 return
1736
1737         def _real_extract(self, url):
1738                 # Extract id from URL
1739                 mobj = re.match(self._VALID_URL, url)
1740                 if mobj is None:
1741                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1742                         return
1743
1744                 # At this point we have a new video
1745                 self._downloader.increment_downloads()
1746                 video_id = mobj.group(1)
1747
1748                 video_extension = 'flv'
1749
1750                 # Retrieve video webpage to extract further information
1751                 request = urllib2.Request(url)
1752                 try:
1753                         self.report_download_webpage(video_id)
1754                         webpage = urllib2.urlopen(request).read()
1755                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1756                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1757                         return
1758
1759                 # Extract URL, uploader, and title from webpage
1760                 self.report_extraction(video_id)
1761                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1762                 if mobj is None:
1763                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1764                         return
1765                 mediaURL = urllib.unquote(mobj.group(1))
1766
1767                 video_url = mediaURL
1768
1769                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1770                 if mobj is None:
1771                         self._downloader.trouble(u'ERROR: unable to extract title')
1772                         return
1773                 video_title = mobj.group(1).decode('utf-8')
1774                 video_title = sanitize_title(video_title)
1775                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1776
1777                 video_uploader = mobj.group(2).decode('utf-8')
1778
1779                 try:
1780                         # Process video information
1781                         self._downloader.process_info({
1782                                 'id':           video_id.decode('utf-8'),
1783                                 'url':          video_url.decode('utf-8'),
1784                                 'uploader':     video_uploader,
1785                                 'upload_date':  u'NA',
1786                                 'title':        video_title,
1787                                 'stitle':       simple_title,
1788                                 'ext':          video_extension.decode('utf-8'),
1789                                 'format':       u'NA',
1790                                 'player_url':   None,
1791                         })
1792                 except UnavailableVideoError:
1793                         self._downloader.trouble(u'\nERROR: unable to download video')
1794
1795
1796 class YahooIE(InfoExtractor):
1797         """Information extractor for video.yahoo.com."""
1798
1799         # _VALID_URL matches all Yahoo! Video URLs
1800         # _VPAGE_URL matches only the extractable '/watch/' URLs
1801         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1802         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1803         IE_NAME = u'video.yahoo'
1804
1805         def __init__(self, downloader=None):
1806                 InfoExtractor.__init__(self, downloader)
1807
1808         def report_download_webpage(self, video_id):
1809                 """Report webpage download."""
1810                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1811
1812         def report_extraction(self, video_id):
1813                 """Report information extraction."""
1814                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1815
1816         def _real_initialize(self):
1817                 return
1818
1819         def _real_extract(self, url, new_video=True):
1820                 # Extract ID from URL
1821                 mobj = re.match(self._VALID_URL, url)
1822                 if mobj is None:
1823                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1824                         return
1825
1826                 # At this point we have a new video
1827                 self._downloader.increment_downloads()
1828                 video_id = mobj.group(2)
1829                 video_extension = 'flv'
1830
1831                 # Rewrite valid but non-extractable URLs as
1832                 # extractable English language /watch/ URLs
1833                 if re.match(self._VPAGE_URL, url) is None:
1834                         request = urllib2.Request(url)
1835                         try:
1836                                 webpage = urllib2.urlopen(request).read()
1837                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1838                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1839                                 return
1840
1841                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1842                         if mobj is None:
1843                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1844                                 return
1845                         yahoo_id = mobj.group(1)
1846
1847                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1848                         if mobj is None:
1849                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1850                                 return
1851                         yahoo_vid = mobj.group(1)
1852
1853                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1854                         return self._real_extract(url, new_video=False)
1855
1856                 # Retrieve video webpage to extract further information
1857                 request = urllib2.Request(url)
1858                 try:
1859                         self.report_download_webpage(video_id)
1860                         webpage = urllib2.urlopen(request).read()
1861                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1862                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1863                         return
1864
1865                 # Extract uploader and title from webpage
1866                 self.report_extraction(video_id)
1867                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1868                 if mobj is None:
1869                         self._downloader.trouble(u'ERROR: unable to extract video title')
1870                         return
1871                 video_title = mobj.group(1).decode('utf-8')
1872                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1873
1874                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1875                 if mobj is None:
1876                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1877                         return
1878                 video_uploader = mobj.group(1).decode('utf-8')
1879
1880                 # Extract video thumbnail
1881                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1882                 if mobj is None:
1883                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1884                         return
1885                 video_thumbnail = mobj.group(1).decode('utf-8')
1886
1887                 # Extract video description
1888                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1889                 if mobj is None:
1890                         self._downloader.trouble(u'ERROR: unable to extract video description')
1891                         return
1892                 video_description = mobj.group(1).decode('utf-8')
1893                 if not video_description:
1894                         video_description = 'No description available.'
1895
1896                 # Extract video height and width
1897                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1898                 if mobj is None:
1899                         self._downloader.trouble(u'ERROR: unable to extract video height')
1900                         return
1901                 yv_video_height = mobj.group(1)
1902
1903                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1904                 if mobj is None:
1905                         self._downloader.trouble(u'ERROR: unable to extract video width')
1906                         return
1907                 yv_video_width = mobj.group(1)
1908
1909                 # Retrieve video playlist to extract media URL
1910                 # I'm not completely sure what all these options are, but we
1911                 # seem to need most of them, otherwise the server sends a 401.
1912                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1913                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1914                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1915                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1916                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1917                 try:
1918                         self.report_download_webpage(video_id)
1919                         webpage = urllib2.urlopen(request).read()
1920                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1921                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1922                         return
1923
1924                 # Extract media URL from playlist XML
1925                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1926                 if mobj is None:
1927                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1928                         return
1929                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1930                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1931
1932                 try:
1933                         # Process video information
1934                         self._downloader.process_info({
1935                                 'id':           video_id.decode('utf-8'),
1936                                 'url':          video_url,
1937                                 'uploader':     video_uploader,
1938                                 'upload_date':  u'NA',
1939                                 'title':        video_title,
1940                                 'stitle':       simple_title,
1941                                 'ext':          video_extension.decode('utf-8'),
1942                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1943                                 'description':  video_description,
1944                                 'thumbnail':    video_thumbnail,
1945                                 'player_url':   None,
1946                         })
1947                 except UnavailableVideoError:
1948                         self._downloader.trouble(u'\nERROR: unable to download video')
1949
1950
1951 class VimeoIE(InfoExtractor):
1952         """Information extractor for vimeo.com."""
1953
1954         # _VALID_URL matches Vimeo URLs
1955         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1956         IE_NAME = u'vimeo'
1957
1958         def __init__(self, downloader=None):
1959                 InfoExtractor.__init__(self, downloader)
1960
1961         def report_download_webpage(self, video_id):
1962                 """Report webpage download."""
1963                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1964
1965         def report_extraction(self, video_id):
1966                 """Report information extraction."""
1967                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1968
1969         def _real_initialize(self):
1970                 return
1971
1972         def _real_extract(self, url, new_video=True):
1973                 # Extract ID from URL
1974                 mobj = re.match(self._VALID_URL, url)
1975                 if mobj is None:
1976                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1977                         return
1978
1979                 # At this point we have a new video
1980                 self._downloader.increment_downloads()
1981                 video_id = mobj.group(1)
1982
1983                 # Retrieve video webpage to extract further information
1984                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1985                 try:
1986                         self.report_download_webpage(video_id)
1987                         webpage = urllib2.urlopen(request).read()
1988                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1989                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1990                         return
1991
1992                 # Now we begin extracting as much information as we can from what we
1993                 # retrieved. First we extract the information common to all extractors,
1994                 # and latter we extract those that are Vimeo specific.
1995                 self.report_extraction(video_id)
1996
1997                 # Extract title
1998                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1999                 if mobj is None:
2000                         self._downloader.trouble(u'ERROR: unable to extract video title')
2001                         return
2002                 video_title = mobj.group(1).decode('utf-8')
2003                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2004
2005                 # Extract uploader
2006                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2007                 if mobj is None:
2008                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2009                         return
2010                 video_uploader = mobj.group(1).decode('utf-8')
2011
2012                 # Extract video thumbnail
2013                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2014                 if mobj is None:
2015                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2016                         return
2017                 video_thumbnail = mobj.group(1).decode('utf-8')
2018
2019                 # # Extract video description
2020                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2021                 # if mobj is None:
2022                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2023                 #       return
2024                 # video_description = mobj.group(1).decode('utf-8')
2025                 # if not video_description: video_description = 'No description available.'
2026                 video_description = 'Foo.'
2027
2028                 # Vimeo specific: extract request signature
2029                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2030                 if mobj is None:
2031                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2032                         return
2033                 sig = mobj.group(1).decode('utf-8')
2034
2035                 # Vimeo specific: Extract request signature expiration
2036                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2037                 if mobj is None:
2038                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2039                         return
2040                 sig_exp = mobj.group(1).decode('utf-8')
2041
2042                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2043
2044                 try:
2045                         # Process video information
2046                         self._downloader.process_info({
2047                                 'id':           video_id.decode('utf-8'),
2048                                 'url':          video_url,
2049                                 'uploader':     video_uploader,
2050                                 'upload_date':  u'NA',
2051                                 'title':        video_title,
2052                                 'stitle':       simple_title,
2053                                 'ext':          u'mp4',
2054                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2055                                 'description':  video_description,
2056                                 'thumbnail':    video_thumbnail,
2057                                 'description':  video_description,
2058                                 'player_url':   None,
2059                         })
2060                 except UnavailableVideoError:
2061                         self._downloader.trouble(u'ERROR: unable to download video')
2062
2063
2064 class GenericIE(InfoExtractor):
2065         """Generic last-resort information extractor."""
2066
2067         _VALID_URL = r'.*'
2068         IE_NAME = u'generic'
2069
2070         def __init__(self, downloader=None):
2071                 InfoExtractor.__init__(self, downloader)
2072
2073         def report_download_webpage(self, video_id):
2074                 """Report webpage download."""
2075                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2076                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2077
2078         def report_extraction(self, video_id):
2079                 """Report information extraction."""
2080                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2081
2082         def _real_initialize(self):
2083                 return
2084
2085         def _real_extract(self, url):
2086                 # At this point we have a new video
2087                 self._downloader.increment_downloads()
2088
2089                 video_id = url.split('/')[-1]
2090                 request = urllib2.Request(url)
2091                 try:
2092                         self.report_download_webpage(video_id)
2093                         webpage = urllib2.urlopen(request).read()
2094                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2095                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2096                         return
2097                 except ValueError, err:
2098                         # since this is the last-resort InfoExtractor, if
2099                         # this error is thrown, it'll be thrown here
2100                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2101                         return
2102
2103                 self.report_extraction(video_id)
2104                 # Start with something easy: JW Player in SWFObject
2105                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2106                 if mobj is None:
2107                         # Broaden the search a little bit
2108                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2109                 if mobj is None:
2110                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2111                         return
2112
2113                 # It's possible that one of the regexes
2114                 # matched, but returned an empty group:
2115                 if mobj.group(1) is None:
2116                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2117                         return
2118
2119                 video_url = urllib.unquote(mobj.group(1))
2120                 video_id = os.path.basename(video_url)
2121
2122                 # here's a fun little line of code for you:
2123                 video_extension = os.path.splitext(video_id)[1][1:]
2124                 video_id = os.path.splitext(video_id)[0]
2125
2126                 # it's tempting to parse this further, but you would
2127                 # have to take into account all the variations like
2128                 #   Video Title - Site Name
2129                 #   Site Name | Video Title
2130                 #   Video Title - Tagline | Site Name
2131                 # and so on and so forth; it's just not practical
2132                 mobj = re.search(r'<title>(.*)</title>', webpage)
2133                 if mobj is None:
2134                         self._downloader.trouble(u'ERROR: unable to extract title')
2135                         return
2136                 video_title = mobj.group(1).decode('utf-8')
2137                 video_title = sanitize_title(video_title)
2138                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2139
2140                 # video uploader is domain name
2141                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2142                 if mobj is None:
2143                         self._downloader.trouble(u'ERROR: unable to extract title')
2144                         return
2145                 video_uploader = mobj.group(1).decode('utf-8')
2146
2147                 try:
2148                         # Process video information
2149                         self._downloader.process_info({
2150                                 'id':           video_id.decode('utf-8'),
2151                                 'url':          video_url.decode('utf-8'),
2152                                 'uploader':     video_uploader,
2153                                 'upload_date':  u'NA',
2154                                 'title':        video_title,
2155                                 'stitle':       simple_title,
2156                                 'ext':          video_extension.decode('utf-8'),
2157                                 'format':       u'NA',
2158                                 'player_url':   None,
2159                         })
2160                 except UnavailableVideoError, err:
2161                         self._downloader.trouble(u'\nERROR: unable to download video')
2162
2163
2164 class YoutubeSearchIE(InfoExtractor):
2165         """Information Extractor for YouTube search queries."""
2166         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2167         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2168         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2169         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2170         _youtube_ie = None
2171         _max_youtube_results = 1000
2172         IE_NAME = u'youtube:search'
2173
2174         def __init__(self, youtube_ie, downloader=None):
2175                 InfoExtractor.__init__(self, downloader)
2176                 self._youtube_ie = youtube_ie
2177
2178         def report_download_page(self, query, pagenum):
2179                 """Report attempt to download playlist page with given number."""
2180                 query = query.decode(preferredencoding())
2181                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2182
2183         def _real_initialize(self):
2184                 self._youtube_ie.initialize()
2185
2186         def _real_extract(self, query):
2187                 mobj = re.match(self._VALID_URL, query)
2188                 if mobj is None:
2189                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2190                         return
2191
2192                 prefix, query = query.split(':')
2193                 prefix = prefix[8:]
2194                 query = query.encode('utf-8')
2195                 if prefix == '':
2196                         self._download_n_results(query, 1)
2197                         return
2198                 elif prefix == 'all':
2199                         self._download_n_results(query, self._max_youtube_results)
2200                         return
2201                 else:
2202                         try:
2203                                 n = long(prefix)
2204                                 if n <= 0:
2205                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2206                                         return
2207                                 elif n > self._max_youtube_results:
2208                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2209                                         n = self._max_youtube_results
2210                                 self._download_n_results(query, n)
2211                                 return
2212                         except ValueError: # parsing prefix as integer fails
2213                                 self._download_n_results(query, 1)
2214                                 return
2215
2216         def _download_n_results(self, query, n):
2217                 """Downloads a specified number of results for a query"""
2218
2219                 video_ids = []
2220                 already_seen = set()
2221                 pagenum = 1
2222
2223                 while True:
2224                         self.report_download_page(query, pagenum)
2225                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2226                         request = urllib2.Request(result_url)
2227                         try:
2228                                 page = urllib2.urlopen(request).read()
2229                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2230                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2231                                 return
2232
2233                         # Extract video identifiers
2234                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2235                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2236                                 if video_id not in already_seen:
2237                                         video_ids.append(video_id)
2238                                         already_seen.add(video_id)
2239                                         if len(video_ids) == n:
2240                                                 # Specified n videos reached
2241                                                 for id in video_ids:
2242                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2243                                                 return
2244
2245                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2246                                 for id in video_ids:
2247                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2248                                 return
2249
2250                         pagenum = pagenum + 1
2251
2252
2253 class GoogleSearchIE(InfoExtractor):
2254         """Information Extractor for Google Video search queries."""
2255         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2256         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2257         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2258         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2259         _google_ie = None
2260         _max_google_results = 1000
2261         IE_NAME = u'video.google:search'
2262
2263         def __init__(self, google_ie, downloader=None):
2264                 InfoExtractor.__init__(self, downloader)
2265                 self._google_ie = google_ie
2266
2267         def report_download_page(self, query, pagenum):
2268                 """Report attempt to download playlist page with given number."""
2269                 query = query.decode(preferredencoding())
2270                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2271
2272         def _real_initialize(self):
2273                 self._google_ie.initialize()
2274
2275         def _real_extract(self, query):
2276                 mobj = re.match(self._VALID_URL, query)
2277                 if mobj is None:
2278                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2279                         return
2280
2281                 prefix, query = query.split(':')
2282                 prefix = prefix[8:]
2283                 query = query.encode('utf-8')
2284                 if prefix == '':
2285                         self._download_n_results(query, 1)
2286                         return
2287                 elif prefix == 'all':
2288                         self._download_n_results(query, self._max_google_results)
2289                         return
2290                 else:
2291                         try:
2292                                 n = long(prefix)
2293                                 if n <= 0:
2294                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2295                                         return
2296                                 elif n > self._max_google_results:
2297                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2298                                         n = self._max_google_results
2299                                 self._download_n_results(query, n)
2300                                 return
2301                         except ValueError: # parsing prefix as integer fails
2302                                 self._download_n_results(query, 1)
2303                                 return
2304
2305         def _download_n_results(self, query, n):
2306                 """Downloads a specified number of results for a query"""
2307
2308                 video_ids = []
2309                 already_seen = set()
2310                 pagenum = 1
2311
2312                 while True:
2313                         self.report_download_page(query, pagenum)
2314                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2315                         request = urllib2.Request(result_url)
2316                         try:
2317                                 page = urllib2.urlopen(request).read()
2318                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2319                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2320                                 return
2321
2322                         # Extract video identifiers
2323                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2324                                 video_id = mobj.group(1)
2325                                 if video_id not in already_seen:
2326                                         video_ids.append(video_id)
2327                                         already_seen.add(video_id)
2328                                         if len(video_ids) == n:
2329                                                 # Specified n videos reached
2330                                                 for id in video_ids:
2331                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2332                                                 return
2333
2334                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2335                                 for id in video_ids:
2336                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2337                                 return
2338
2339                         pagenum = pagenum + 1
2340
2341
2342 class YahooSearchIE(InfoExtractor):
2343         """Information Extractor for Yahoo! Video search queries."""
2344         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2345         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2346         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2347         _MORE_PAGES_INDICATOR = r'\s*Next'
2348         _yahoo_ie = None
2349         _max_yahoo_results = 1000
2350         IE_NAME = u'video.yahoo:search'
2351
2352         def __init__(self, yahoo_ie, downloader=None):
2353                 InfoExtractor.__init__(self, downloader)
2354                 self._yahoo_ie = yahoo_ie
2355
2356         def report_download_page(self, query, pagenum):
2357                 """Report attempt to download playlist page with given number."""
2358                 query = query.decode(preferredencoding())
2359                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2360
2361         def _real_initialize(self):
2362                 self._yahoo_ie.initialize()
2363
2364         def _real_extract(self, query):
2365                 mobj = re.match(self._VALID_URL, query)
2366                 if mobj is None:
2367                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2368                         return
2369
2370                 prefix, query = query.split(':')
2371                 prefix = prefix[8:]
2372                 query = query.encode('utf-8')
2373                 if prefix == '':
2374                         self._download_n_results(query, 1)
2375                         return
2376                 elif prefix == 'all':
2377                         self._download_n_results(query, self._max_yahoo_results)
2378                         return
2379                 else:
2380                         try:
2381                                 n = long(prefix)
2382                                 if n <= 0:
2383                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2384                                         return
2385                                 elif n > self._max_yahoo_results:
2386                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2387                                         n = self._max_yahoo_results
2388                                 self._download_n_results(query, n)
2389                                 return
2390                         except ValueError: # parsing prefix as integer fails
2391                                 self._download_n_results(query, 1)
2392                                 return
2393
2394         def _download_n_results(self, query, n):
2395                 """Downloads a specified number of results for a query"""
2396
2397                 video_ids = []
2398                 already_seen = set()
2399                 pagenum = 1
2400
2401                 while True:
2402                         self.report_download_page(query, pagenum)
2403                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2404                         request = urllib2.Request(result_url)
2405                         try:
2406                                 page = urllib2.urlopen(request).read()
2407                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2408                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2409                                 return
2410
2411                         # Extract video identifiers
2412                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2413                                 video_id = mobj.group(1)
2414                                 if video_id not in already_seen:
2415                                         video_ids.append(video_id)
2416                                         already_seen.add(video_id)
2417                                         if len(video_ids) == n:
2418                                                 # Specified n videos reached
2419                                                 for id in video_ids:
2420                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2421                                                 return
2422
2423                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2424                                 for id in video_ids:
2425                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2426                                 return
2427
2428                         pagenum = pagenum + 1
2429
2430
2431 class YoutubePlaylistIE(InfoExtractor):
2432         """Information Extractor for YouTube playlists."""
2433
2434         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2435         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2436         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2437         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2438         _youtube_ie = None
2439         IE_NAME = u'youtube:playlist'
2440
2441         def __init__(self, youtube_ie, downloader=None):
2442                 InfoExtractor.__init__(self, downloader)
2443                 self._youtube_ie = youtube_ie
2444
2445         def report_download_page(self, playlist_id, pagenum):
2446                 """Report attempt to download playlist page with given number."""
2447                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2448
2449         def _real_initialize(self):
2450                 self._youtube_ie.initialize()
2451
2452         def _real_extract(self, url):
2453                 # Extract playlist id
2454                 mobj = re.match(self._VALID_URL, url)
2455                 if mobj is None:
2456                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2457                         return
2458
2459                 # Single video case
2460                 if mobj.group(3) is not None:
2461                         self._youtube_ie.extract(mobj.group(3))
2462                         return
2463
2464                 # Download playlist pages
2465                 # prefix is 'p' as default for playlists but there are other types that need extra care
2466                 playlist_prefix = mobj.group(1)
2467                 if playlist_prefix == 'a':
2468                         playlist_access = 'artist'
2469                 else:
2470                         playlist_prefix = 'p'
2471                         playlist_access = 'view_play_list'
2472                 playlist_id = mobj.group(2)
2473                 video_ids = []
2474                 pagenum = 1
2475
2476                 while True:
2477                         self.report_download_page(playlist_id, pagenum)
2478                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2479                         try:
2480                                 page = urllib2.urlopen(request).read()
2481                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2482                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2483                                 return
2484
2485                         # Extract video identifiers
2486                         ids_in_page = []
2487                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2488                                 if mobj.group(1) not in ids_in_page:
2489                                         ids_in_page.append(mobj.group(1))
2490                         video_ids.extend(ids_in_page)
2491
2492                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2493                                 break
2494                         pagenum = pagenum + 1
2495
2496                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2497                 playlistend = self._downloader.params.get('playlistend', -1)
2498                 video_ids = video_ids[playliststart:playlistend]
2499
2500                 for id in video_ids:
2501                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2502                 return
2503
2504
2505 class YoutubeUserIE(InfoExtractor):
2506         """Information Extractor for YouTube users."""
2507
2508         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2509         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2510         _GDATA_PAGE_SIZE = 50
2511         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2512         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2513         _youtube_ie = None
2514         IE_NAME = u'youtube:user'
2515
2516         def __init__(self, youtube_ie, downloader=None):
2517                 InfoExtractor.__init__(self, downloader)
2518                 self._youtube_ie = youtube_ie
2519
2520         def report_download_page(self, username, start_index):
2521                 """Report attempt to download user page."""
2522                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2523                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2524
2525         def _real_initialize(self):
2526                 self._youtube_ie.initialize()
2527
2528         def _real_extract(self, url):
2529                 # Extract username
2530                 mobj = re.match(self._VALID_URL, url)
2531                 if mobj is None:
2532                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2533                         return
2534
2535                 username = mobj.group(1)
2536
2537                 # Download video ids using YouTube Data API. Result size per
2538                 # query is limited (currently to 50 videos) so we need to query
2539                 # page by page until there are no video ids - it means we got
2540                 # all of them.
2541
2542                 video_ids = []
2543                 pagenum = 0
2544
2545                 while True:
2546                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2547                         self.report_download_page(username, start_index)
2548
2549                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2550
2551                         try:
2552                                 page = urllib2.urlopen(request).read()
2553                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2554                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2555                                 return
2556
2557                         # Extract video identifiers
2558                         ids_in_page = []
2559
2560                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2561                                 if mobj.group(1) not in ids_in_page:
2562                                         ids_in_page.append(mobj.group(1))
2563
2564                         video_ids.extend(ids_in_page)
2565
2566                         # A little optimization - if current page is not
2567                         # "full", ie. does not contain PAGE_SIZE video ids then
2568                         # we can assume that this page is the last one - there
2569                         # are no more ids on further pages - no need to query
2570                         # again.
2571
2572                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2573                                 break
2574
2575                         pagenum += 1
2576
2577                 all_ids_count = len(video_ids)
2578                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2579                 playlistend = self._downloader.params.get('playlistend', -1)
2580
2581                 if playlistend == -1:
2582                         video_ids = video_ids[playliststart:]
2583                 else:
2584                         video_ids = video_ids[playliststart:playlistend]
2585
2586                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2587                                 (username, all_ids_count, len(video_ids)))
2588
2589                 for video_id in video_ids:
2590                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2591
2592
2593 class DepositFilesIE(InfoExtractor):
2594         """Information extractor for depositfiles.com"""
2595
2596         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2597         IE_NAME = u'DepositFiles'
2598
2599         def __init__(self, downloader=None):
2600                 InfoExtractor.__init__(self, downloader)
2601
2602         def report_download_webpage(self, file_id):
2603                 """Report webpage download."""
2604                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2605
2606         def report_extraction(self, file_id):
2607                 """Report information extraction."""
2608                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2609
2610         def _real_initialize(self):
2611                 return
2612
2613         def _real_extract(self, url):
2614                 # At this point we have a new file
2615                 self._downloader.increment_downloads()
2616
2617                 file_id = url.split('/')[-1]
2618                 # Rebuild url in english locale
2619                 url = 'http://depositfiles.com/en/files/' + file_id
2620
2621                 # Retrieve file webpage with 'Free download' button pressed
2622                 free_download_indication = { 'gateway_result' : '1' }
2623                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2624                 try:
2625                         self.report_download_webpage(file_id)
2626                         webpage = urllib2.urlopen(request).read()
2627                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2628                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2629                         return
2630
2631                 # Search for the real file URL
2632                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2633                 if (mobj is None) or (mobj.group(1) is None):
2634                         # Try to figure out reason of the error.
2635                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2636                         if (mobj is not None) and (mobj.group(1) is not None):
2637                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2638                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2639                         else:
2640                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2641                         return
2642
2643                 file_url = mobj.group(1)
2644                 file_extension = os.path.splitext(file_url)[1][1:]
2645
2646                 # Search for file title
2647                 mobj = re.search(r'<b title="(.*?)">', webpage)
2648                 if mobj is None:
2649                         self._downloader.trouble(u'ERROR: unable to extract title')
2650                         return
2651                 file_title = mobj.group(1).decode('utf-8')
2652
2653                 try:
2654                         # Process file information
2655                         self._downloader.process_info({
2656                                 'id':           file_id.decode('utf-8'),
2657                                 'url':          file_url.decode('utf-8'),
2658                                 'uploader':     u'NA',
2659                                 'upload_date':  u'NA',
2660                                 'title':        file_title,
2661                                 'stitle':       file_title,
2662                                 'ext':          file_extension.decode('utf-8'),
2663                                 'format':       u'NA',
2664                                 'player_url':   None,
2665                         })
2666                 except UnavailableVideoError, err:
2667                         self._downloader.trouble(u'ERROR: unable to download file')
2668
2669
2670 class FacebookIE(InfoExtractor):
2671         """Information Extractor for Facebook"""
2672
2673         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2674         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2675         _NETRC_MACHINE = 'facebook'
2676         _available_formats = ['highqual', 'lowqual']
2677         _video_extensions = {
2678                 'highqual': 'mp4',
2679                 'lowqual': 'mp4',
2680         }
2681         IE_NAME = u'facebook'
2682
2683         def __init__(self, downloader=None):
2684                 InfoExtractor.__init__(self, downloader)
2685
2686         def _reporter(self, message):
2687                 """Add header and report message."""
2688                 self._downloader.to_screen(u'[facebook] %s' % message)
2689
2690         def report_login(self):
2691                 """Report attempt to log in."""
2692                 self._reporter(u'Logging in')
2693
2694         def report_video_webpage_download(self, video_id):
2695                 """Report attempt to download video webpage."""
2696                 self._reporter(u'%s: Downloading video webpage' % video_id)
2697
2698         def report_information_extraction(self, video_id):
2699                 """Report attempt to extract video information."""
2700                 self._reporter(u'%s: Extracting video information' % video_id)
2701
2702         def _parse_page(self, video_webpage):
2703                 """Extract video information from page"""
2704                 # General data
2705                 data = {'title': r'class="video_title datawrap">(.*?)</',
2706                         'description': r'<div class="datawrap">(.*?)</div>',
2707                         'owner': r'\("video_owner_name", "(.*?)"\)',
2708                         'upload_date': r'data-date="(.*?)"',
2709                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2710                         }
2711                 video_info = {}
2712                 for piece in data.keys():
2713                         mobj = re.search(data[piece], video_webpage)
2714                         if mobj is not None:
2715                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2716
2717                 # Video urls
2718                 video_urls = {}
2719                 for fmt in self._available_formats:
2720                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2721                         if mobj is not None:
2722                                 # URL is in a Javascript segment inside an escaped Unicode format within
2723                                 # the generally utf-8 page
2724                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2725                 video_info['video_urls'] = video_urls
2726
2727                 return video_info
2728
2729         def _real_initialize(self):
2730                 if self._downloader is None:
2731                         return
2732
2733                 useremail = None
2734                 password = None
2735                 downloader_params = self._downloader.params
2736
2737                 # Attempt to use provided username and password or .netrc data
2738                 if downloader_params.get('username', None) is not None:
2739                         useremail = downloader_params['username']
2740                         password = downloader_params['password']
2741                 elif downloader_params.get('usenetrc', False):
2742                         try:
2743                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2744                                 if info is not None:
2745                                         useremail = info[0]
2746                                         password = info[2]
2747                                 else:
2748                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2749                         except (IOError, netrc.NetrcParseError), err:
2750                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2751                                 return
2752
2753                 if useremail is None:
2754                         return
2755
2756                 # Log in
2757                 login_form = {
2758                         'email': useremail,
2759                         'pass': password,
2760                         'login': 'Log+In'
2761                         }
2762                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2763                 try:
2764                         self.report_login()
2765                         login_results = urllib2.urlopen(request).read()
2766                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2767                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2768                                 return
2769                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2770                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2771                         return
2772
2773         def _real_extract(self, url):
2774                 mobj = re.match(self._VALID_URL, url)
2775                 if mobj is None:
2776                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2777                         return
2778                 video_id = mobj.group('ID')
2779
2780                 # Get video webpage
2781                 self.report_video_webpage_download(video_id)
2782                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2783                 try:
2784                         page = urllib2.urlopen(request)
2785                         video_webpage = page.read()
2786                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2787                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2788                         return
2789
2790                 # Start extracting information
2791                 self.report_information_extraction(video_id)
2792
2793                 # Extract information
2794                 video_info = self._parse_page(video_webpage)
2795
2796                 # uploader
2797                 if 'owner' not in video_info:
2798                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2799                         return
2800                 video_uploader = video_info['owner']
2801
2802                 # title
2803                 if 'title' not in video_info:
2804                         self._downloader.trouble(u'ERROR: unable to extract video title')
2805                         return
2806                 video_title = video_info['title']
2807                 video_title = video_title.decode('utf-8')
2808                 video_title = sanitize_title(video_title)
2809
2810                 # simplified title
2811                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2812                 simple_title = simple_title.strip(ur'_')
2813
2814                 # thumbnail image
2815                 if 'thumbnail' not in video_info:
2816                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2817                         video_thumbnail = ''
2818                 else:
2819                         video_thumbnail = video_info['thumbnail']
2820
2821                 # upload date
2822                 upload_date = u'NA'
2823                 if 'upload_date' in video_info:
2824                         upload_time = video_info['upload_date']
2825                         timetuple = email.utils.parsedate_tz(upload_time)
2826                         if timetuple is not None:
2827                                 try:
2828                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2829                                 except:
2830                                         pass
2831
2832                 # description
2833                 video_description = video_info.get('description', 'No description available.')
2834
2835                 url_map = video_info['video_urls']
2836                 if len(url_map.keys()) > 0:
2837                         # Decide which formats to download
2838                         req_format = self._downloader.params.get('format', None)
2839                         format_limit = self._downloader.params.get('format_limit', None)
2840
2841                         if format_limit is not None and format_limit in self._available_formats:
2842                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2843                         else:
2844                                 format_list = self._available_formats
2845                         existing_formats = [x for x in format_list if x in url_map]
2846                         if len(existing_formats) == 0:
2847                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2848                                 return
2849                         if req_format is None:
2850                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2851                         elif req_format == 'worst':
2852                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2853                         elif req_format == '-1':
2854                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2855                         else:
2856                                 # Specific format
2857                                 if req_format not in url_map:
2858                                         self._downloader.trouble(u'ERROR: requested format not available')
2859                                         return
2860                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2861
2862                 for format_param, video_real_url in video_url_list:
2863
2864                         # At this point we have a new video
2865                         self._downloader.increment_downloads()
2866
2867                         # Extension
2868                         video_extension = self._video_extensions.get(format_param, 'mp4')
2869
2870                         try:
2871                                 # Process video information
2872                                 self._downloader.process_info({
2873                                         'id':           video_id.decode('utf-8'),
2874                                         'url':          video_real_url.decode('utf-8'),
2875                                         'uploader':     video_uploader.decode('utf-8'),
2876                                         'upload_date':  upload_date,
2877                                         'title':        video_title,
2878                                         'stitle':       simple_title,
2879                                         'ext':          video_extension.decode('utf-8'),
2880                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2881                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2882                                         'description':  video_description.decode('utf-8'),
2883                                         'player_url':   None,
2884                                 })
2885                         except UnavailableVideoError, err:
2886                                 self._downloader.trouble(u'\nERROR: unable to download video')
2887
2888 class BlipTVIE(InfoExtractor):
2889         """Information extractor for blip.tv"""
2890
2891         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2892         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2893         IE_NAME = u'blip.tv'
2894
2895         def report_extraction(self, file_id):
2896                 """Report information extraction."""
2897                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2898
2899         def _simplify_title(self, title):
2900                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2901                 res = res.strip(ur'_')
2902                 return res
2903
2904         def _real_extract(self, url):
2905                 mobj = re.match(self._VALID_URL, url)
2906                 if mobj is None:
2907                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2908                         return
2909
2910                 if '?' in url:
2911                         cchar = '&'
2912                 else:
2913                         cchar = '?'
2914                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2915                 request = urllib2.Request(json_url)
2916                 self.report_extraction(mobj.group(1))
2917                 try:
2918                         json_code = urllib2.urlopen(request).read()
2919                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2920                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2921                         return
2922                 try:
2923                         json_data = json.loads(json_code)
2924                         if 'Post' in json_data:
2925                                 data = json_data['Post']
2926                         else:
2927                                 data = json_data
2928
2929                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2930                         video_url = data['media']['url']
2931                         umobj = re.match(self._URL_EXT, video_url)
2932                         if umobj is None:
2933                                 raise ValueError('Can not determine filename extension')
2934                         ext = umobj.group(1)
2935
2936                         self._downloader.increment_downloads()
2937
2938                         info = {
2939                                 'id': data['item_id'],
2940                                 'url': video_url,
2941                                 'uploader': data['display_name'],
2942                                 'upload_date': upload_date,
2943                                 'title': data['title'],
2944                                 'stitle': self._simplify_title(data['title']),
2945                                 'ext': ext,
2946                                 'format': data['media']['mimeType'],
2947                                 'thumbnail': data['thumbnailUrl'],
2948                                 'description': data['description'],
2949                                 'player_url': data['embedUrl']
2950                         }
2951                 except (ValueError,KeyError), err:
2952                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2953                         return
2954
2955                 try:
2956                         self._downloader.process_info(info)
2957                 except UnavailableVideoError, err:
2958                         self._downloader.trouble(u'\nERROR: unable to download video')
2959
2960
2961 class MyVideoIE(InfoExtractor):
2962         """Information Extractor for myvideo.de."""
2963
2964         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2965         IE_NAME = u'myvideo'
2966
2967         def __init__(self, downloader=None):
2968                 InfoExtractor.__init__(self, downloader)
2969
2970         def report_download_webpage(self, video_id):
2971                 """Report webpage download."""
2972                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2973
2974         def report_extraction(self, video_id):
2975                 """Report information extraction."""
2976                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2977
2978         def _real_initialize(self):
2979                 return
2980
2981         def _real_extract(self,url):
2982                 mobj = re.match(self._VALID_URL, url)
2983                 if mobj is None:
2984                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2985                         return
2986
2987                 video_id = mobj.group(1)
2988                 simple_title = mobj.group(2).decode('utf-8')
2989                 # should actually not be necessary
2990                 simple_title = sanitize_title(simple_title)
2991                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2992
2993                 # Get video webpage
2994                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2995                 try:
2996                         self.report_download_webpage(video_id)
2997                         webpage = urllib2.urlopen(request).read()
2998                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2999                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3000                         return
3001
3002                 self.report_extraction(video_id)
3003                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3004                                  webpage)
3005                 if mobj is None:
3006                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3007                         return
3008                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3009
3010                 mobj = re.search('<title>([^<]+)</title>', webpage)
3011                 if mobj is None:
3012                         self._downloader.trouble(u'ERROR: unable to extract title')
3013                         return
3014
3015                 video_title = mobj.group(1)
3016                 video_title = sanitize_title(video_title)
3017
3018                 try:
3019                         print(video_url)
3020                         self._downloader.process_info({
3021                                 'id':           video_id,
3022                                 'url':          video_url,
3023                                 'uploader':     u'NA',
3024                                 'upload_date':  u'NA',
3025                                 'title':        video_title,
3026                                 'stitle':       simple_title,
3027                                 'ext':          u'flv',
3028                                 'format':       u'NA',
3029                                 'player_url':   None,
3030                         })
3031                 except UnavailableVideoError:
3032                         self._downloader.trouble(u'\nERROR: Unable to download video')
3033
3034 class ComedyCentralIE(InfoExtractor):
3035         """Information extractor for The Daily Show and Colbert Report """
3036
3037         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3038         IE_NAME = u'comedycentral'
3039
3040         def report_extraction(self, episode_id):
3041                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3042
3043         def report_config_download(self, episode_id):
3044                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3045
3046         def report_index_download(self, episode_id):
3047                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3048
3049         def report_player_url(self, episode_id):
3050                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3051
3052         def _simplify_title(self, title):
3053                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3054                 res = res.strip(ur'_')
3055                 return res
3056
3057         def _real_extract(self, url):
3058                 mobj = re.match(self._VALID_URL, url)
3059                 if mobj is None:
3060                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3061                         return
3062
3063                 if mobj.group('shortname'):
3064                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3065                                 url = 'http://www.thedailyshow.com/full-episodes/'
3066                         else:
3067                                 url = 'http://www.colbertnation.com/full-episodes/'
3068                         mobj = re.match(self._VALID_URL, url)
3069                         assert mobj is not None
3070
3071                 dlNewest = not mobj.group('episode')
3072                 if dlNewest:
3073                         epTitle = mobj.group('showname')
3074                 else:
3075                         epTitle = mobj.group('episode')
3076
3077                 req = urllib2.Request(url)
3078                 self.report_extraction(epTitle)
3079                 try:
3080                         htmlHandle = urllib2.urlopen(req)
3081                         html = htmlHandle.read()
3082                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3083                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3084                         return
3085                 if dlNewest:
3086                         url = htmlHandle.geturl()
3087                         mobj = re.match(self._VALID_URL, url)
3088                         if mobj is None:
3089                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3090                                 return
3091                         if mobj.group('episode') == '':
3092                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3093                                 return
3094                         epTitle = mobj.group('episode')
3095
3096                 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3097                 if len(mMovieParams) == 0:
3098                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3099                         return
3100
3101                 playerUrl_raw = mMovieParams[0][0]
3102                 self.report_player_url(epTitle)
3103                 try:
3104                         urlHandle = urllib2.urlopen(playerUrl_raw)
3105                         playerUrl = urlHandle.geturl()
3106                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3107                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3108                         return
3109
3110                 uri = mMovieParams[0][1]
3111                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3112                 self.report_index_download(epTitle)
3113                 try:
3114                         indexXml = urllib2.urlopen(indexUrl).read()
3115                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3116                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3117                         return
3118
3119                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3120                 itemEls = idoc.findall('.//item')
3121                 for itemEl in itemEls:
3122                         mediaId = itemEl.findall('./guid')[0].text
3123                         shortMediaId = mediaId.split(':')[-1]
3124                         showId = mediaId.split(':')[-2].replace('.com', '')
3125                         officialTitle = itemEl.findall('./title')[0].text
3126                         officialDate = itemEl.findall('./pubDate')[0].text
3127
3128                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3129                                                 urllib.urlencode({'uri': mediaId}))
3130                         configReq = urllib2.Request(configUrl)
3131                         self.report_config_download(epTitle)
3132                         try:
3133                                 configXml = urllib2.urlopen(configReq).read()
3134                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3135                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3136                                 return
3137
3138                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3139                         turls = []
3140                         for rendition in cdoc.findall('.//rendition'):
3141                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3142                                 turls.append(finfo)
3143
3144                         if len(turls) == 0:
3145                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3146                                 continue
3147
3148                         # For now, just pick the highest bitrate
3149                         format,video_url = turls[-1]
3150
3151                         self._downloader.increment_downloads()
3152
3153                         effTitle = showId + '-' + epTitle
3154                         info = {
3155                                 'id': shortMediaId,
3156                                 'url': video_url,
3157                                 'uploader': showId,
3158                                 'upload_date': officialDate,
3159                                 'title': effTitle,
3160                                 'stitle': self._simplify_title(effTitle),
3161                                 'ext': 'mp4',
3162                                 'format': format,
3163                                 'thumbnail': None,
3164                                 'description': officialTitle,
3165                                 'player_url': playerUrl
3166                         }
3167
3168                         try:
3169                                 self._downloader.process_info(info)
3170                         except UnavailableVideoError, err:
3171                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3172                                 continue
3173
3174
3175 class EscapistIE(InfoExtractor):
3176         """Information extractor for The Escapist """
3177
3178         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3179         IE_NAME = u'escapist'
3180
3181         def report_extraction(self, showName):
3182                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3183
3184         def report_config_download(self, showName):
3185                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3186
3187         def _simplify_title(self, title):
3188                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3189                 res = res.strip(ur'_')
3190                 return res
3191
3192         def _real_extract(self, url):
3193                 htmlParser = HTMLParser.HTMLParser()
3194
3195                 mobj = re.match(self._VALID_URL, url)
3196                 if mobj is None:
3197                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3198                         return
3199                 showName = mobj.group('showname')
3200                 videoId = mobj.group('episode')
3201
3202                 self.report_extraction(showName)
3203                 try:
3204                         webPage = urllib2.urlopen(url).read()
3205                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3206                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3207                         return
3208
3209                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3210                 description = htmlParser.unescape(descMatch.group(1))
3211                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3212                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3213                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3214                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3215                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3216                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3217
3218                 self.report_config_download(showName)
3219                 try:
3220                         configJSON = urllib2.urlopen(configUrl).read()
3221                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3222                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3223                         return
3224
3225                 # Technically, it's JavaScript, not JSON
3226                 configJSON = configJSON.replace("'", '"')
3227
3228                 try:
3229                         config = json.loads(configJSON)
3230                 except (ValueError,), err:
3231                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3232                         return
3233
3234                 playlist = config['playlist']
3235                 videoUrl = playlist[1]['url']
3236
3237                 self._downloader.increment_downloads()
3238                 info = {
3239                         'id': videoId,
3240                         'url': videoUrl,
3241                         'uploader': showName,
3242                         'upload_date': None,
3243                         'title': showName,
3244                         'stitle': self._simplify_title(showName),
3245                         'ext': 'flv',
3246                         'format': 'flv',
3247                         'thumbnail': imgUrl,
3248                         'description': description,
3249                         'player_url': playerUrl,
3250                 }
3251
3252                 try:
3253                         self._downloader.process_info(info)
3254                 except UnavailableVideoError, err:
3255                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3256
3257
3258
3259 class PostProcessor(object):
3260         """Post Processor class.
3261
3262         PostProcessor objects can be added to downloaders with their
3263         add_post_processor() method. When the downloader has finished a
3264         successful download, it will take its internal chain of PostProcessors
3265         and start calling the run() method on each one of them, first with
3266         an initial argument and then with the returned value of the previous
3267         PostProcessor.
3268
3269         The chain will be stopped if one of them ever returns None or the end
3270         of the chain is reached.
3271
3272         PostProcessor objects follow a "mutual registration" process similar
3273         to InfoExtractor objects.
3274         """
3275
3276         _downloader = None
3277
3278         def __init__(self, downloader=None):
3279                 self._downloader = downloader
3280
3281         def set_downloader(self, downloader):
3282                 """Sets the downloader for this PP."""
3283                 self._downloader = downloader
3284
3285         def run(self, information):
3286                 """Run the PostProcessor.
3287
3288                 The "information" argument is a dictionary like the ones
3289                 composed by InfoExtractors. The only difference is that this
3290                 one has an extra field called "filepath" that points to the
3291                 downloaded file.
3292
3293                 When this method returns None, the postprocessing chain is
3294                 stopped. However, this method may return an information
3295                 dictionary that will be passed to the next postprocessing
3296                 object in the chain. It can be the one it received after
3297                 changing some fields.
3298
3299                 In addition, this method may raise a PostProcessingError
3300                 exception that will be taken into account by the downloader
3301                 it was called from.
3302                 """
3303                 return information # by default, do nothing
3304
3305
3306 class FFmpegExtractAudioPP(PostProcessor):
3307
3308         def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3309                 PostProcessor.__init__(self, downloader)
3310                 if preferredcodec is None:
3311                         preferredcodec = 'best'
3312                 self._preferredcodec = preferredcodec
3313                 self._preferredquality = preferredquality
3314                 self._keepvideo = keepvideo
3315
3316         @staticmethod
3317         def get_audio_codec(path):
3318                 try:
3319                         cmd = ['ffprobe', '-show_streams', '--', path]
3320                         handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3321                         output = handle.communicate()[0]
3322                         if handle.wait() != 0:
3323                                 return None
3324                 except (IOError, OSError):
3325                         return None
3326                 audio_codec = None
3327                 for line in output.split('\n'):
3328                         if line.startswith('codec_name='):
3329                                 audio_codec = line.split('=')[1].strip()
3330                         elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3331                                 return audio_codec
3332                 return None
3333
3334         @staticmethod
3335         def run_ffmpeg(path, out_path, codec, more_opts):
3336                 try:
3337                         cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3338                         ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3339                         return (ret == 0)
3340                 except (IOError, OSError):
3341                         return False
3342
3343         def run(self, information):
3344                 path = information['filepath']
3345
3346                 filecodec = self.get_audio_codec(path)
3347                 if filecodec is None:
3348                         self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3349                         return None
3350
3351                 more_opts = []
3352                 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3353                         if filecodec == 'aac' or filecodec == 'mp3':
3354                                 # Lossless if possible
3355                                 acodec = 'copy'
3356                                 extension = filecodec
3357                                 if filecodec == 'aac':
3358                                         more_opts = ['-f', 'adts']
3359                         else:
3360                                 # MP3 otherwise.
3361                                 acodec = 'libmp3lame'
3362                                 extension = 'mp3'
3363                                 more_opts = []
3364                                 if self._preferredquality is not None:
3365                                         more_opts += ['-ab', self._preferredquality]
3366                 else:
3367                         # We convert the audio (lossy)
3368                         acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3369                         extension = self._preferredcodec
3370                         more_opts = []
3371                         if self._preferredquality is not None:
3372                                 more_opts += ['-ab', self._preferredquality]
3373                         if self._preferredcodec == 'aac':
3374                                 more_opts += ['-f', 'adts']
3375
3376                 (prefix, ext) = os.path.splitext(path)
3377                 new_path = prefix + '.' + extension
3378                 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3379                 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3380
3381                 if not status:
3382                         self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3383                         return None
3384
3385                 # Try to update the date time for extracted audio file.
3386                 if information.get('filetime') is not None:
3387                         try:
3388                                 os.utime(new_path, (time.time(), information['filetime']))
3389                         except:
3390                                 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
3391
3392                 if not self._keepvideo:
3393                         try:
3394                                 os.remove(path)
3395                         except (IOError, OSError):
3396                                 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3397                                 return None
3398
3399                 information['filepath'] = new_path
3400                 return information
3401
3402
3403 def updateSelf(downloader, filename):
3404         ''' Update the program file with the latest version from the repository '''
3405         # Note: downloader only used for options
3406         if not os.access(filename, os.W_OK):
3407                 sys.exit('ERROR: no write permissions on %s' % filename)
3408
3409         downloader.to_screen('Updating to latest version...')
3410
3411         try:
3412                 try:
3413                         urlh = urllib.urlopen(UPDATE_URL)
3414                         newcontent = urlh.read()
3415
3416                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
3417                         if vmatch is not None and vmatch.group(1) == __version__:
3418                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3419                                 return
3420                 finally:
3421                         urlh.close()
3422         except (IOError, OSError), err:
3423                 sys.exit('ERROR: unable to download latest version')
3424
3425         try:
3426                 outf = open(filename, 'wb')
3427                 try:
3428                         outf.write(newcontent)
3429                 finally:
3430                         outf.close()
3431         except (IOError, OSError), err:
3432                 sys.exit('ERROR: unable to overwrite current version')
3433
3434         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3435
3436 def parseOpts():
3437         # Deferred imports
3438         import getpass
3439         import optparse
3440
3441         def _format_option_string(option):
3442                 ''' ('-o', '--option') -> -o, --format METAVAR'''
3443
3444                 opts = []
3445
3446                 if option._short_opts: opts.append(option._short_opts[0])
3447                 if option._long_opts: opts.append(option._long_opts[0])
3448                 if len(opts) > 1: opts.insert(1, ', ')
3449
3450                 if option.takes_value(): opts.append(' %s' % option.metavar)
3451
3452                 return "".join(opts)
3453
3454         def _find_term_columns():
3455                 columns = os.environ.get('COLUMNS', None)
3456                 if columns:
3457                         return int(columns)
3458
3459                 try:
3460                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3461                         out,err = sp.communicate()
3462                         return int(out.split()[1])
3463                 except:
3464                         pass
3465                 return None
3466
3467         max_width = 80
3468         max_help_position = 80
3469
3470         # No need to wrap help messages if we're on a wide console
3471         columns = _find_term_columns()
3472         if columns: max_width = columns
3473
3474         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3475         fmt.format_option_strings = _format_option_string
3476
3477         kw = {
3478                 'version'   : __version__,
3479                 'formatter' : fmt,
3480                 'usage' : '%prog [options] url [url...]',
3481                 'conflict_handler' : 'resolve',
3482         }
3483
3484         parser = optparse.OptionParser(**kw)
3485
3486         # option groups
3487         general        = optparse.OptionGroup(parser, 'General Options')
3488         selection      = optparse.OptionGroup(parser, 'Video Selection')
3489         authentication = optparse.OptionGroup(parser, 'Authentication Options')
3490         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
3491         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
3492         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
3493         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3494
3495         general.add_option('-h', '--help',
3496                         action='help', help='print this help text and exit')
3497         general.add_option('-v', '--version',
3498                         action='version', help='print program version and exit')
3499         general.add_option('-U', '--update',
3500                         action='store_true', dest='update_self', help='update this program to latest version')
3501         general.add_option('-i', '--ignore-errors',
3502                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3503         general.add_option('-r', '--rate-limit',
3504                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3505         general.add_option('-R', '--retries',
3506                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3507         general.add_option('--dump-user-agent',
3508                         action='store_true', dest='dump_user_agent',
3509                         help='display the current browser identification', default=False)
3510         general.add_option('--list-extractors',
3511                         action='store_true', dest='list_extractors',
3512                         help='List all supported extractors and the URLs they would handle', default=False)
3513
3514         selection.add_option('--playlist-start',
3515                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3516         selection.add_option('--playlist-end',
3517                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3518         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3519         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3520
3521         authentication.add_option('-u', '--username',
3522                         dest='username', metavar='USERNAME', help='account username')
3523         authentication.add_option('-p', '--password',
3524                         dest='password', metavar='PASSWORD', help='account password')
3525         authentication.add_option('-n', '--netrc',
3526                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3527
3528
3529         video_format.add_option('-f', '--format',
3530                         action='store', dest='format', metavar='FORMAT', help='video format code')
3531         video_format.add_option('--all-formats',
3532                         action='store_const', dest='format', help='download all available video formats', const='all')
3533         video_format.add_option('--max-quality',
3534                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3535
3536
3537         verbosity.add_option('-q', '--quiet',
3538                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
3539         verbosity.add_option('-s', '--simulate',
3540                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3541         verbosity.add_option('--skip-download',
3542                         action='store_true', dest='skip_download', help='do not download the video', default=False)
3543         verbosity.add_option('-g', '--get-url',
3544                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3545         verbosity.add_option('-e', '--get-title',
3546                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3547         verbosity.add_option('--get-thumbnail',
3548                         action='store_true', dest='getthumbnail',
3549                         help='simulate, quiet but print thumbnail URL', default=False)
3550         verbosity.add_option('--get-description',
3551                         action='store_true', dest='getdescription',
3552                         help='simulate, quiet but print video description', default=False)
3553         verbosity.add_option('--get-filename',
3554                         action='store_true', dest='getfilename',
3555                         help='simulate, quiet but print output filename', default=False)
3556         verbosity.add_option('--get-format',
3557                         action='store_true', dest='getformat',
3558                         help='simulate, quiet but print output format', default=False)
3559         verbosity.add_option('--no-progress',
3560                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3561         verbosity.add_option('--console-title',
3562                         action='store_true', dest='consoletitle',
3563                         help='display progress in console titlebar', default=False)
3564
3565
3566         filesystem.add_option('-t', '--title',
3567                         action='store_true', dest='usetitle', help='use title in file name', default=False)
3568         filesystem.add_option('-l', '--literal',
3569                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3570         filesystem.add_option('-A', '--auto-number',
3571                         action='store_true', dest='autonumber',
3572                         help='number downloaded files starting from 00000', default=False)
3573         filesystem.add_option('-o', '--output',
3574                         dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3575         filesystem.add_option('-a', '--batch-file',
3576                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3577         filesystem.add_option('-w', '--no-overwrites',
3578                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3579         filesystem.add_option('-c', '--continue',
3580                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3581         filesystem.add_option('--no-continue',
3582                         action='store_false', dest='continue_dl',
3583                         help='do not resume partially downloaded files (restart from beginning)')
3584         filesystem.add_option('--cookies',
3585                         dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3586         filesystem.add_option('--no-part',
3587                         action='store_true', dest='nopart', help='do not use .part files', default=False)
3588         filesystem.add_option('--no-mtime',
3589                         action='store_false', dest='updatetime',
3590                         help='do not use the Last-modified header to set the file modification time', default=True)
3591         filesystem.add_option('--write-description',
3592                         action='store_true', dest='writedescription',
3593                         help='write video description to a .description file', default=False)
3594         filesystem.add_option('--write-info-json',
3595                         action='store_true', dest='writeinfojson',
3596                         help='write video metadata to a .info.json file', default=False)
3597
3598
3599         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3600                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3601         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3602                         help='"best", "aac" or "mp3"; best by default')
3603         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3604                         help='ffmpeg audio bitrate specification, 128k by default')
3605         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3606                         help='keeps the video file on disk after the post-processing; the video is erased by default')
3607
3608
3609         parser.add_option_group(general)
3610         parser.add_option_group(selection)
3611         parser.add_option_group(filesystem)
3612         parser.add_option_group(verbosity)
3613         parser.add_option_group(video_format)
3614         parser.add_option_group(authentication)
3615         parser.add_option_group(postproc)
3616
3617         opts, args = parser.parse_args()
3618
3619         return parser, opts, args
3620
3621 def gen_extractors():
3622         """ Return a list of an instance of every supported extractor.
3623         The order does matter; the first extractor matched is the one handling the URL.
3624         """
3625         youtube_ie = YoutubeIE()
3626         google_ie = GoogleIE()
3627         yahoo_ie = YahooIE()
3628         return [
3629                 YoutubePlaylistIE(youtube_ie),
3630                 YoutubeUserIE(youtube_ie),
3631                 YoutubeSearchIE(youtube_ie),
3632                 youtube_ie,
3633                 MetacafeIE(youtube_ie),
3634                 DailymotionIE(),
3635                 google_ie,
3636                 GoogleSearchIE(google_ie),
3637                 PhotobucketIE(),
3638                 yahoo_ie,
3639                 YahooSearchIE(yahoo_ie),
3640                 DepositFilesIE(),
3641                 FacebookIE(),
3642                 BlipTVIE(),
3643                 VimeoIE(),
3644                 MyVideoIE(),
3645                 ComedyCentralIE(),
3646                 EscapistIE(),
3647
3648                 GenericIE()
3649         ]
3650
3651 def main():
3652         parser, opts, args = parseOpts()
3653
3654         # Open appropriate CookieJar
3655         if opts.cookiefile is None:
3656                 jar = cookielib.CookieJar()
3657         else:
3658                 try:
3659                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3660                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3661                                 jar.load()
3662                 except (IOError, OSError), err:
3663                         sys.exit(u'ERROR: unable to open cookie file')
3664
3665         # Dump user agent
3666         if opts.dump_user_agent:
3667                 print std_headers['User-Agent']
3668                 sys.exit(0)
3669
3670         # Batch file verification
3671         batchurls = []
3672         if opts.batchfile is not None:
3673                 try:
3674                         if opts.batchfile == '-':
3675                                 batchfd = sys.stdin
3676                         else:
3677                                 batchfd = open(opts.batchfile, 'r')
3678                         batchurls = batchfd.readlines()
3679                         batchurls = [x.strip() for x in batchurls]
3680                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3681                 except IOError:
3682                         sys.exit(u'ERROR: batch file could not be read')
3683         all_urls = batchurls + args
3684
3685         # General configuration
3686         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3687         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3688         urllib2.install_opener(opener)
3689         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3690
3691         extractors = gen_extractors()
3692
3693         if opts.list_extractors:
3694                 for ie in extractors:
3695                         print(ie.IE_NAME)
3696                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3697                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3698                         for mu in matchedUrls:
3699                                 print(u'  ' + mu)
3700                 sys.exit(0)
3701
3702         # Conflicting, missing and erroneous options
3703         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3704                 parser.error(u'using .netrc conflicts with giving username/password')
3705         if opts.password is not None and opts.username is None:
3706                 parser.error(u'account username missing')
3707         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3708                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3709         if opts.usetitle and opts.useliteral:
3710                 parser.error(u'using title conflicts with using literal title')
3711         if opts.username is not None and opts.password is None:
3712                 opts.password = getpass.getpass(u'Type account password and press return:')
3713         if opts.ratelimit is not None:
3714                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3715                 if numeric_limit is None:
3716                         parser.error(u'invalid rate limit specified')
3717                 opts.ratelimit = numeric_limit
3718         if opts.retries is not None:
3719                 try:
3720                         opts.retries = long(opts.retries)
3721                 except (TypeError, ValueError), err:
3722                         parser.error(u'invalid retry count specified')
3723         try:
3724                 opts.playliststart = int(opts.playliststart)
3725                 if opts.playliststart <= 0:
3726                         raise ValueError(u'Playlist start must be positive')
3727         except (TypeError, ValueError), err:
3728                 parser.error(u'invalid playlist start number specified')
3729         try:
3730                 opts.playlistend = int(opts.playlistend)
3731                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3732                         raise ValueError(u'Playlist end must be greater than playlist start')
3733         except (TypeError, ValueError), err:
3734                 parser.error(u'invalid playlist end number specified')
3735         if opts.extractaudio:
3736                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3737                         parser.error(u'invalid audio format specified')
3738
3739         # File downloader
3740         fd = FileDownloader({
3741                 'usenetrc': opts.usenetrc,
3742                 'username': opts.username,
3743                 'password': opts.password,
3744                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3745                 'forceurl': opts.geturl,
3746                 'forcetitle': opts.gettitle,
3747                 'forcethumbnail': opts.getthumbnail,
3748                 'forcedescription': opts.getdescription,
3749                 'forcefilename': opts.getfilename,
3750                 'forceformat': opts.getformat,
3751                 'simulate': opts.simulate,
3752                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3753                 'format': opts.format,
3754                 'format_limit': opts.format_limit,
3755                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3756                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3757                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3758                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3759                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3760                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3761                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3762                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3763                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3764                         or u'%(id)s.%(ext)s'),
3765                 'ignoreerrors': opts.ignoreerrors,
3766                 'ratelimit': opts.ratelimit,
3767                 'nooverwrites': opts.nooverwrites,
3768                 'retries': opts.retries,
3769                 'continuedl': opts.continue_dl,
3770                 'noprogress': opts.noprogress,
3771                 'playliststart': opts.playliststart,
3772                 'playlistend': opts.playlistend,
3773                 'logtostderr': opts.outtmpl == '-',
3774                 'consoletitle': opts.consoletitle,
3775                 'nopart': opts.nopart,
3776                 'updatetime': opts.updatetime,
3777                 'writedescription': opts.writedescription,
3778                 'writeinfojson': opts.writeinfojson,
3779                 'matchtitle': opts.matchtitle,
3780                 'rejecttitle': opts.rejecttitle,
3781                 })
3782         for extractor in extractors:
3783                 fd.add_info_extractor(extractor)
3784
3785         # PostProcessors
3786         if opts.extractaudio:
3787                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3788
3789         # Update version
3790         if opts.update_self:
3791                 updateSelf(fd, sys.argv[0])
3792
3793         # Maybe do nothing
3794         if len(all_urls) < 1:
3795                 if not opts.update_self:
3796                         parser.error(u'you must provide at least one URL')
3797                 else:
3798                         sys.exit()
3799         retcode = fd.download(all_urls)
3800
3801         # Dump cookie jar if requested
3802         if opts.cookiefile is not None:
3803                 try:
3804                         jar.save()
3805                 except (IOError, OSError), err:
3806                         sys.exit(u'ERROR: unable to save cookie jar')
3807
3808         sys.exit(retcode)
3809
3810
3811 if __name__ == '__main__':
3812         try:
3813                 main()
3814         except DownloadError:
3815                 sys.exit(1)
3816         except SameFileError:
3817                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3818         except KeyboardInterrupt:
3819                 sys.exit(u'\nERROR: Interrupted by user')
3820
3821 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: