youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
# People credited as authors of this script.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        'Kevin Ngo',
        'Ori Avtalion',
        )

# License the script is distributed under.
__license__ = 'Public Domain'
# Version string of this copy of the script.
__version__ = '2011.10.19'

# NOTE(review): presumably the address the self-update feature fetches the
# latest script from -- the code using it is not visible here; confirm.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
  23
  24 import cookielib
  25 import datetime
  26 import gzip
  27 import htmlentitydefs
  28 import HTMLParser
  29 import httplib
  30 import locale
  31 import math
  32 import netrc
  33 import os
  34 import os.path
  35 import re
  36 import socket
  37 import string
  38 import subprocess
  39 import sys
  40 import time
  41 import urllib
  42 import urllib2
  43 import warnings
  44 import zlib
  45
  46 if os.name == 'nt':
  47         import ctypes
  48
  49 try:
  50         import email.utils
  51 except ImportError: # Python 2.4
  52         import email.Utils
  53 try:
  54         import cStringIO as StringIO
  55 except ImportError:
  56         import StringIO
  57
  58 # parse_qs was moved from the cgi module to the urlparse module recently.
  59 try:
  60         from urlparse import parse_qs
  61 except ImportError:
  62         from cgi import parse_qs
  63
  64 try:
  65         import lxml.etree
  66 except ImportError:
  67         pass # Handled below
  68
  69 try:
  70         import xml.etree.ElementTree
  71 except ImportError: # Python<2.5: Not officially supported, but let it slip
  72         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
  73
# HTTP headers that YoutubeDLHandler.http_request() sets on every request it
# processes, replacing any header of the same name already on the request.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string holding the ASCII letters followed by the ASCII digits.
# NOTE(review): presumably the characters allowed in "simple" titles -- the
# code using it is not visible here; confirm.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  83
# Use the standard json module when it exists; otherwise define a class named
# json that offers a compatible loads() so callers can use json.loads() either way.
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                """Minimal stand-in for the json module; only loads() is provided."""
                @staticmethod
                def loads(s):
                        """Parse the UTF-8 encoded JSON document s into Python objects.

                        Objects become dicts, arrays become lists, strings become
                        unicode strings, numbers become int or float, and
                        true/false/null become True/False/None.

                        Raises ValueError when a syntax error is detected or when
                        data remains after the top-level value.
                        """
                        s = s.decode('UTF-8')
                        # Every parse* helper below takes the index i at which its
                        # value starts and returns (index just past the value, value).
                        def raiseError(msg, i):
                                # Report msg together with the position and the unparsed rest
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past whitespace; with expectMore, reaching the
                                # end of the input is an error.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # re.sub() callback: group 1 is the text after the backslash
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        # Single \uXXXX escape
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # Two \uXXXX escapes forming a surrogate pair:
                                        # combine them into one code point above U+FFFF
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote
                                i += 1
                                e = i
                                # Find the closing quote: the first '"' preceded by an
                                # even number of backslashes (an odd count means the
                                # quote itself is escaped).
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Alternatives are tried in order: surrogate pair, single
                                # \u escape, any other escaped character, trailing backslash
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at the opening curly brace
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                # One iteration per "key": value member
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at the opening bracket
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                # One iteration per array element
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # Literals true, false and null
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or exponent makes the value a float
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; anything not
                        # listed here is handed to parseNumber by parse().
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Parse one value of any type, skipping the whitespace
                                # around it; trailing whitespace may run to the end of input.
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        # The document must consist of exactly one value
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res

 196
 197 def preferredencoding():
 198         """Get preferred encoding.
 199
 200         Returns the best encoding scheme for the system, based on
 201         locale.getpreferredencoding() and some further tweaks.
 202         """
 203         def yield_preferredencoding():
 204                 try:
 205                         pref = locale.getpreferredencoding()
 206                         u'TEST'.encode(pref)
 207                 except:
 208                         pref = 'UTF-8'
 209                 while True:
 210                         yield pref
 211         return yield_preferredencoding().next()
 212
 213
 214 def htmlentity_transform(matchobj):
 215         """Transforms an HTML entity to a Unicode character.
 216
 217         This function receives a match object and is intended to be used with
 218         the re.sub() function.
 219         """
 220         entity = matchobj.group(1)
 221
 222         # Known non-numeric HTML entity
 223         if entity in htmlentitydefs.name2codepoint:
 224                 return unichr(htmlentitydefs.name2codepoint[entity])
 225
 226         # Unicode character
 227         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 228         if mobj is not None:
 229                 numstr = mobj.group(1)
 230                 if numstr.startswith(u'x'):
 231                         base = 16
 232                         numstr = u'0%s' % numstr
 233                 else:
 234                         base = 10
 235                 return unichr(long(numstr, base))
 236
 237         # Unknown entity in name, return its literal representation
 238         return (u'&%s;' % entity)
 239
 240
 241 def sanitize_title(utitle):
 242         """Sanitizes a video title so it could be used as part of a filename."""
 243         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 244         return utitle.replace(unicode(os.sep), u'%')
 245
 246
 247 def sanitize_open(filename, open_mode):
 248         """Try to open the given filename, and slightly tweak it if this fails.
 249
 250         Attempts to open the given filename. If this fails, it tries to change
 251         the filename slightly, step by step, until it's either able to open it
 252         or it fails and raises a final exception, like the standard open()
 253         function.
 254
 255         It returns the tuple (stream, definitive_file_name).
 256         """
 257         try:
 258                 if filename == u'-':
 259                         if sys.platform == 'win32':
 260                                 import msvcrt
 261                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 262                         return (sys.stdout, filename)
 263                 stream = open(filename, open_mode)
 264                 return (stream, filename)
 265         except (IOError, OSError), err:
 266                 # In case of error, try to remove win32 forbidden chars
 267                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 268
 269                 # An exception here should be caught in the caller
 270                 stream = open(filename, open_mode)
 271                 return (stream, filename)
 272
 273
 274 def timeconvert(timestr):
 275         """Convert RFC 2822 defined time string into system timestamp"""
 276         timestamp = None
 277         timetuple = email.utils.parsedate_tz(timestr)
 278         if timetuple is not None:
 279                 timestamp = email.utils.mktime_tz(timetuple)
 280         return timestamp
 281
 282
 283 class DownloadError(Exception):
 284         """Download Error exception.
 285
 286         This exception may be thrown by FileDownloader objects if they are not
 287         configured to continue on errors. They will contain the appropriate
 288         error message.
 289         """
 290         pass
 291
 292
 293 class SameFileError(Exception):
 294         """Same File exception.
 295
 296         This exception will be thrown by FileDownloader objects if they detect
 297         multiple files would have to be downloaded to the same file on disk.
 298         """
 299         pass
 300
 301
 302 class PostProcessingError(Exception):
 303         """Post Processing exception.
 304
 305         This exception may be raised by PostProcessor's .run() method to
 306         indicate an error in the postprocessing task.
 307         """
 308         pass
 309
 310
 311 class UnavailableVideoError(Exception):
 312         """Unavailable Format exception.
 313
 314         This exception will be thrown when a video is requested
 315         in a format that is not available for that video.
 316         """
 317         pass
 318
 319
 320 class ContentTooShortError(Exception):
 321         """Content Too Short exception.
 322
 323         This exception may be raised by FileDownloader objects when a file they
 324         download is too small for what the server announced first, indicating
 325         the connection was probably interrupted.
 326         """
 327         # Both in bytes
 328         downloaded = None
 329         expected = None
 330
 331         def __init__(self, downloaded, expected):
 332                 self.downloaded = downloaded
 333                 self.expected = expected
 334
 335
 336 class YoutubeDLHandler(urllib2.HTTPHandler):
 337         """Handler for HTTP requests and responses.
 338
 339         This class, when installed with an OpenerDirector, automatically adds
 340         the standard headers to every HTTP request and handles gzipped and
 341         deflated responses from web servers. If compression is to be avoided in
 342         a particular request, the original request in the program code only has
 343         to include the HTTP header "Youtubedl-No-Compression", which will be
 344         removed before making the real request.
 345
 346         Part of this code was copied from:
 347
 348         http://techknack.net/python-urllib2-handlers/
 349
 350         Andrew Rowls, the author of that code, agreed to release it to the
 351         public domain.
 352         """
 353
 354         @staticmethod
 355         def deflate(data):
 356                 try:
 357                         return zlib.decompress(data, -zlib.MAX_WBITS)
 358                 except zlib.error:
 359                         return zlib.decompress(data)
 360
 361         @staticmethod
 362         def addinfourl_wrapper(stream, headers, url, code):
 363                 if hasattr(urllib2.addinfourl, 'getcode'):
 364                         return urllib2.addinfourl(stream, headers, url, code)
 365                 ret = urllib2.addinfourl(stream, headers, url)
 366                 ret.code = code
 367                 return ret
 368
 369         def http_request(self, req):
 370                 for h in std_headers:
 371                         if h in req.headers:
 372                                 del req.headers[h]
 373                         req.add_header(h, std_headers[h])
 374                 if 'Youtubedl-no-compression' in req.headers:
 375                         if 'Accept-encoding' in req.headers:
 376                                 del req.headers['Accept-encoding']
 377                         del req.headers['Youtubedl-no-compression']
 378                 return req
 379
 380         def http_response(self, req, resp):
 381                 old_resp = resp
 382                 # gzip
 383                 if resp.headers.get('Content-encoding', '') == 'gzip':
 384                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 385                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 386                         resp.msg = old_resp.msg
 387                 # deflate
 388                 if resp.headers.get('Content-encoding', '') == 'deflate':
 389                         gz = StringIO.StringIO(self.deflate(resp.read()))
 390                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 391                         resp.msg = old_resp.msg
 392                 return resp
 393
 394
 395 class FileDownloader(object):
 396         """File Downloader class.
 397
 398         File downloader objects are the ones responsible of downloading the
 399         actual video file and writing it to disk if the user has requested
 400         it, among some other tasks. In most cases there should be one per
 401         program. As, given a video URL, the downloader doesn't know how to
 402         extract all the needed information, task that InfoExtractors do, it
 403         has to pass the URL to one of them.
 404
 405         For this, file downloader objects have a method that allows
 406         InfoExtractors to be registered in a given order. When it is passed
 407         a URL, the file downloader handles it to the first InfoExtractor it
 408         finds that reports being able to handle it. The InfoExtractor extracts
 409         all the information about the video or videos the URL refers to, and
 410         asks the FileDownloader to process the video information, possibly
 411         downloading the video.
 412
 413         File downloaders accept a lot of parameters. In order not to saturate
 414         the object constructor with arguments, it receives a dictionary of
 415         options instead. These options are available through the params
 416         attribute for the InfoExtractors to use. The FileDownloader also
 417         registers itself as the downloader in charge for the InfoExtractors
 418         that are added to it, so this is a "mutual registration".
 419
 420         Available options:
 421
 422         username:         Username for authentication purposes.
 423         password:         Password for authentication purposes.
 424         usenetrc:         Use netrc for authentication instead.
 425         quiet:            Do not print messages to stdout.
 426         forceurl:         Force printing final URL.
 427         forcetitle:       Force printing title.
 428         forcethumbnail:   Force printing thumbnail URL.
 429         forcedescription: Force printing description.
 430         forcefilename:    Force printing final filename.
 431         simulate:         Do not download the video files.
 432         format:           Video format code.
 433         format_limit:     Highest quality format to try.
 434         outtmpl:          Template for output names.
 435         ignoreerrors:     Do not stop on download errors.
 436         ratelimit:        Download speed limit, in bytes/sec.
 437         nooverwrites:     Prevent overwriting files.
 438         retries:          Number of times to retry for HTTP error 5xx
 439         continuedl:       Try to continue downloads if possible.
 440         noprogress:       Do not print the progress bar.
 441         playliststart:    Playlist item to start at.
 442         playlistend:      Playlist item to end at.
 443         matchtitle:       Download only matching titles.
 444         rejecttitle:      Reject downloads for matching titles.
 445         logtostderr:      Log messages to stderr instead of stdout.
 446         consoletitle:     Display progress in console window's titlebar.
 447         nopart:           Do not use temporary .part files.
 448         updatetime:       Use the Last-modified header to set output file timestamps.
 449         writedescription: Write the video description to a .description file
 450         writeinfojson:    Write the video description to a .info.json file
 451         """
 452
 453         params = None
 454         _ies = []
 455         _pps = []
 456         _download_retcode = None
 457         _num_downloads = None
 458         _screen_file = None
 459
 460         def __init__(self, params):
 461                 """Create a FileDownloader object with the given options."""
 462                 self._ies = []
 463                 self._pps = []
 464                 self._download_retcode = 0
 465                 self._num_downloads = 0
 466                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 467                 self.params = params
 468
 469         @staticmethod
 470         def format_bytes(bytes):
 471                 if bytes is None:
 472                         return 'N/A'
 473                 if type(bytes) is str:
 474                         bytes = float(bytes)
 475                 if bytes == 0.0:
 476                         exponent = 0
 477                 else:
 478                         exponent = long(math.log(bytes, 1024.0))
 479                 suffix = 'bkMGTPEZY'[exponent]
 480                 converted = float(bytes) / float(1024 ** exponent)
 481                 return '%.2f%s' % (converted, suffix)
 482
 483         @staticmethod
 484         def calc_percent(byte_counter, data_len):
 485                 if data_len is None:
 486                         return '---.-%'
 487                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 488
 489         @staticmethod
 490         def calc_eta(start, now, total, current):
 491                 if total is None:
 492                         return '--:--'
 493                 dif = now - start
 494                 if current == 0 or dif < 0.001: # One millisecond
 495                         return '--:--'
 496                 rate = float(current) / dif
 497                 eta = long((float(total) - float(current)) / rate)
 498                 (eta_mins, eta_secs) = divmod(eta, 60)
 499                 if eta_mins > 99:
 500                         return '--:--'
 501                 return '%02d:%02d' % (eta_mins, eta_secs)
 502
 503         @staticmethod
 504         def calc_speed(start, now, bytes):
 505                 dif = now - start
 506                 if bytes == 0 or dif < 0.001: # One millisecond
 507                         return '%10s' % '---b/s'
 508                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 509
 510         @staticmethod
 511         def best_block_size(elapsed_time, bytes):
 512                 new_min = max(bytes / 2.0, 1.0)
 513                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 514                 if elapsed_time < 0.001:
 515                         return long(new_max)
 516                 rate = bytes / elapsed_time
 517                 if rate > new_max:
 518                         return long(new_max)
 519                 if rate < new_min:
 520                         return long(new_min)
 521                 return long(rate)
 522
 523         @staticmethod
 524         def parse_bytes(bytestr):
 525                 """Parse a string indicating a byte quantity into a long integer."""
 526                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 527                 if matchobj is None:
 528                         return None
 529                 number = float(matchobj.group(1))
 530                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 531                 return long(round(number * multiplier))
 532
 533         def add_info_extractor(self, ie):
 534                 """Add an InfoExtractor object to the end of the list."""
 535                 self._ies.append(ie)
 536                 ie.set_downloader(self)
 537
 538         def add_post_processor(self, pp):
 539                 """Add a PostProcessor object to the end of the chain."""
 540                 self._pps.append(pp)
 541                 pp.set_downloader(self)
 542
 543         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 544                 """Print message to stdout if not in quiet mode."""
 545                 try:
 546                         if not self.params.get('quiet', False):
 547                                 terminator = [u'\n', u''][skip_eol]
 548                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 549                         self._screen_file.flush()
 550                 except (UnicodeEncodeError), err:
 551                         if not ignore_encoding_errors:
 552                                 raise
 553
 554         def to_stderr(self, message):
 555                 """Print message to stderr."""
 556                 print >>sys.stderr, message.encode(preferredencoding())
 557
 558         def to_cons_title(self, message):
 559                 """Set console/terminal window title to message."""
 560                 if not self.params.get('consoletitle', False):
 561                         return
 562                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 563                         # c_wchar_p() might not be necessary if `message` is
 564                         # already of type unicode()
 565                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 566                 elif 'TERM' in os.environ:
 567                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 568
 569         def fixed_template(self):
 570                 """Checks if the output template is fixed."""
 571                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 572
 573         def trouble(self, message=None):
 574                 """Determine action to take when a download problem appears.
 575
 576                 Depending on if the downloader has been configured to ignore
 577                 download errors or not, this method may throw an exception or
 578                 not when errors are found, after printing the message.
 579                 """
 580                 if message is not None:
 581                         self.to_stderr(message)
 582                 if not self.params.get('ignoreerrors', False):
 583                         raise DownloadError(message)
 584                 self._download_retcode = 1
 585
 586         def slow_down(self, start_time, byte_counter):
 587                 """Sleep if the download speed is over the rate limit."""
 588                 rate_limit = self.params.get('ratelimit', None)
 589                 if rate_limit is None or byte_counter == 0:
 590                         return
 591                 now = time.time()
 592                 elapsed = now - start_time
 593                 if elapsed <= 0.0:
 594                         return
 595                 speed = float(byte_counter) / elapsed
 596                 if speed > rate_limit:
 597                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 598
 599         def temp_name(self, filename):
 600                 """Returns a temporary filename for the given filename."""
 601                 if self.params.get('nopart', False) or filename == u'-' or \
 602                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 603                         return filename
 604                 return filename + u'.part'
 605
 606         def undo_temp_name(self, filename):
 607                 if filename.endswith(u'.part'):
 608                         return filename[:-len(u'.part')]
 609                 return filename
 610
 611         def try_rename(self, old_filename, new_filename):
 612                 try:
 613                         if old_filename == new_filename:
 614                                 return
 615                         os.rename(old_filename, new_filename)
 616                 except (IOError, OSError), err:
 617                         self.trouble(u'ERROR: unable to rename file')
 618
 619         def try_utime(self, filename, last_modified_hdr):
 620                 """Try to set the last-modified time of the given file."""
 621                 if last_modified_hdr is None:
 622                         return
 623                 if not os.path.isfile(filename):
 624                         return
 625                 timestr = last_modified_hdr
 626                 if timestr is None:
 627                         return
 628                 filetime = timeconvert(timestr)
 629                 if filetime is None:
 630                         return filetime
 631                 try:
 632                         os.utime(filename, (time.time(), filetime))
 633                 except:
 634                         pass
 635                 return filetime
 636
 637         def report_writedescription(self, descfn):
 638                 """ Report that the description file is being written """
 639                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
 640
 641         def report_writeinfojson(self, infofn):
 642                 """ Report that the metadata file has been written """
 643                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
 644
 645         def report_destination(self, filename):
 646                 """Report destination filename."""
 647                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 648
 649         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 650                 """Report download progress."""
 651                 if self.params.get('noprogress', False):
 652                         return
 653                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 654                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 655                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 656                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 657
 658         def report_resuming_byte(self, resume_len):
 659                 """Report attempt to resume at given byte."""
 660                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 661
 662         def report_retry(self, count, retries):
 663                 """Report retry in case of HTTP error 5xx"""
 664                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 665
 666         def report_file_already_downloaded(self, file_name):
 667                 """Report file has already been fully downloaded."""
 668                 try:
 669                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 670                 except (UnicodeEncodeError), err:
 671                         self.to_screen(u'[download] The file has already been downloaded')
 672
 673         def report_unable_to_resume(self):
 674                 """Report it was impossible to resume download."""
 675                 self.to_screen(u'[download] Unable to resume')
 676
 677         def report_finish(self):
 678                 """Report download finished."""
 679                 if self.params.get('noprogress', False):
 680                         self.to_screen(u'[download] Download completed')
 681                 else:
 682                         self.to_screen(u'')
 683
 684         def increment_downloads(self):
 685                 """Increment the ordinal that assigns a number to each file."""
 686                 self._num_downloads += 1
 687
 688         def prepare_filename(self, info_dict):
 689                 """Generate the output filename."""
 690                 try:
 691                         template_dict = dict(info_dict)
 692                         template_dict['epoch'] = unicode(long(time.time()))
 693                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 694                         filename = self.params['outtmpl'] % template_dict
 695                         return filename
 696                 except (ValueError, KeyError), err:
 697                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 698                         return None
 699
 700         def process_info(self, info_dict):
 701                 """Process a single dictionary returned by an InfoExtractor."""
 702                 filename = self.prepare_filename(info_dict)
 703
 704                 # Forced printings
 705                 if self.params.get('forcetitle', False):
 706                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 707                 if self.params.get('forceurl', False):
 708                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 709                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 710                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 711                 if self.params.get('forcedescription', False) and 'description' in info_dict:
 712                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 713                 if self.params.get('forcefilename', False) and filename is not None:
 714                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 715                 if self.params.get('forceformat', False):
 716                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
 717
 718                 # Do nothing else if in simulate mode
 719                 if self.params.get('simulate', False):
 720                         return
 721
 722                 if filename is None:
 723                         return
 724
 725                 matchtitle=self.params.get('matchtitle',False)
 726                 rejecttitle=self.params.get('rejecttitle',False)
 727                 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 728                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
 729                         self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
 730                         return
 731                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
 732                         self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
 733                         return
 734
 735                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 736                         self.to_stderr(u'WARNING: file exists and will be skipped')
 737                         return
 738
 739                 try:
 740                         dn = os.path.dirname(filename)
 741                         if dn != '' and not os.path.exists(dn):
 742                                 os.makedirs(dn)
 743                 except (OSError, IOError), err:
 744                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
 745                         return
 746
 747                 if self.params.get('writedescription', False):
 748                         try:
 749                                 descfn = filename + '.description'
 750                                 self.report_writedescription(descfn)
 751                                 descfile = open(descfn, 'wb')
 752                                 try:
 753                                         descfile.write(info_dict['description'].encode('utf-8'))
 754                                 finally:
 755                                         descfile.close()
 756                         except (OSError, IOError):
 757                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
 758                                 return
 759
 760                 if self.params.get('writeinfojson', False):
 761                         infofn = filename + '.info.json'
 762                         self.report_writeinfojson(infofn)
 763                         try:
 764                                 json.dump
 765                         except (NameError,AttributeError):
 766                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 767                                 return
 768                         try:
 769                                 infof = open(infofn, 'wb')
 770                                 try:
 771                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
 772                                         json.dump(json_info_dict, infof)
 773                                 finally:
 774                                         infof.close()
 775                         except (OSError, IOError):
 776                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
 777                                 return
 778
 779                 if not self.params.get('skip_download', False):
 780                         try:
 781                                 success = self._do_download(filename, info_dict)
 782                         except (OSError, IOError), err:
 783                                 raise UnavailableVideoError
 784                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 785                                 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 786                                 return
 787                         except (ContentTooShortError, ), err:
 788                                 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 789                                 return
 790
 791                         if success:
 792                                 try:
 793                                         self.post_process(filename, info_dict)
 794                                 except (PostProcessingError), err:
 795                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
 796                                         return
 797
 798         def download(self, url_list):
 799                 """Download a given list of URLs."""
 800                 if len(url_list) > 1 and self.fixed_template():
 801                         raise SameFileError(self.params['outtmpl'])
 802
 803                 for url in url_list:
 804                         suitable_found = False
 805                         for ie in self._ies:
 806                                 # Go to next InfoExtractor if not suitable
 807                                 if not ie.suitable(url):
 808                                         continue
 809
 810                                 # Suitable InfoExtractor found
 811                                 suitable_found = True
 812
 813                                 # Extract information from URL and process it
 814                                 ie.extract(url)
 815
 816                                 # Suitable InfoExtractor had been found; go to next URL
 817                                 break
 818
 819                         if not suitable_found:
 820                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 821
 822                 return self._download_retcode
 823
 824         def post_process(self, filename, ie_info):
 825                 """Run the postprocessing chain on the given file."""
 826                 info = dict(ie_info)
 827                 info['filepath'] = filename
 828                 for pp in self._pps:
 829                         info = pp.run(info)
 830                         if info is None:
 831                                 break
 832
 833         def _download_with_rtmpdump(self, filename, url, player_url):
 834                 self.report_destination(filename)
 835                 tmpfilename = self.temp_name(filename)
 836
 837                 # Check for rtmpdump first
 838                 try:
 839                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 840                 except (OSError, IOError):
 841                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 842                         return False
 843
 844                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 845                 # the connection was interrumpted and resuming appears to be
 846                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 847                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 848                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 849                 while retval == 2 or retval == 1:
 850                         prevsize = os.path.getsize(tmpfilename)
 851                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 852                         time.sleep(5.0) # This seems to be needed
 853                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 854                         cursize = os.path.getsize(tmpfilename)
 855                         if prevsize == cursize and retval == 1:
 856                                 break
 857                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
 858                         if prevsize == cursize and retval == 2 and cursize > 1024:
 859                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
 860                                 retval = 0
 861                                 break
 862                 if retval == 0:
 863                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 864                         self.try_rename(tmpfilename, filename)
 865                         return True
 866                 else:
 867                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 868                         return False
 869
 870         def _do_download(self, filename, info_dict):
 871                 url = info_dict['url']
 872                 player_url = info_dict.get('player_url', None)
 873
 874                 # Check file already present
 875                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 876                         self.report_file_already_downloaded(filename)
 877                         return True
 878
 879                 # Attempt to download using rtmpdump
 880                 if url.startswith('rtmp'):
 881                         return self._download_with_rtmpdump(filename, url, player_url)
 882
 883                 tmpfilename = self.temp_name(filename)
 884                 stream = None
 885
 886                 # Do not include the Accept-Encoding header
 887                 headers = {'Youtubedl-no-compression': 'True'}
 888                 basic_request = urllib2.Request(url, None, headers)
 889                 request = urllib2.Request(url, None, headers)
 890
 891                 # Establish possible resume length
 892                 if os.path.isfile(tmpfilename):
 893                         resume_len = os.path.getsize(tmpfilename)
 894                 else:
 895                         resume_len = 0
 896
 897                 open_mode = 'wb'
 898                 if resume_len != 0:
 899                         if self.params.get('continuedl', False):
 900                                 self.report_resuming_byte(resume_len)
 901                                 request.add_header('Range','bytes=%d-' % resume_len)
 902                                 open_mode = 'ab'
 903                         else:
 904                                 resume_len = 0
 905
 906                 count = 0
 907                 retries = self.params.get('retries', 0)
 908                 while count <= retries:
 909                         # Establish connection
 910                         try:
 911                                 if count == 0 and 'urlhandle' in info_dict:
 912                                         data = info_dict['urlhandle']
 913                                 data = urllib2.urlopen(request)
 914                                 break
 915                         except (urllib2.HTTPError, ), err:
 916                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 917                                         # Unexpected HTTP error
 918                                         raise
 919                                 elif err.code == 416:
 920                                         # Unable to resume (requested range not satisfiable)
 921                                         try:
 922                                                 # Open the connection again without the range header
 923                                                 data = urllib2.urlopen(basic_request)
 924                                                 content_length = data.info()['Content-Length']
 925                                         except (urllib2.HTTPError, ), err:
 926                                                 if err.code < 500 or err.code >= 600:
 927                                                         raise
 928                                         else:
 929                                                 # Examine the reported length
 930                                                 if (content_length is not None and
 931                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
 932                                                         # The file had already been fully downloaded.
 933                                                         # Explanation to the above condition: in issue #175 it was revealed that
 934                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 935                                                         # changing the file size slightly and causing problems for some users. So
 936                                                         # I decided to implement a suggested change and consider the file
 937                                                         # completely downloaded if the file size differs less than 100 bytes from
 938                                                         # the one in the hard drive.
 939                                                         self.report_file_already_downloaded(filename)
 940                                                         self.try_rename(tmpfilename, filename)
 941                                                         return True
 942                                                 else:
 943                                                         # The length does not match, we start the download over
 944                                                         self.report_unable_to_resume()
 945                                                         open_mode = 'wb'
 946                                                         break
 947                         # Retry
 948                         count += 1
 949                         if count <= retries:
 950                                 self.report_retry(count, retries)
 951
 952                 if count > retries:
 953                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 954                         return False
 955
 956                 data_len = data.info().get('Content-length', None)
 957                 if data_len is not None:
 958                         data_len = long(data_len) + resume_len
 959                 data_len_str = self.format_bytes(data_len)
 960                 byte_counter = 0 + resume_len
 961                 block_size = 1024
 962                 start = time.time()
 963                 while True:
 964                         # Download and write
 965                         before = time.time()
 966                         data_block = data.read(block_size)
 967                         after = time.time()
 968                         if len(data_block) == 0:
 969                                 break
 970                         byte_counter += len(data_block)
 971
 972                         # Open file just in time
 973                         if stream is None:
 974                                 try:
 975                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 976                                         assert stream is not None
 977                                         filename = self.undo_temp_name(tmpfilename)
 978                                         self.report_destination(filename)
 979                                 except (OSError, IOError), err:
 980                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 981                                         return False
 982                         try:
 983                                 stream.write(data_block)
 984                         except (IOError, OSError), err:
 985                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 986                                 return False
 987                         block_size = self.best_block_size(after - before, len(data_block))
 988
 989                         # Progress message
 990                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 991                         if data_len is None:
 992                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
 993                         else:
 994                                 percent_str = self.calc_percent(byte_counter, data_len)
 995                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 996                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 997
 998                         # Apply rate limit
 999                         self.slow_down(start, byte_counter - resume_len)
1000
1001                 if stream is None:
1002                         self.trouble(u'\nERROR: Did not get any data blocks')
1003                         return False
1004                 stream.close()
1005                 self.report_finish()
1006                 if data_len is not None and byte_counter != data_len:
1007                         raise ContentTooShortError(byte_counter, long(data_len))
1008                 self.try_rename(tmpfilename, filename)
1009
1010                 # Update file modification time
1011                 if self.params.get('updatetime', True):
1012                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1013
1014                 return True
1015
1016
class InfoExtractor(object):
        """Base class of all information extractors.

        An information extractor takes a URL and extracts information about
        the video (or videos) it refers to: the real video URL, the title and
        simplified title, the author and so on. That information is stored in
        a dictionary which is passed to the FileDownloader, which processes
        it, possibly downloading the video to the file system, among other
        possible outcomes. Every dictionary must include these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The fields below are optional. They mainly exist so that youtube-dl
        can serve as the backend of a video search function, such as the one
        in youtube2mp3, and are only used when their respective forced
        printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should redefine _real_initialize() and _real_extract(),
        define a _VALID_URL regexp and, probably, be added to the list of
        extractors.
        """

        # Whether _real_initialize() has already run for this instance
        _ready = False
        # Downloader this extractor is attached to, if any
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Return True if url matches this extractor's _VALID_URL."""
                matched = re.match(self._VALID_URL, url)
                return matched is not None

        def initialize(self):
                """Run _real_initialize() the first time this is called."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed and return what _real_extract(url) returns."""
                self.initialize()
                result = self._real_extract(url)
                return result

        def set_downloader(self, downloader):
                """Attach downloader to this extractor."""
                self._downloader = downloader

        def _real_initialize(self):
                """Actual initialization (authentication, etc). Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Actual extraction. Redefine in subclasses."""
                pass
1085
1086
1087 class YoutubeIE(InfoExtractor):
1088         """Information extractor for youtube.com."""
1089
        # Matches video URLs as well as bare video ids (the whole URL prefix,
        # group 1, is optional); group 2 is the video id read by _real_extract().
        # The negative lookahead excludes playlist and artist style paths.
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Requested by _real_initialize() to set the language
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        # The login form is posted to this URL by _real_initialize()
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        # The age confirmation form is posted to this URL by _real_initialize()
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in .netrc when 'usenetrc' is set
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
        # File extension per format code; _print_formats() shows 'flv' for
        # codes that are not listed here
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '44': 'webm',
                '45': 'webm',
        }
        # Dimensions per format code as shown by _print_formats()
        # ('???' presumably marks unknown dimensions -- TODO confirm)
        _video_dimensions = {
                '5': '240x400',
                '6': '???',
                '13': '???',
                '17': '144x176',
                '18': '360x640',
                '22': '720x1280',
                '34': '360x640',
                '35': '480x854',
                '37': '1080x1920',
                '38': '3072x4096',
                '43': '360x640',
                '44': '480x854',
                '45': '720x1280',
        }
        IE_NAME = u'youtube'
1124
1125         def report_lang(self):
1126                 """Report attempt to set language."""
1127                 self._downloader.to_screen(u'[youtube] Setting language')
1128
1129         def report_login(self):
1130                 """Report attempt to log in."""
1131                 self._downloader.to_screen(u'[youtube] Logging in')
1132
1133         def report_age_confirmation(self):
1134                 """Report attempt to confirm age."""
1135                 self._downloader.to_screen(u'[youtube] Confirming age')
1136
1137         def report_video_webpage_download(self, video_id):
1138                 """Report attempt to download video webpage."""
1139                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1140
1141         def report_video_info_webpage_download(self, video_id):
1142                 """Report attempt to download video info webpage."""
1143                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1144
1145         def report_information_extraction(self, video_id):
1146                 """Report attempt to extract video information."""
1147                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1148
1149         def report_unavailable_format(self, video_id, format):
1150                 """Report extracted video URL."""
1151                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1152
1153         def report_rtmp_download(self):
1154                 """Indicate the download will use the RTMP protocol."""
1155                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1156
1157         def _print_formats(self, formats):
1158                 print 'Available formats:'
1159                 for x in formats:
1160                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1161
1162         def _real_initialize(self):
1163                 if self._downloader is None:
1164                         return
1165
1166                 username = None
1167                 password = None
1168                 downloader_params = self._downloader.params
1169
1170                 # Attempt to use provided username and password or .netrc data
1171                 if downloader_params.get('username', None) is not None:
1172                         username = downloader_params['username']
1173                         password = downloader_params['password']
1174                 elif downloader_params.get('usenetrc', False):
1175                         try:
1176                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1177                                 if info is not None:
1178                                         username = info[0]
1179                                         password = info[2]
1180                                 else:
1181                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1182                         except (IOError, netrc.NetrcParseError), err:
1183                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1184                                 return
1185
1186                 # Set language
1187                 request = urllib2.Request(self._LANG_URL)
1188                 try:
1189                         self.report_lang()
1190                         urllib2.urlopen(request).read()
1191                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1192                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1193                         return
1194
1195                 # No authentication to be performed
1196                 if username is None:
1197                         return
1198
1199                 # Log in
1200                 login_form = {
1201                                 'current_form': 'loginForm',
1202                                 'next':         '/',
1203                                 'action_login': 'Log In',
1204                                 'username':     username,
1205                                 'password':     password,
1206                                 }
1207                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1208                 try:
1209                         self.report_login()
1210                         login_results = urllib2.urlopen(request).read()
1211                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1212                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1213                                 return
1214                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1215                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1216                         return
1217
1218                 # Confirm age
1219                 age_form = {
1220                                 'next_url':             '/',
1221                                 'action_confirm':       'Confirm',
1222                                 }
1223                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1224                 try:
1225                         self.report_age_confirmation()
1226                         age_results = urllib2.urlopen(request).read()
1227                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1229                         return
1230
1231         def _real_extract(self, url):
1232                 # Extract video id from URL
1233                 mobj = re.match(self._VALID_URL, url)
1234                 if mobj is None:
1235                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1236                         return
1237                 video_id = mobj.group(2)
1238
1239                 # Get video webpage
1240                 self.report_video_webpage_download(video_id)
1241                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1242                 try:
1243                         video_webpage = urllib2.urlopen(request).read()
1244                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1245                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1246                         return
1247
1248                 # Attempt to extract SWF player URL
1249                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1250                 if mobj is not None:
1251                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1252                 else:
1253                         player_url = None
1254
1255                 # Get video info
1256                 self.report_video_info_webpage_download(video_id)
1257                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1258                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1259                                         % (video_id, el_type))
1260                         request = urllib2.Request(video_info_url)
1261                         try:
1262                                 video_info_webpage = urllib2.urlopen(request).read()
1263                                 video_info = parse_qs(video_info_webpage)
1264                                 if 'token' in video_info:
1265                                         break
1266                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1267                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1268                                 return
1269                 if 'token' not in video_info:
1270                         if 'reason' in video_info:
1271                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1272                         else:
1273                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1274                         return
1275
1276                 # Start extracting information
1277                 self.report_information_extraction(video_id)
1278
1279                 # uploader
1280                 if 'author' not in video_info:
1281                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1282                         return
1283                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1284
1285                 # title
1286                 if 'title' not in video_info:
1287                         self._downloader.trouble(u'ERROR: unable to extract video title')
1288                         return
1289                 video_title = urllib.unquote_plus(video_info['title'][0])
1290                 video_title = video_title.decode('utf-8')
1291                 video_title = sanitize_title(video_title)
1292
1293                 # simplified title
1294                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1295                 simple_title = simple_title.strip(ur'_')
1296
1297                 # thumbnail image
1298                 if 'thumbnail_url' not in video_info:
1299                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1300                         video_thumbnail = ''
1301                 else:   # don't panic if we can't find it
1302                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1303
1304                 # upload date
1305                 upload_date = u'NA'
1306                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1307                 if mobj is not None:
1308                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1309                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1310                         for expression in format_expressions:
1311                                 try:
1312                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1313                                 except:
1314                                         pass
1315
1316                 # description
1317                 try:
1318                         lxml.etree
1319                 except NameError:
1320                         video_description = u'No description available.'
1321                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1322                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1323                                 if mobj is not None:
1324                                         video_description = mobj.group(1).decode('utf-8')
1325                 else:
1326                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1327                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1328                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1329                         # TODO use another parser
1330
1331                 # token
1332                 video_token = urllib.unquote_plus(video_info['token'][0])
1333
1334                 # Decide which formats to download
1335                 req_format = self._downloader.params.get('format', None)
1336
1337                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1338                         self.report_rtmp_download()
1339                         video_url_list = [(None, video_info['conn'][0])]
1340                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1341                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1342                         url_data = [parse_qs(uds) for uds in url_data_strs]
1343                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1344                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1345
1346                         format_limit = self._downloader.params.get('format_limit', None)
1347                         if format_limit is not None and format_limit in self._available_formats:
1348                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1349                         else:
1350                                 format_list = self._available_formats
1351                         existing_formats = [x for x in format_list if x in url_map]
1352                         if len(existing_formats) == 0:
1353                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1354                                 return
1355                         if self._downloader.params.get('listformats', None):
1356                                 self._print_formats(existing_formats)
1357                                 return
1358                         if req_format is None or req_format == 'best':
1359                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1360                         elif req_format == 'worst':
1361                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1362                         elif req_format in ('-1', 'all'):
1363                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1364                         else:
1365                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1366                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1367                                 req_formats = req_format.split('/')
1368                                 video_url_list = None
1369                                 for rf in req_formats:
1370                                         if rf in url_map:
1371                                                 video_url_list = [(rf, url_map[rf])]
1372                                                 break
1373                                 if video_url_list is None:
1374                                         self._downloader.trouble(u'ERROR: requested format not available')
1375                                         return
1376                 else:
1377                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1378                         return
1379
1380                 for format_param, video_real_url in video_url_list:
1381                         # At this point we have a new video
1382                         self._downloader.increment_downloads()
1383
1384                         # Extension
1385                         video_extension = self._video_extensions.get(format_param, 'flv')
1386
1387                         try:
1388                                 # Process video information
1389                                 self._downloader.process_info({
1390                                         'id':           video_id.decode('utf-8'),
1391                                         'url':          video_real_url.decode('utf-8'),
1392                                         'uploader':     video_uploader.decode('utf-8'),
1393                                         'upload_date':  upload_date,
1394                                         'title':        video_title,
1395                                         'stitle':       simple_title,
1396                                         'ext':          video_extension.decode('utf-8'),
1397                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1398                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1399                                         'description':  video_description,
1400                                         'player_url':   player_url,
1401                                 })
1402                         except UnavailableVideoError, err:
1403                                 self._downloader.trouble(u'\nERROR: unable to download video')
1404
1405
1406 class MetacafeIE(InfoExtractor):
1407         """Information Extractor for metacafe.com."""
1408
1409         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1410         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1411         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1412         _youtube_ie = None
1413         IE_NAME = u'metacafe'
1414
1415         def __init__(self, youtube_ie, downloader=None):
1416                 InfoExtractor.__init__(self, downloader)
1417                 self._youtube_ie = youtube_ie
1418
1419         def report_disclaimer(self):
1420                 """Report disclaimer retrieval."""
1421                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1422
1423         def report_age_confirmation(self):
1424                 """Report attempt to confirm age."""
1425                 self._downloader.to_screen(u'[metacafe] Confirming age')
1426
1427         def report_download_webpage(self, video_id):
1428                 """Report webpage download."""
1429                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1430
1431         def report_extraction(self, video_id):
1432                 """Report information extraction."""
1433                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1434
1435         def _real_initialize(self):
1436                 # Retrieve disclaimer
1437                 request = urllib2.Request(self._DISCLAIMER)
1438                 try:
1439                         self.report_disclaimer()
1440                         disclaimer = urllib2.urlopen(request).read()
1441                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1443                         return
1444
1445                 # Confirm age
1446                 disclaimer_form = {
1447                         'filters': '0',
1448                         'submit': "Continue - I'm over 18",
1449                         }
1450                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1451                 try:
1452                         self.report_age_confirmation()
1453                         disclaimer = urllib2.urlopen(request).read()
1454                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1455                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1456                         return
1457
1458         def _real_extract(self, url):
1459                 # Extract id and simplified title from URL
1460                 mobj = re.match(self._VALID_URL, url)
1461                 if mobj is None:
1462                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1463                         return
1464
1465                 video_id = mobj.group(1)
1466
1467                 # Check if video comes from YouTube
1468                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1469                 if mobj2 is not None:
1470                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1471                         return
1472
1473                 # At this point we have a new video
1474                 self._downloader.increment_downloads()
1475
1476                 simple_title = mobj.group(2).decode('utf-8')
1477
1478                 # Retrieve video webpage to extract further information
1479                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1480                 try:
1481                         self.report_download_webpage(video_id)
1482                         webpage = urllib2.urlopen(request).read()
1483                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1484                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1485                         return
1486
1487                 # Extract URL, uploader and title from webpage
1488                 self.report_extraction(video_id)
1489                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1490                 if mobj is not None:
1491                         mediaURL = urllib.unquote(mobj.group(1))
1492                         video_extension = mediaURL[-3:]
1493
1494                         # Extract gdaKey if available
1495                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1496                         if mobj is None:
1497                                 video_url = mediaURL
1498                         else:
1499                                 gdaKey = mobj.group(1)
1500                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1501                 else:
1502                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1503                         if mobj is None:
1504                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1505                                 return
1506                         vardict = parse_qs(mobj.group(1))
1507                         if 'mediaData' not in vardict:
1508                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1509                                 return
1510                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1511                         if mobj is None:
1512                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1513                                 return
1514                         mediaURL = mobj.group(1).replace('\\/', '/')
1515                         video_extension = mediaURL[-3:]
1516                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1517
1518                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1519                 if mobj is None:
1520                         self._downloader.trouble(u'ERROR: unable to extract title')
1521                         return
1522                 video_title = mobj.group(1).decode('utf-8')
1523                 video_title = sanitize_title(video_title)
1524
1525                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1526                 if mobj is None:
1527                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1528                         return
1529                 video_uploader = mobj.group(1)
1530
1531                 try:
1532                         # Process video information
1533                         self._downloader.process_info({
1534                                 'id':           video_id.decode('utf-8'),
1535                                 'url':          video_url.decode('utf-8'),
1536                                 'uploader':     video_uploader.decode('utf-8'),
1537                                 'upload_date':  u'NA',
1538                                 'title':        video_title,
1539                                 'stitle':       simple_title,
1540                                 'ext':          video_extension.decode('utf-8'),
1541                                 'format':       u'NA',
1542                                 'player_url':   None,
1543                         })
1544                 except UnavailableVideoError:
1545                         self._downloader.trouble(u'\nERROR: unable to download video')
1546
1547
1548 class DailymotionIE(InfoExtractor):
1549         """Information Extractor for Dailymotion"""
1550
1551         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1552         IE_NAME = u'dailymotion'
1553
1554         def __init__(self, downloader=None):
1555                 InfoExtractor.__init__(self, downloader)
1556
1557         def report_download_webpage(self, video_id):
1558                 """Report webpage download."""
1559                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1560
1561         def report_extraction(self, video_id):
1562                 """Report information extraction."""
1563                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1564
1565         def _real_initialize(self):
1566                 return
1567
1568         def _real_extract(self, url):
1569                 # Extract id and simplified title from URL
1570                 mobj = re.match(self._VALID_URL, url)
1571                 if mobj is None:
1572                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1573                         return
1574
1575                 # At this point we have a new video
1576                 self._downloader.increment_downloads()
1577                 video_id = mobj.group(1)
1578
1579                 simple_title = mobj.group(2).decode('utf-8')
1580                 video_extension = 'flv'
1581
1582                 # Retrieve video webpage to extract further information
1583                 request = urllib2.Request(url)
1584                 request.add_header('Cookie', 'family_filter=off')
1585                 try:
1586                         self.report_download_webpage(video_id)
1587                         webpage = urllib2.urlopen(request).read()
1588                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1589                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1590                         return
1591
1592                 # Extract URL, uploader and title from webpage
1593                 self.report_extraction(video_id)
1594                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1595                 if mobj is None:
1596                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1597                         return
1598                 sequence = urllib.unquote(mobj.group(1))
1599                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1600                 if mobj is None:
1601                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1602                         return
1603                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1604
1605                 # if needed add http://www.dailymotion.com/ if relative URL
1606
1607                 video_url = mediaURL
1608
1609                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1610                 if mobj is None:
1611                         self._downloader.trouble(u'ERROR: unable to extract title')
1612                         return
1613                 video_title = mobj.group(1).decode('utf-8')
1614                 video_title = sanitize_title(video_title)
1615
1616                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1617                 if mobj is None:
1618                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1619                         return
1620                 video_uploader = mobj.group(1)
1621
1622                 try:
1623                         # Process video information
1624                         self._downloader.process_info({
1625                                 'id':           video_id.decode('utf-8'),
1626                                 'url':          video_url.decode('utf-8'),
1627                                 'uploader':     video_uploader.decode('utf-8'),
1628                                 'upload_date':  u'NA',
1629                                 'title':        video_title,
1630                                 'stitle':       simple_title,
1631                                 'ext':          video_extension.decode('utf-8'),
1632                                 'format':       u'NA',
1633                                 'player_url':   None,
1634                         })
1635                 except UnavailableVideoError:
1636                         self._downloader.trouble(u'\nERROR: unable to download video')
1637
1638
1639 class GoogleIE(InfoExtractor):
1640         """Information extractor for video.google.com."""
1641
1642         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1643         IE_NAME = u'video.google'
1644
1645         def __init__(self, downloader=None):
1646                 InfoExtractor.__init__(self, downloader)
1647
1648         def report_download_webpage(self, video_id):
1649                 """Report webpage download."""
1650                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1651
1652         def report_extraction(self, video_id):
1653                 """Report information extraction."""
1654                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1655
1656         def _real_initialize(self):
1657                 return
1658
1659         def _real_extract(self, url):
1660                 # Extract id from URL
1661                 mobj = re.match(self._VALID_URL, url)
1662                 if mobj is None:
1663                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1664                         return
1665
1666                 # At this point we have a new video
1667                 self._downloader.increment_downloads()
1668                 video_id = mobj.group(1)
1669
1670                 video_extension = 'mp4'
1671
1672                 # Retrieve video webpage to extract further information
1673                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1674                 try:
1675                         self.report_download_webpage(video_id)
1676                         webpage = urllib2.urlopen(request).read()
1677                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1678                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1679                         return
1680
1681                 # Extract URL, uploader, and title from webpage
1682                 self.report_extraction(video_id)
1683                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1684                 if mobj is None:
1685                         video_extension = 'flv'
1686                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1687                 if mobj is None:
1688                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1689                         return
1690                 mediaURL = urllib.unquote(mobj.group(1))
1691                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1692                 mediaURL = mediaURL.replace('\\x26', '\x26')
1693
1694                 video_url = mediaURL
1695
1696                 mobj = re.search(r'<title>(.*)</title>', webpage)
1697                 if mobj is None:
1698                         self._downloader.trouble(u'ERROR: unable to extract title')
1699                         return
1700                 video_title = mobj.group(1).decode('utf-8')
1701                 video_title = sanitize_title(video_title)
1702                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1703
1704                 # Extract video description
1705                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1706                 if mobj is None:
1707                         self._downloader.trouble(u'ERROR: unable to extract video description')
1708                         return
1709                 video_description = mobj.group(1).decode('utf-8')
1710                 if not video_description:
1711                         video_description = 'No description available.'
1712
1713                 # Extract video thumbnail
1714                 if self._downloader.params.get('forcethumbnail', False):
1715                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1716                         try:
1717                                 webpage = urllib2.urlopen(request).read()
1718                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1719                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1720                                 return
1721                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1722                         if mobj is None:
1723                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1724                                 return
1725                         video_thumbnail = mobj.group(1)
1726                 else:   # we need something to pass to process_info
1727                         video_thumbnail = ''
1728
1729                 try:
1730                         # Process video information
1731                         self._downloader.process_info({
1732                                 'id':           video_id.decode('utf-8'),
1733                                 'url':          video_url.decode('utf-8'),
1734                                 'uploader':     u'NA',
1735                                 'upload_date':  u'NA',
1736                                 'title':        video_title,
1737                                 'stitle':       simple_title,
1738                                 'ext':          video_extension.decode('utf-8'),
1739                                 'format':       u'NA',
1740                                 'player_url':   None,
1741                         })
1742                 except UnavailableVideoError:
1743                         self._downloader.trouble(u'\nERROR: unable to download video')
1744
1745
1746 class PhotobucketIE(InfoExtractor):
1747         """Information extractor for photobucket.com."""
1748
1749         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1750         IE_NAME = u'photobucket'
1751
1752         def __init__(self, downloader=None):
1753                 InfoExtractor.__init__(self, downloader)
1754
1755         def report_download_webpage(self, video_id):
1756                 """Report webpage download."""
1757                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1758
1759         def report_extraction(self, video_id):
1760                 """Report information extraction."""
1761                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1762
1763         def _real_initialize(self):
1764                 return
1765
1766         def _real_extract(self, url):
1767                 # Extract id from URL
1768                 mobj = re.match(self._VALID_URL, url)
1769                 if mobj is None:
1770                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1771                         return
1772
1773                 # At this point we have a new video
1774                 self._downloader.increment_downloads()
1775                 video_id = mobj.group(1)
1776
1777                 video_extension = 'flv'
1778
1779                 # Retrieve video webpage to extract further information
1780                 request = urllib2.Request(url)
1781                 try:
1782                         self.report_download_webpage(video_id)
1783                         webpage = urllib2.urlopen(request).read()
1784                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1785                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1786                         return
1787
1788                 # Extract URL, uploader, and title from webpage
1789                 self.report_extraction(video_id)
1790                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1791                 if mobj is None:
1792                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1793                         return
1794                 mediaURL = urllib.unquote(mobj.group(1))
1795
1796                 video_url = mediaURL
1797
1798                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1799                 if mobj is None:
1800                         self._downloader.trouble(u'ERROR: unable to extract title')
1801                         return
1802                 video_title = mobj.group(1).decode('utf-8')
1803                 video_title = sanitize_title(video_title)
1804                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1805
1806                 video_uploader = mobj.group(2).decode('utf-8')
1807
1808                 try:
1809                         # Process video information
1810                         self._downloader.process_info({
1811                                 'id':           video_id.decode('utf-8'),
1812                                 'url':          video_url.decode('utf-8'),
1813                                 'uploader':     video_uploader,
1814                                 'upload_date':  u'NA',
1815                                 'title':        video_title,
1816                                 'stitle':       simple_title,
1817                                 'ext':          video_extension.decode('utf-8'),
1818                                 'format':       u'NA',
1819                                 'player_url':   None,
1820                         })
1821                 except UnavailableVideoError:
1822                         self._downloader.trouble(u'\nERROR: unable to download video')
1823
1824
1825 class YahooIE(InfoExtractor):
1826         """Information extractor for video.yahoo.com."""
1827
1828         # _VALID_URL matches all Yahoo! Video URLs
1829         # _VPAGE_URL matches only the extractable '/watch/' URLs
1830         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1831         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1832         IE_NAME = u'video.yahoo'
1833
1834         def __init__(self, downloader=None):
1835                 InfoExtractor.__init__(self, downloader)
1836
1837         def report_download_webpage(self, video_id):
1838                 """Report webpage download."""
1839                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1840
1841         def report_extraction(self, video_id):
1842                 """Report information extraction."""
1843                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1844
1845         def _real_initialize(self):
1846                 return
1847
1848         def _real_extract(self, url, new_video=True):
1849                 # Extract ID from URL
1850                 mobj = re.match(self._VALID_URL, url)
1851                 if mobj is None:
1852                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1853                         return
1854
1855                 # At this point we have a new video
1856                 self._downloader.increment_downloads()
1857                 video_id = mobj.group(2)
1858                 video_extension = 'flv'
1859
1860                 # Rewrite valid but non-extractable URLs as
1861                 # extractable English language /watch/ URLs
1862                 if re.match(self._VPAGE_URL, url) is None:
1863                         request = urllib2.Request(url)
1864                         try:
1865                                 webpage = urllib2.urlopen(request).read()
1866                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1867                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1868                                 return
1869
1870                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1871                         if mobj is None:
1872                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1873                                 return
1874                         yahoo_id = mobj.group(1)
1875
1876                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1877                         if mobj is None:
1878                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1879                                 return
1880                         yahoo_vid = mobj.group(1)
1881
1882                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1883                         return self._real_extract(url, new_video=False)
1884
1885                 # Retrieve video webpage to extract further information
1886                 request = urllib2.Request(url)
1887                 try:
1888                         self.report_download_webpage(video_id)
1889                         webpage = urllib2.urlopen(request).read()
1890                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1891                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1892                         return
1893
1894                 # Extract uploader and title from webpage
1895                 self.report_extraction(video_id)
1896                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1897                 if mobj is None:
1898                         self._downloader.trouble(u'ERROR: unable to extract video title')
1899                         return
1900                 video_title = mobj.group(1).decode('utf-8')
1901                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1902
1903                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1904                 if mobj is None:
1905                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1906                         return
1907                 video_uploader = mobj.group(1).decode('utf-8')
1908
1909                 # Extract video thumbnail
1910                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1911                 if mobj is None:
1912                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1913                         return
1914                 video_thumbnail = mobj.group(1).decode('utf-8')
1915
1916                 # Extract video description
1917                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1918                 if mobj is None:
1919                         self._downloader.trouble(u'ERROR: unable to extract video description')
1920                         return
1921                 video_description = mobj.group(1).decode('utf-8')
1922                 if not video_description:
1923                         video_description = 'No description available.'
1924
1925                 # Extract video height and width
1926                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1927                 if mobj is None:
1928                         self._downloader.trouble(u'ERROR: unable to extract video height')
1929                         return
1930                 yv_video_height = mobj.group(1)
1931
1932                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1933                 if mobj is None:
1934                         self._downloader.trouble(u'ERROR: unable to extract video width')
1935                         return
1936                 yv_video_width = mobj.group(1)
1937
1938                 # Retrieve video playlist to extract media URL
1939                 # I'm not completely sure what all these options are, but we
1940                 # seem to need most of them, otherwise the server sends a 401.
1941                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1942                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1943                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1944                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1945                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1946                 try:
1947                         self.report_download_webpage(video_id)
1948                         webpage = urllib2.urlopen(request).read()
1949                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1950                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1951                         return
1952
1953                 # Extract media URL from playlist XML
1954                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1955                 if mobj is None:
1956                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1957                         return
1958                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1959                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1960
1961                 try:
1962                         # Process video information
1963                         self._downloader.process_info({
1964                                 'id':           video_id.decode('utf-8'),
1965                                 'url':          video_url,
1966                                 'uploader':     video_uploader,
1967                                 'upload_date':  u'NA',
1968                                 'title':        video_title,
1969                                 'stitle':       simple_title,
1970                                 'ext':          video_extension.decode('utf-8'),
1971                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1972                                 'description':  video_description,
1973                                 'thumbnail':    video_thumbnail,
1974                                 'player_url':   None,
1975                         })
1976                 except UnavailableVideoError:
1977                         self._downloader.trouble(u'\nERROR: unable to download video')
1978
1979
1980 class VimeoIE(InfoExtractor):
1981         """Information extractor for vimeo.com."""
1982
1983         # _VALID_URL matches Vimeo URLs
1984         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1985         IE_NAME = u'vimeo'
1986
1987         def __init__(self, downloader=None):
1988                 InfoExtractor.__init__(self, downloader)
1989
1990         def report_download_webpage(self, video_id):
1991                 """Report webpage download."""
1992                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1993
1994         def report_extraction(self, video_id):
1995                 """Report information extraction."""
1996                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1997
1998         def _real_initialize(self):
1999                 return
2000
2001         def _real_extract(self, url, new_video=True):
2002                 # Extract ID from URL
2003                 mobj = re.match(self._VALID_URL, url)
2004                 if mobj is None:
2005                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2006                         return
2007
2008                 # At this point we have a new video
2009                 self._downloader.increment_downloads()
2010                 video_id = mobj.group(1)
2011
2012                 # Retrieve video webpage to extract further information
2013                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2014                 try:
2015                         self.report_download_webpage(video_id)
2016                         webpage = urllib2.urlopen(request).read()
2017                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2018                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2019                         return
2020
2021                 # Now we begin extracting as much information as we can from what we
2022                 # retrieved. First we extract the information common to all extractors,
2023                 # and latter we extract those that are Vimeo specific.
2024                 self.report_extraction(video_id)
2025
2026                 # Extract title
2027                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2028                 if mobj is None:
2029                         self._downloader.trouble(u'ERROR: unable to extract video title')
2030                         return
2031                 video_title = mobj.group(1).decode('utf-8')
2032                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2033
2034                 # Extract uploader
2035                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2036                 if mobj is None:
2037                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2038                         return
2039                 video_uploader = mobj.group(1).decode('utf-8')
2040
2041                 # Extract video thumbnail
2042                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2043                 if mobj is None:
2044                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2045                         return
2046                 video_thumbnail = mobj.group(1).decode('utf-8')
2047
2048                 # # Extract video description
2049                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2050                 # if mobj is None:
2051                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2052                 #       return
2053                 # video_description = mobj.group(1).decode('utf-8')
2054                 # if not video_description: video_description = 'No description available.'
2055                 video_description = 'Foo.'
2056
2057                 # Vimeo specific: extract request signature
2058                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2059                 if mobj is None:
2060                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2061                         return
2062                 sig = mobj.group(1).decode('utf-8')
2063
2064                 # Vimeo specific: extract video quality information
2065                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2066                 if mobj is None:
2067                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2068                         return
2069                 quality = mobj.group(1).decode('utf-8')
2070
2071                 if int(quality) == 1:
2072                         quality = 'hd'
2073                 else:
2074                         quality = 'sd'
2075
2076                 # Vimeo specific: Extract request signature expiration
2077                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2078                 if mobj is None:
2079                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2080                         return
2081                 sig_exp = mobj.group(1).decode('utf-8')
2082
2083                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2084
2085                 try:
2086                         # Process video information
2087                         self._downloader.process_info({
2088                                 'id':           video_id.decode('utf-8'),
2089                                 'url':          video_url,
2090                                 'uploader':     video_uploader,
2091                                 'upload_date':  u'NA',
2092                                 'title':        video_title,
2093                                 'stitle':       simple_title,
2094                                 'ext':          u'mp4',
2095                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2096                                 'description':  video_description,
2097                                 'thumbnail':    video_thumbnail,
2098                                 'description':  video_description,
2099                                 'player_url':   None,
2100                         })
2101                 except UnavailableVideoError:
2102                         self._downloader.trouble(u'ERROR: unable to download video')
2103
2104
2105 class GenericIE(InfoExtractor):
2106         """Generic last-resort information extractor."""
2107
2108         _VALID_URL = r'.*'
2109         IE_NAME = u'generic'
2110
2111         def __init__(self, downloader=None):
2112                 InfoExtractor.__init__(self, downloader)
2113
2114         def report_download_webpage(self, video_id):
2115                 """Report webpage download."""
2116                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2117                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2118
2119         def report_extraction(self, video_id):
2120                 """Report information extraction."""
2121                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2122
2123         def _real_initialize(self):
2124                 return
2125
2126         def _real_extract(self, url):
2127                 # At this point we have a new video
2128                 self._downloader.increment_downloads()
2129
2130                 video_id = url.split('/')[-1]
2131                 request = urllib2.Request(url)
2132                 try:
2133                         self.report_download_webpage(video_id)
2134                         webpage = urllib2.urlopen(request).read()
2135                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2136                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2137                         return
2138                 except ValueError, err:
2139                         # since this is the last-resort InfoExtractor, if
2140                         # this error is thrown, it'll be thrown here
2141                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2142                         return
2143
2144                 self.report_extraction(video_id)
2145                 # Start with something easy: JW Player in SWFObject
2146                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2147                 if mobj is None:
2148                         # Broaden the search a little bit
2149                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2150                 if mobj is None:
2151                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2152                         return
2153
2154                 # It's possible that one of the regexes
2155                 # matched, but returned an empty group:
2156                 if mobj.group(1) is None:
2157                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2158                         return
2159
2160                 video_url = urllib.unquote(mobj.group(1))
2161                 video_id = os.path.basename(video_url)
2162
2163                 # here's a fun little line of code for you:
2164                 video_extension = os.path.splitext(video_id)[1][1:]
2165                 video_id = os.path.splitext(video_id)[0]
2166
2167                 # it's tempting to parse this further, but you would
2168                 # have to take into account all the variations like
2169                 #   Video Title - Site Name
2170                 #   Site Name | Video Title
2171                 #   Video Title - Tagline | Site Name
2172                 # and so on and so forth; it's just not practical
2173                 mobj = re.search(r'<title>(.*)</title>', webpage)
2174                 if mobj is None:
2175                         self._downloader.trouble(u'ERROR: unable to extract title')
2176                         return
2177                 video_title = mobj.group(1).decode('utf-8')
2178                 video_title = sanitize_title(video_title)
2179                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2180
2181                 # video uploader is domain name
2182                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2183                 if mobj is None:
2184                         self._downloader.trouble(u'ERROR: unable to extract title')
2185                         return
2186                 video_uploader = mobj.group(1).decode('utf-8')
2187
2188                 try:
2189                         # Process video information
2190                         self._downloader.process_info({
2191                                 'id':           video_id.decode('utf-8'),
2192                                 'url':          video_url.decode('utf-8'),
2193                                 'uploader':     video_uploader,
2194                                 'upload_date':  u'NA',
2195                                 'title':        video_title,
2196                                 'stitle':       simple_title,
2197                                 'ext':          video_extension.decode('utf-8'),
2198                                 'format':       u'NA',
2199                                 'player_url':   None,
2200                         })
2201                 except UnavailableVideoError, err:
2202                         self._downloader.trouble(u'\nERROR: unable to download video')
2203
2204
2205 class YoutubeSearchIE(InfoExtractor):
2206         """Information Extractor for YouTube search queries."""
2207         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2208         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2209         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2210         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2211         _youtube_ie = None
2212         _max_youtube_results = 1000
2213         IE_NAME = u'youtube:search'
2214
2215         def __init__(self, youtube_ie, downloader=None):
2216                 InfoExtractor.__init__(self, downloader)
2217                 self._youtube_ie = youtube_ie
2218
2219         def report_download_page(self, query, pagenum):
2220                 """Report attempt to download playlist page with given number."""
2221                 query = query.decode(preferredencoding())
2222                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2223
2224         def _real_initialize(self):
2225                 self._youtube_ie.initialize()
2226
2227         def _real_extract(self, query):
2228                 mobj = re.match(self._VALID_URL, query)
2229                 if mobj is None:
2230                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2231                         return
2232
2233                 prefix, query = query.split(':')
2234                 prefix = prefix[8:]
2235                 query = query.encode('utf-8')
2236                 if prefix == '':
2237                         self._download_n_results(query, 1)
2238                         return
2239                 elif prefix == 'all':
2240                         self._download_n_results(query, self._max_youtube_results)
2241                         return
2242                 else:
2243                         try:
2244                                 n = long(prefix)
2245                                 if n <= 0:
2246                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2247                                         return
2248                                 elif n > self._max_youtube_results:
2249                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2250                                         n = self._max_youtube_results
2251                                 self._download_n_results(query, n)
2252                                 return
2253                         except ValueError: # parsing prefix as integer fails
2254                                 self._download_n_results(query, 1)
2255                                 return
2256
2257         def _download_n_results(self, query, n):
2258                 """Downloads a specified number of results for a query"""
2259
2260                 video_ids = []
2261                 already_seen = set()
2262                 pagenum = 1
2263
2264                 while True:
2265                         self.report_download_page(query, pagenum)
2266                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2267                         request = urllib2.Request(result_url)
2268                         try:
2269                                 page = urllib2.urlopen(request).read()
2270                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2271                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2272                                 return
2273
2274                         # Extract video identifiers
2275                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2276                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2277                                 if video_id not in already_seen:
2278                                         video_ids.append(video_id)
2279                                         already_seen.add(video_id)
2280                                         if len(video_ids) == n:
2281                                                 # Specified n videos reached
2282                                                 for id in video_ids:
2283                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2284                                                 return
2285
2286                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2287                                 for id in video_ids:
2288                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2289                                 return
2290
2291                         pagenum = pagenum + 1
2292
2293
2294 class GoogleSearchIE(InfoExtractor):
2295         """Information Extractor for Google Video search queries."""
2296         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2297         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2298         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2299         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2300         _google_ie = None
2301         _max_google_results = 1000
2302         IE_NAME = u'video.google:search'
2303
2304         def __init__(self, google_ie, downloader=None):
2305                 InfoExtractor.__init__(self, downloader)
2306                 self._google_ie = google_ie
2307
2308         def report_download_page(self, query, pagenum):
2309                 """Report attempt to download playlist page with given number."""
2310                 query = query.decode(preferredencoding())
2311                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2312
2313         def _real_initialize(self):
2314                 self._google_ie.initialize()
2315
2316         def _real_extract(self, query):
2317                 mobj = re.match(self._VALID_URL, query)
2318                 if mobj is None:
2319                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2320                         return
2321
2322                 prefix, query = query.split(':')
2323                 prefix = prefix[8:]
2324                 query = query.encode('utf-8')
2325                 if prefix == '':
2326                         self._download_n_results(query, 1)
2327                         return
2328                 elif prefix == 'all':
2329                         self._download_n_results(query, self._max_google_results)
2330                         return
2331                 else:
2332                         try:
2333                                 n = long(prefix)
2334                                 if n <= 0:
2335                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2336                                         return
2337                                 elif n > self._max_google_results:
2338                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2339                                         n = self._max_google_results
2340                                 self._download_n_results(query, n)
2341                                 return
2342                         except ValueError: # parsing prefix as integer fails
2343                                 self._download_n_results(query, 1)
2344                                 return
2345
2346         def _download_n_results(self, query, n):
2347                 """Downloads a specified number of results for a query"""
2348
2349                 video_ids = []
2350                 already_seen = set()
2351                 pagenum = 1
2352
2353                 while True:
2354                         self.report_download_page(query, pagenum)
2355                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2356                         request = urllib2.Request(result_url)
2357                         try:
2358                                 page = urllib2.urlopen(request).read()
2359                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2360                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2361                                 return
2362
2363                         # Extract video identifiers
2364                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2365                                 video_id = mobj.group(1)
2366                                 if video_id not in already_seen:
2367                                         video_ids.append(video_id)
2368                                         already_seen.add(video_id)
2369                                         if len(video_ids) == n:
2370                                                 # Specified n videos reached
2371                                                 for id in video_ids:
2372                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2373                                                 return
2374
2375                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2376                                 for id in video_ids:
2377                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2378                                 return
2379
2380                         pagenum = pagenum + 1
2381
2382
2383 class YahooSearchIE(InfoExtractor):
2384         """Information Extractor for Yahoo! Video search queries."""
2385         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2386         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2387         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2388         _MORE_PAGES_INDICATOR = r'\s*Next'
2389         _yahoo_ie = None
2390         _max_yahoo_results = 1000
2391         IE_NAME = u'video.yahoo:search'
2392
2393         def __init__(self, yahoo_ie, downloader=None):
2394                 InfoExtractor.__init__(self, downloader)
2395                 self._yahoo_ie = yahoo_ie
2396
2397         def report_download_page(self, query, pagenum):
2398                 """Report attempt to download playlist page with given number."""
2399                 query = query.decode(preferredencoding())
2400                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2401
2402         def _real_initialize(self):
2403                 self._yahoo_ie.initialize()
2404
2405         def _real_extract(self, query):
2406                 mobj = re.match(self._VALID_URL, query)
2407                 if mobj is None:
2408                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2409                         return
2410
2411                 prefix, query = query.split(':')
2412                 prefix = prefix[8:]
2413                 query = query.encode('utf-8')
2414                 if prefix == '':
2415                         self._download_n_results(query, 1)
2416                         return
2417                 elif prefix == 'all':
2418                         self._download_n_results(query, self._max_yahoo_results)
2419                         return
2420                 else:
2421                         try:
2422                                 n = long(prefix)
2423                                 if n <= 0:
2424                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2425                                         return
2426                                 elif n > self._max_yahoo_results:
2427                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2428                                         n = self._max_yahoo_results
2429                                 self._download_n_results(query, n)
2430                                 return
2431                         except ValueError: # parsing prefix as integer fails
2432                                 self._download_n_results(query, 1)
2433                                 return
2434
2435         def _download_n_results(self, query, n):
2436                 """Downloads a specified number of results for a query"""
2437
2438                 video_ids = []
2439                 already_seen = set()
2440                 pagenum = 1
2441
2442                 while True:
2443                         self.report_download_page(query, pagenum)
2444                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2445                         request = urllib2.Request(result_url)
2446                         try:
2447                                 page = urllib2.urlopen(request).read()
2448                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2449                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2450                                 return
2451
2452                         # Extract video identifiers
2453                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2454                                 video_id = mobj.group(1)
2455                                 if video_id not in already_seen:
2456                                         video_ids.append(video_id)
2457                                         already_seen.add(video_id)
2458                                         if len(video_ids) == n:
2459                                                 # Specified n videos reached
2460                                                 for id in video_ids:
2461                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2462                                                 return
2463
2464                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2465                                 for id in video_ids:
2466                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2467                                 return
2468
2469                         pagenum = pagenum + 1
2470
2471
2472 class YoutubePlaylistIE(InfoExtractor):
2473         """Information Extractor for YouTube playlists."""
2474
2475         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2476         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2477         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2478         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2479         _youtube_ie = None
2480         IE_NAME = u'youtube:playlist'
2481
2482         def __init__(self, youtube_ie, downloader=None):
2483                 InfoExtractor.__init__(self, downloader)
2484                 self._youtube_ie = youtube_ie
2485
2486         def report_download_page(self, playlist_id, pagenum):
2487                 """Report attempt to download playlist page with given number."""
2488                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2489
2490         def _real_initialize(self):
2491                 self._youtube_ie.initialize()
2492
2493         def _real_extract(self, url):
2494                 # Extract playlist id
2495                 mobj = re.match(self._VALID_URL, url)
2496                 if mobj is None:
2497                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2498                         return
2499
2500                 # Single video case
2501                 if mobj.group(3) is not None:
2502                         self._youtube_ie.extract(mobj.group(3))
2503                         return
2504
2505                 # Download playlist pages
2506                 # prefix is 'p' as default for playlists but there are other types that need extra care
2507                 playlist_prefix = mobj.group(1)
2508                 if playlist_prefix == 'a':
2509                         playlist_access = 'artist'
2510                 else:
2511                         playlist_prefix = 'p'
2512                         playlist_access = 'view_play_list'
2513                 playlist_id = mobj.group(2)
2514                 video_ids = []
2515                 pagenum = 1
2516
2517                 while True:
2518                         self.report_download_page(playlist_id, pagenum)
2519                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2520                         request = urllib2.Request(url)
2521                         try:
2522                                 page = urllib2.urlopen(request).read()
2523                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2524                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2525                                 return
2526
2527                         # Extract video identifiers
2528                         ids_in_page = []
2529                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2530                                 if mobj.group(1) not in ids_in_page:
2531                                         ids_in_page.append(mobj.group(1))
2532                         video_ids.extend(ids_in_page)
2533
2534                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2535                                 break
2536                         pagenum = pagenum + 1
2537
2538                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2539                 playlistend = self._downloader.params.get('playlistend', -1)
2540                 video_ids = video_ids[playliststart:playlistend]
2541
2542                 for id in video_ids:
2543                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2544                 return
2545
2546
2547 class YoutubeUserIE(InfoExtractor):
2548         """Information Extractor for YouTube users."""
2549
2550         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2551         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2552         _GDATA_PAGE_SIZE = 50
2553         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2554         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2555         _youtube_ie = None
2556         IE_NAME = u'youtube:user'
2557
2558         def __init__(self, youtube_ie, downloader=None):
2559                 InfoExtractor.__init__(self, downloader)
2560                 self._youtube_ie = youtube_ie
2561
2562         def report_download_page(self, username, start_index):
2563                 """Report attempt to download user page."""
2564                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2565                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2566
2567         def _real_initialize(self):
2568                 self._youtube_ie.initialize()
2569
2570         def _real_extract(self, url):
2571                 # Extract username
2572                 mobj = re.match(self._VALID_URL, url)
2573                 if mobj is None:
2574                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2575                         return
2576
2577                 username = mobj.group(1)
2578
2579                 # Download video ids using YouTube Data API. Result size per
2580                 # query is limited (currently to 50 videos) so we need to query
2581                 # page by page until there are no video ids - it means we got
2582                 # all of them.
2583
2584                 video_ids = []
2585                 pagenum = 0
2586
2587                 while True:
2588                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2589                         self.report_download_page(username, start_index)
2590
2591                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2592
2593                         try:
2594                                 page = urllib2.urlopen(request).read()
2595                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2596                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2597                                 return
2598
2599                         # Extract video identifiers
2600                         ids_in_page = []
2601
2602                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2603                                 if mobj.group(1) not in ids_in_page:
2604                                         ids_in_page.append(mobj.group(1))
2605
2606                         video_ids.extend(ids_in_page)
2607
2608                         # A little optimization - if current page is not
2609                         # "full", ie. does not contain PAGE_SIZE video ids then
2610                         # we can assume that this page is the last one - there
2611                         # are no more ids on further pages - no need to query
2612                         # again.
2613
2614                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2615                                 break
2616
2617                         pagenum += 1
2618
2619                 all_ids_count = len(video_ids)
2620                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2621                 playlistend = self._downloader.params.get('playlistend', -1)
2622
2623                 if playlistend == -1:
2624                         video_ids = video_ids[playliststart:]
2625                 else:
2626                         video_ids = video_ids[playliststart:playlistend]
2627
2628                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2629                                 (username, all_ids_count, len(video_ids)))
2630
2631                 for video_id in video_ids:
2632                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2633
2634
2635 class DepositFilesIE(InfoExtractor):
2636         """Information extractor for depositfiles.com"""
2637
2638         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2639         IE_NAME = u'DepositFiles'
2640
2641         def __init__(self, downloader=None):
2642                 InfoExtractor.__init__(self, downloader)
2643
2644         def report_download_webpage(self, file_id):
2645                 """Report webpage download."""
2646                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2647
2648         def report_extraction(self, file_id):
2649                 """Report information extraction."""
2650                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2651
2652         def _real_initialize(self):
2653                 return
2654
2655         def _real_extract(self, url):
2656                 # At this point we have a new file
2657                 self._downloader.increment_downloads()
2658
2659                 file_id = url.split('/')[-1]
2660                 # Rebuild url in english locale
2661                 url = 'http://depositfiles.com/en/files/' + file_id
2662
2663                 # Retrieve file webpage with 'Free download' button pressed
2664                 free_download_indication = { 'gateway_result' : '1' }
2665                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2666                 try:
2667                         self.report_download_webpage(file_id)
2668                         webpage = urllib2.urlopen(request).read()
2669                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2670                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2671                         return
2672
2673                 # Search for the real file URL
2674                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2675                 if (mobj is None) or (mobj.group(1) is None):
2676                         # Try to figure out reason of the error.
2677                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2678                         if (mobj is not None) and (mobj.group(1) is not None):
2679                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2680                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2681                         else:
2682                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2683                         return
2684
2685                 file_url = mobj.group(1)
2686                 file_extension = os.path.splitext(file_url)[1][1:]
2687
2688                 # Search for file title
2689                 mobj = re.search(r'<b title="(.*?)">', webpage)
2690                 if mobj is None:
2691                         self._downloader.trouble(u'ERROR: unable to extract title')
2692                         return
2693                 file_title = mobj.group(1).decode('utf-8')
2694
2695                 try:
2696                         # Process file information
2697                         self._downloader.process_info({
2698                                 'id':           file_id.decode('utf-8'),
2699                                 'url':          file_url.decode('utf-8'),
2700                                 'uploader':     u'NA',
2701                                 'upload_date':  u'NA',
2702                                 'title':        file_title,
2703                                 'stitle':       file_title,
2704                                 'ext':          file_extension.decode('utf-8'),
2705                                 'format':       u'NA',
2706                                 'player_url':   None,
2707                         })
2708                 except UnavailableVideoError, err:
2709                         self._downloader.trouble(u'ERROR: unable to download file')
2710
2711
2712 class FacebookIE(InfoExtractor):
2713         """Information Extractor for Facebook"""
2714
2715         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2716         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2717         _NETRC_MACHINE = 'facebook'
2718         _available_formats = ['video', 'highqual', 'lowqual']
2719         _video_extensions = {
2720                 'video': 'mp4',
2721                 'highqual': 'mp4',
2722                 'lowqual': 'mp4',
2723         }
2724         IE_NAME = u'facebook'
2725
2726         def __init__(self, downloader=None):
2727                 InfoExtractor.__init__(self, downloader)
2728
2729         def _reporter(self, message):
2730                 """Add header and report message."""
2731                 self._downloader.to_screen(u'[facebook] %s' % message)
2732
2733         def report_login(self):
2734                 """Report attempt to log in."""
2735                 self._reporter(u'Logging in')
2736
2737         def report_video_webpage_download(self, video_id):
2738                 """Report attempt to download video webpage."""
2739                 self._reporter(u'%s: Downloading video webpage' % video_id)
2740
2741         def report_information_extraction(self, video_id):
2742                 """Report attempt to extract video information."""
2743                 self._reporter(u'%s: Extracting video information' % video_id)
2744
2745         def _parse_page(self, video_webpage):
2746                 """Extract video information from page"""
2747                 # General data
2748                 data = {'title': r'\("video_title", "(.*?)"\)',
2749                         'description': r'<div class="datawrap">(.*?)</div>',
2750                         'owner': r'\("video_owner_name", "(.*?)"\)',
2751                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2752                         }
2753                 video_info = {}
2754                 for piece in data.keys():
2755                         mobj = re.search(data[piece], video_webpage)
2756                         if mobj is not None:
2757                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2758
2759                 # Video urls
2760                 video_urls = {}
2761                 for fmt in self._available_formats:
2762                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2763                         if mobj is not None:
2764                                 # URL is in a Javascript segment inside an escaped Unicode format within
2765                                 # the generally utf-8 page
2766                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2767                 video_info['video_urls'] = video_urls
2768
2769                 return video_info
2770
2771         def _real_initialize(self):
2772                 if self._downloader is None:
2773                         return
2774
2775                 useremail = None
2776                 password = None
2777                 downloader_params = self._downloader.params
2778
2779                 # Attempt to use provided username and password or .netrc data
2780                 if downloader_params.get('username', None) is not None:
2781                         useremail = downloader_params['username']
2782                         password = downloader_params['password']
2783                 elif downloader_params.get('usenetrc', False):
2784                         try:
2785                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2786                                 if info is not None:
2787                                         useremail = info[0]
2788                                         password = info[2]
2789                                 else:
2790                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2791                         except (IOError, netrc.NetrcParseError), err:
2792                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2793                                 return
2794
2795                 if useremail is None:
2796                         return
2797
2798                 # Log in
2799                 login_form = {
2800                         'email': useremail,
2801                         'pass': password,
2802                         'login': 'Log+In'
2803                         }
2804                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2805                 try:
2806                         self.report_login()
2807                         login_results = urllib2.urlopen(request).read()
2808                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2809                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2810                                 return
2811                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2812                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2813                         return
2814
2815         def _real_extract(self, url):
2816                 mobj = re.match(self._VALID_URL, url)
2817                 if mobj is None:
2818                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2819                         return
2820                 video_id = mobj.group('ID')
2821
2822                 # Get video webpage
2823                 self.report_video_webpage_download(video_id)
2824                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2825                 try:
2826                         page = urllib2.urlopen(request)
2827                         video_webpage = page.read()
2828                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2829                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2830                         return
2831
2832                 # Start extracting information
2833                 self.report_information_extraction(video_id)
2834
2835                 # Extract information
2836                 video_info = self._parse_page(video_webpage)
2837
2838                 # uploader
2839                 if 'owner' not in video_info:
2840                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2841                         return
2842                 video_uploader = video_info['owner']
2843
2844                 # title
2845                 if 'title' not in video_info:
2846                         self._downloader.trouble(u'ERROR: unable to extract video title')
2847                         return
2848                 video_title = video_info['title']
2849                 video_title = video_title.decode('utf-8')
2850                 video_title = sanitize_title(video_title)
2851
2852                 # simplified title
2853                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2854                 simple_title = simple_title.strip(ur'_')
2855
2856                 # thumbnail image
2857                 if 'thumbnail' not in video_info:
2858                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2859                         video_thumbnail = ''
2860                 else:
2861                         video_thumbnail = video_info['thumbnail']
2862
2863                 # upload date
2864                 upload_date = u'NA'
2865                 if 'upload_date' in video_info:
2866                         upload_time = video_info['upload_date']
2867                         timetuple = email.utils.parsedate_tz(upload_time)
2868                         if timetuple is not None:
2869                                 try:
2870                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2871                                 except:
2872                                         pass
2873
2874                 # description
2875                 video_description = video_info.get('description', 'No description available.')
2876
2877                 url_map = video_info['video_urls']
2878                 if len(url_map.keys()) > 0:
2879                         # Decide which formats to download
2880                         req_format = self._downloader.params.get('format', None)
2881                         format_limit = self._downloader.params.get('format_limit', None)
2882
2883                         if format_limit is not None and format_limit in self._available_formats:
2884                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2885                         else:
2886                                 format_list = self._available_formats
2887                         existing_formats = [x for x in format_list if x in url_map]
2888                         if len(existing_formats) == 0:
2889                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2890                                 return
2891                         if req_format is None:
2892                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2893                         elif req_format == 'worst':
2894                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2895                         elif req_format == '-1':
2896                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2897                         else:
2898                                 # Specific format
2899                                 if req_format not in url_map:
2900                                         self._downloader.trouble(u'ERROR: requested format not available')
2901                                         return
2902                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2903
2904                 for format_param, video_real_url in video_url_list:
2905
2906                         # At this point we have a new video
2907                         self._downloader.increment_downloads()
2908
2909                         # Extension
2910                         video_extension = self._video_extensions.get(format_param, 'mp4')
2911
2912                         try:
2913                                 # Process video information
2914                                 self._downloader.process_info({
2915                                         'id':           video_id.decode('utf-8'),
2916                                         'url':          video_real_url.decode('utf-8'),
2917                                         'uploader':     video_uploader.decode('utf-8'),
2918                                         'upload_date':  upload_date,
2919                                         'title':        video_title,
2920                                         'stitle':       simple_title,
2921                                         'ext':          video_extension.decode('utf-8'),
2922                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2923                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2924                                         'description':  video_description.decode('utf-8'),
2925                                         'player_url':   None,
2926                                 })
2927                         except UnavailableVideoError, err:
2928                                 self._downloader.trouble(u'\nERROR: unable to download video')
2929
2930 class BlipTVIE(InfoExtractor):
2931         """Information extractor for blip.tv"""
2932
2933         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2934         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2935         IE_NAME = u'blip.tv'
2936
2937         def report_extraction(self, file_id):
2938                 """Report information extraction."""
2939                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2940
2941         def report_direct_download(self, title):
2942                 """Report information extraction."""
2943                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2944
2945         def _simplify_title(self, title):
2946                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2947                 res = res.strip(ur'_')
2948                 return res
2949
2950         def _real_extract(self, url):
2951                 mobj = re.match(self._VALID_URL, url)
2952                 if mobj is None:
2953                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2954                         return
2955
2956                 if '?' in url:
2957                         cchar = '&'
2958                 else:
2959                         cchar = '?'
2960                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2961                 request = urllib2.Request(json_url)
2962                 self.report_extraction(mobj.group(1))
2963                 info = None
2964                 try:
2965                         urlh = urllib2.urlopen(request)
2966                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2967                                 basename = url.split('/')[-1]
2968                                 title,ext = os.path.splitext(basename)
2969                                 ext = ext.replace('.', '')
2970                                 self.report_direct_download(title)
2971                                 info = {
2972                                         'id': title,
2973                                         'url': url,
2974                                         'title': title,
2975                                         'stitle': self._simplify_title(title),
2976                                         'ext': ext,
2977                                         'urlhandle': urlh
2978                                 }
2979                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2980                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2981                         return
2982                 if info is None: # Regular URL
2983                         try:
2984                                 json_code = urlh.read()
2985                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2986                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2987                                 return
2988
2989                         try:
2990                                 json_data = json.loads(json_code)
2991                                 if 'Post' in json_data:
2992                                         data = json_data['Post']
2993                                 else:
2994                                         data = json_data
2995
2996                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2997                                 video_url = data['media']['url']
2998                                 umobj = re.match(self._URL_EXT, video_url)
2999                                 if umobj is None:
3000                                         raise ValueError('Can not determine filename extension')
3001                                 ext = umobj.group(1)
3002
3003                                 info = {
3004                                         'id': data['item_id'],
3005                                         'url': video_url,
3006                                         'uploader': data['display_name'],
3007                                         'upload_date': upload_date,
3008                                         'title': data['title'],
3009                                         'stitle': self._simplify_title(data['title']),
3010                                         'ext': ext,
3011                                         'format': data['media']['mimeType'],
3012                                         'thumbnail': data['thumbnailUrl'],
3013                                         'description': data['description'],
3014                                         'player_url': data['embedUrl']
3015                                 }
3016                         except (ValueError,KeyError), err:
3017                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3018                                 return
3019
3020                 self._downloader.increment_downloads()
3021
3022                 try:
3023                         self._downloader.process_info(info)
3024                 except UnavailableVideoError, err:
3025                         self._downloader.trouble(u'\nERROR: unable to download video')
3026
3027
3028 class MyVideoIE(InfoExtractor):
3029         """Information Extractor for myvideo.de."""
3030
3031         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3032         IE_NAME = u'myvideo'
3033
3034         def __init__(self, downloader=None):
3035                 InfoExtractor.__init__(self, downloader)
3036
3037         def report_download_webpage(self, video_id):
3038                 """Report webpage download."""
3039                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3040
3041         def report_extraction(self, video_id):
3042                 """Report information extraction."""
3043                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3044
3045         def _real_initialize(self):
3046                 return
3047
3048         def _real_extract(self,url):
3049                 mobj = re.match(self._VALID_URL, url)
3050                 if mobj is None:
3051                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3052                         return
3053
3054                 video_id = mobj.group(1)
3055                 simple_title = mobj.group(2).decode('utf-8')
3056                 # should actually not be necessary
3057                 simple_title = sanitize_title(simple_title)
3058                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3059
3060                 # Get video webpage
3061                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3062                 try:
3063                         self.report_download_webpage(video_id)
3064                         webpage = urllib2.urlopen(request).read()
3065                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3066                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3067                         return
3068
3069                 self.report_extraction(video_id)
3070                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3071                                  webpage)
3072                 if mobj is None:
3073                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3074                         return
3075                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3076
3077                 mobj = re.search('<title>([^<]+)</title>', webpage)
3078                 if mobj is None:
3079                         self._downloader.trouble(u'ERROR: unable to extract title')
3080                         return
3081
3082                 video_title = mobj.group(1)
3083                 video_title = sanitize_title(video_title)
3084
3085                 try:
3086                         self._downloader.process_info({
3087                                 'id':           video_id,
3088                                 'url':          video_url,
3089                                 'uploader':     u'NA',
3090                                 'upload_date':  u'NA',
3091                                 'title':        video_title,
3092                                 'stitle':       simple_title,
3093                                 'ext':          u'flv',
3094                                 'format':       u'NA',
3095                                 'player_url':   None,
3096                         })
3097                 except UnavailableVideoError:
3098                         self._downloader.trouble(u'\nERROR: Unable to download video')
3099
3100 class ComedyCentralIE(InfoExtractor):
3101         """Information extractor for The Daily Show and Colbert Report """
3102
3103         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3104         IE_NAME = u'comedycentral'
3105
3106         def report_extraction(self, episode_id):
3107                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3108
3109         def report_config_download(self, episode_id):
3110                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3111
3112         def report_index_download(self, episode_id):
3113                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3114
3115         def report_player_url(self, episode_id):
3116                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3117
3118         def _simplify_title(self, title):
3119                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3120                 res = res.strip(ur'_')
3121                 return res
3122
3123         def _real_extract(self, url):
3124                 mobj = re.match(self._VALID_URL, url)
3125                 if mobj is None:
3126                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3127                         return
3128
3129                 if mobj.group('shortname'):
3130                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3131                                 url = 'http://www.thedailyshow.com/full-episodes/'
3132                         else:
3133                                 url = 'http://www.colbertnation.com/full-episodes/'
3134                         mobj = re.match(self._VALID_URL, url)
3135                         assert mobj is not None
3136
3137                 dlNewest = not mobj.group('episode')
3138                 if dlNewest:
3139                         epTitle = mobj.group('showname')
3140                 else:
3141                         epTitle = mobj.group('episode')
3142
3143                 req = urllib2.Request(url)
3144                 self.report_extraction(epTitle)
3145                 try:
3146                         htmlHandle = urllib2.urlopen(req)
3147                         html = htmlHandle.read()
3148                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3149                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3150                         return
3151                 if dlNewest:
3152                         url = htmlHandle.geturl()
3153                         mobj = re.match(self._VALID_URL, url)
3154                         if mobj is None:
3155                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3156                                 return
3157                         if mobj.group('episode') == '':
3158                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3159                                 return
3160                         epTitle = mobj.group('episode')
3161
3162                 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3163                 if len(mMovieParams) == 0:
3164                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3165                         return
3166
3167                 playerUrl_raw = mMovieParams[0][0]
3168                 self.report_player_url(epTitle)
3169                 try:
3170                         urlHandle = urllib2.urlopen(playerUrl_raw)
3171                         playerUrl = urlHandle.geturl()
3172                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3173                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3174                         return
3175
3176                 uri = mMovieParams[0][1]
3177                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3178                 self.report_index_download(epTitle)
3179                 try:
3180                         indexXml = urllib2.urlopen(indexUrl).read()
3181                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3182                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3183                         return
3184
3185                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3186                 itemEls = idoc.findall('.//item')
3187                 for itemEl in itemEls:
3188                         mediaId = itemEl.findall('./guid')[0].text
3189                         shortMediaId = mediaId.split(':')[-1]
3190                         showId = mediaId.split(':')[-2].replace('.com', '')
3191                         officialTitle = itemEl.findall('./title')[0].text
3192                         officialDate = itemEl.findall('./pubDate')[0].text
3193
3194                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3195                                                 urllib.urlencode({'uri': mediaId}))
3196                         configReq = urllib2.Request(configUrl)
3197                         self.report_config_download(epTitle)
3198                         try:
3199                                 configXml = urllib2.urlopen(configReq).read()
3200                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3201                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3202                                 return
3203
3204                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3205                         turls = []
3206                         for rendition in cdoc.findall('.//rendition'):
3207                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3208                                 turls.append(finfo)
3209
3210                         if len(turls) == 0:
3211                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3212                                 continue
3213
3214                         # For now, just pick the highest bitrate
3215                         format,video_url = turls[-1]
3216
3217                         self._downloader.increment_downloads()
3218
3219                         effTitle = showId + '-' + epTitle
3220                         info = {
3221                                 'id': shortMediaId,
3222                                 'url': video_url,
3223                                 'uploader': showId,
3224                                 'upload_date': officialDate,
3225                                 'title': effTitle,
3226                                 'stitle': self._simplify_title(effTitle),
3227                                 'ext': 'mp4',
3228                                 'format': format,
3229                                 'thumbnail': None,
3230                                 'description': officialTitle,
3231                                 'player_url': playerUrl
3232                         }
3233
3234                         try:
3235                                 self._downloader.process_info(info)
3236                         except UnavailableVideoError, err:
3237                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3238                                 continue
3239
3240
3241 class EscapistIE(InfoExtractor):
3242         """Information extractor for The Escapist """
3243
3244         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3245         IE_NAME = u'escapist'
3246
3247         def report_extraction(self, showName):
3248                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3249
3250         def report_config_download(self, showName):
3251                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3252
3253         def _simplify_title(self, title):
3254                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3255                 res = res.strip(ur'_')
3256                 return res
3257
3258         def _real_extract(self, url):
3259                 htmlParser = HTMLParser.HTMLParser()
3260
3261                 mobj = re.match(self._VALID_URL, url)
3262                 if mobj is None:
3263                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3264                         return
3265                 showName = mobj.group('showname')
3266                 videoId = mobj.group('episode')
3267
3268                 self.report_extraction(showName)
3269                 try:
3270                         webPage = urllib2.urlopen(url).read()
3271                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3272                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3273                         return
3274
3275                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3276                 description = htmlParser.unescape(descMatch.group(1))
3277                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3278                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3279                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3280                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3281                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3282                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3283
3284                 self.report_config_download(showName)
3285                 try:
3286                         configJSON = urllib2.urlopen(configUrl).read()
3287                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3288                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3289                         return
3290
3291                 # Technically, it's JavaScript, not JSON
3292                 configJSON = configJSON.replace("'", '"')
3293
3294                 try:
3295                         config = json.loads(configJSON)
3296                 except (ValueError,), err:
3297                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3298                         return
3299
3300                 playlist = config['playlist']
3301                 videoUrl = playlist[1]['url']
3302
3303                 self._downloader.increment_downloads()
3304                 info = {
3305                         'id': videoId,
3306                         'url': videoUrl,
3307                         'uploader': showName,
3308                         'upload_date': None,
3309                         'title': showName,
3310                         'stitle': self._simplify_title(showName),
3311                         'ext': 'flv',
3312                         'format': 'flv',
3313                         'thumbnail': imgUrl,
3314                         'description': description,
3315                         'player_url': playerUrl,
3316                 }
3317
3318                 try:
3319                         self._downloader.process_info(info)
3320                 except UnavailableVideoError, err:
3321                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3322
3323
3324 class CollegeHumorIE(InfoExtractor):
3325         """Information extractor for collegehumor.com"""
3326
3327         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3328         IE_NAME = u'collegehumor'
3329
3330         def report_webpage(self, video_id):
3331                 """Report information extraction."""
3332                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3333
3334         def report_extraction(self, video_id):
3335                 """Report information extraction."""
3336                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3337
3338         def _simplify_title(self, title):
3339                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3340                 res = res.strip(ur'_')
3341                 return res
3342
3343         def _real_extract(self, url):
3344                 htmlParser = HTMLParser.HTMLParser()
3345
3346                 mobj = re.match(self._VALID_URL, url)
3347                 if mobj is None:
3348                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3349                         return
3350                 video_id = mobj.group('videoid')
3351
3352                 self.report_webpage(video_id)
3353                 request = urllib2.Request(url)
3354                 try:
3355                         webpage = urllib2.urlopen(request).read()
3356                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3357                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3358                         return
3359
3360                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3361                 if m is None:
3362                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3363                         return
3364                 internal_video_id = m.group('internalvideoid')
3365
3366                 info = {
3367                         'id': video_id,
3368                         'internal_id': internal_video_id,
3369                 }
3370
3371                 self.report_extraction(video_id)
3372                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3373                 try:
3374                         metaXml = urllib2.urlopen(xmlUrl).read()
3375                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3376                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3377                         return
3378
3379                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3380                 try:
3381                         videoNode = mdoc.findall('./video')[0]
3382                         info['description'] = videoNode.findall('./description')[0].text
3383                         info['title'] = videoNode.findall('./caption')[0].text
3384                         info['stitle'] = self._simplify_title(info['title'])
3385                         info['url'] = videoNode.findall('./file')[0].text
3386                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3387                         info['ext'] = info['url'].rpartition('.')[2]
3388                         info['format'] = info['ext']
3389                 except IndexError:
3390                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3391                         return
3392
3393                 self._downloader.increment_downloads()
3394
3395                 try:
3396                         self._downloader.process_info(info)
3397                 except UnavailableVideoError, err:
3398                         self._downloader.trouble(u'\nERROR: unable to download video')
3399
3400
3401 class XVideosIE(InfoExtractor):
3402         """Information extractor for xvideos.com"""
3403
3404         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3405         IE_NAME = u'xvideos'
3406
3407         def report_webpage(self, video_id):
3408                 """Report information extraction."""
3409                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3410
3411         def report_extraction(self, video_id):
3412                 """Report information extraction."""
3413                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3414
3415         def _simplify_title(self, title):
3416                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3417                 res = res.strip(ur'_')
3418                 return res
3419
3420         def _real_extract(self, url):
3421                 htmlParser = HTMLParser.HTMLParser()
3422
3423                 mobj = re.match(self._VALID_URL, url)
3424                 if mobj is None:
3425                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3426                         return
3427                 video_id = mobj.group(1).decode('utf-8')
3428
3429                 self.report_webpage(video_id)
3430
3431                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3432                 try:
3433                         webpage = urllib2.urlopen(request).read()
3434                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3435                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3436                         return
3437
3438                 self.report_extraction(video_id)
3439
3440
3441                 # Extract video URL
3442                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3443                 if mobj is None:
3444                         self._downloader.trouble(u'ERROR: unable to extract video url')
3445                         return
3446                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3447
3448
3449                 # Extract title
3450                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3451                 if mobj is None:
3452                         self._downloader.trouble(u'ERROR: unable to extract video title')
3453                         return
3454                 video_title = mobj.group(1).decode('utf-8')
3455
3456
3457                 # Extract video thumbnail
3458                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3459                 if mobj is None:
3460                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3461                         return
3462                 video_thumbnail = mobj.group(1).decode('utf-8')
3463
3464
3465
3466                 self._downloader.increment_downloads()
3467                 info = {
3468                         'id': video_id,
3469                         'url': video_url,
3470                         'uploader': None,
3471                         'upload_date': None,
3472                         'title': video_title,
3473                         'stitle': self._simplify_title(video_title),
3474                         'ext': 'flv',
3475                         'format': 'flv',
3476                         'thumbnail': video_thumbnail,
3477                         'description': None,
3478                         'player_url': None,
3479                 }
3480
3481                 try:
3482                         self._downloader.process_info(info)
3483                 except UnavailableVideoError, err:
3484                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3485
3486
3487 class SoundcloudIE(InfoExtractor):
3488         """Information extractor for soundcloud.com
3489            To access the media, the uid of the song and a stream token
3490            must be extracted from the page source and the script must make
3491            a request to media.soundcloud.com/crossdomain.xml. Then
3492            the media can be grabbed by requesting from an url composed
3493            of the stream token and uid
3494          """
3495
3496         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3497         IE_NAME = u'soundcloud'
3498
3499         def __init__(self, downloader=None):
3500                 InfoExtractor.__init__(self, downloader)
3501
3502         def report_webpage(self, video_id):
3503                 """Report information extraction."""
3504                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3505
3506         def report_extraction(self, video_id):
3507                 """Report information extraction."""
3508                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3509
3510         def _real_initialize(self):
3511                 return
3512
3513         def _real_extract(self, url):
3514                 htmlParser = HTMLParser.HTMLParser()
3515
3516                 mobj = re.match(self._VALID_URL, url)
3517                 if mobj is None:
3518                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3519                         return
3520
3521                 # extract uploader (which is in the url)
3522                 uploader = mobj.group(1).decode('utf-8')
3523                 # extract simple title (uploader + slug of song title)
3524                 slug_title =  mobj.group(2).decode('utf-8')
3525                 simple_title = uploader + '-' + slug_title
3526
3527                 self.report_webpage('%s/%s' % (uploader, slug_title))
3528
3529                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3530                 try:
3531                         webpage = urllib2.urlopen(request).read()
3532                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3533                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3534                         return
3535
3536                 self.report_extraction('%s/%s' % (uploader, slug_title))
3537
3538                 # extract uid and stream token that soundcloud hands out for access
3539                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3540                 if mobj:
3541                         video_id = mobj.group(1)
3542                         stream_token = mobj.group(2)
3543
3544                 # extract unsimplified title
3545                 mobj = re.search('"title":"(.*?)",', webpage)
3546                 if mobj:
3547                         title = mobj.group(1)
3548
3549                 # construct media url (with uid/token)
3550                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3551                 mediaURL = mediaURL % (video_id, stream_token)
3552
3553                 # description
3554                 description = u'No description available'
3555                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3556                 if mobj:
3557                         description = mobj.group(1)
3558
3559                 # upload date
3560                 upload_date = None
3561                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3562                 if mobj:
3563                         try:
3564                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3565                         except Exception as e:
3566                                 print str(e)
3567
3568                 # for soundcloud, a request to a cross domain is required for cookies
3569                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3570
3571                 try:
3572                         self._downloader.process_info({
3573                                 'id':           video_id.decode('utf-8'),
3574                                 'url':          mediaURL,
3575                                 'uploader':     uploader.decode('utf-8'),
3576                                 'upload_date':  upload_date,
3577                                 'title':        simple_title.decode('utf-8'),
3578                                 'stitle':       simple_title.decode('utf-8'),
3579                                 'ext':          u'mp3',
3580                                 'format':       u'NA',
3581                                 'player_url':   None,
3582                                 'description': description.decode('utf-8')
3583                         })
3584                 except UnavailableVideoError:
3585                         self._downloader.trouble(u'\nERROR: unable to download video')
3586
3587
class InfoQIE(InfoExtractor):
        """Information extractor for infoq.com"""

        _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
        IE_NAME = u'infoq'

        def report_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

        def _simplify_title(self, title):
                """Return title with every run of characters outside
                simple_title_chars replaced by one underscore, and with leading
                and trailing underscores removed."""
                res = re.sub(u'(?u)([^%s]+)' % simple_title_chars, u'_', title)
                res = res.strip(u'_')
                return res

        def _real_extract(self, url):
                """Download the page at url, extract the stream URL, title and
                description from it and pass the result to the downloader.

                Every failure is reported through the downloader's trouble()
                method, after which the method returns without a value.
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                self.report_webpage(url)

                request = urllib2.Request(url)
                try:
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                        return

                self.report_extraction(url)

                # Extract video URL: the page carries the stream path as a
                # base64 encoded, URL-quoted string in the jsclassref attribute.
                mobj = re.search(r"jsclassref='([^']*)'", webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video url')
                        return
                video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

                # Extract title
                mobj = re.search(r'contentTitle = "(.*?)";', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return
                video_title = mobj.group(1).decode('utf-8')

                # Extract description (optional, a default text is used otherwise)
                video_description = u'No description available.'
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
                if mobj is not None:
                        video_description = mobj.group(1).decode('utf-8')

                # Split the last path component at its LAST dot only, so that
                # names containing additional dots do not break the unpacking.
                video_filename = video_url.split('/')[-1]
                video_id, dot, extension = video_filename.rpartition('.')
                if not dot:
                        # No dot at all: there is no extension to build a file name from.
                        self._downloader.trouble(u'ERROR: unable to extract video extension')
                        return

                self._downloader.increment_downloads()
                info = {
                        'id': video_id,
                        'url': video_url,
                        'uploader': None,
                        'upload_date': None,
                        'title': video_title,
                        'stitle': self._simplify_title(video_title),
                        'ext': extension,
                        'format': extension, # Extension is always(?) mp4, but seems to be flv
                        'thumbnail': None,
                        'description': video_description,
                        'player_url': None,
                }

                try:
                        self._downloader.process_info(info)
                except UnavailableVideoError:
                        self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3671
3672
3673
class PostProcessor(object):
        """Base class for post processors.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. After a successful download
        it calls run() on each object of the chain in turn: the first one
        receives the initial argument, every following one receives whatever
        the previous run() returned.

        Processing stops when the end of the chain is reached or as soon as
        one run() call returns None.

        Like InfoExtractor objects, PostProcessor objects and their
        downloader register with each other ("mutual registration").
        """

        # Downloader this PP is attached to; None while unattached.
        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this PP to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Process a downloaded file.

                "information" is a dictionary like the ones built by
                InfoExtractors, with one additional field, "filepath", holding
                the path of the downloaded file.

                Return None to stop the postprocessing chain, or an
                information dictionary (possibly the received one with some
                fields changed) that is handed to the next object in the
                chain.

                A PostProcessingError may be raised; the calling downloader
                takes it into account.
                """
                # The base class is a pass-through: hand back what was received.
                return information
3719
3720
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that turns a downloaded file into an audio-only file.

        The external "ffprobe" program is used to find out the audio codec of
        the downloaded file and the external "ffmpeg" program to copy or
        re-encode the audio track.
        """

        def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
                """Store the conversion preferences.

                preferredcodec:   'best' (also used when None is given) or the
                                  name of the wanted codec; run() knows how to
                                  produce 'mp3', 'aac' and 'vorbis'.
                preferredquality: value for ffmpeg's -ab option when the audio
                                  is re-encoded, or None to omit the option.
                keepvideo:        if true, the downloaded file is not removed
                                  after the audio file has been written.
                """
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec
                self._preferredquality = preferredquality
                self._keepvideo = keepvideo

        @staticmethod
        def get_audio_codec(path):
                """Return the audio codec name ffprobe reports for path.

                None is returned when ffprobe cannot be run, exits with a
                non-zero status or lists no audio stream.
                """
                try:
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffprobe', '-show_streams', '--', path]
                                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                                output = handle.communicate()[0]
                                if handle.wait() != 0:
                                        return None
                        finally:
                                # Do not leak the descriptor used to discard stderr.
                                devnull.close()
                except (IOError, OSError):
                        return None
                # Remember the most recent codec_name and return it once a
                # following line marks the stream it belongs to as audio.
                audio_codec = None
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=')[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                """Run ffmpeg to write the audio of path to out_path.

                codec is passed to -acodec and more_opts is a list of extra
                command line arguments. Returns True if ffmpeg exited with
                status 0, False otherwise or if it could not be started.
                """
                try:
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
                                return (ret == 0)
                        finally:
                                # Do not leak the descriptor used to discard the output.
                                devnull.close()
                except (IOError, OSError):
                        return False

        def _conversion_settings(self, filecodec):
                """Return (acodec, extension, more_opts) for a file whose audio codec is filecodec."""
                more_opts = []
                if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
                        if filecodec in ['aac', 'mp3', 'vorbis']:
                                # Lossless if possible
                                acodec = 'copy'
                                extension = filecodec
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                                if filecodec == 'vorbis':
                                        extension = 'ogg'
                        else:
                                # MP3 otherwise.
                                acodec = 'libmp3lame'
                                extension = 'mp3'
                                if self._preferredquality is not None:
                                        more_opts += ['-ab', self._preferredquality]
                else:
                        # We convert the audio (lossy)
                        acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
                        extension = self._preferredcodec
                        if self._preferredquality is not None:
                                more_opts += ['-ab', self._preferredquality]
                        if self._preferredcodec == 'aac':
                                more_opts += ['-f', 'adts']
                        if self._preferredcodec == 'vorbis':
                                extension = 'ogg'
                return acodec, extension, more_opts

        def run(self, information):
                """Extract the audio of information['filepath'] into a new file.

                On success the information dictionary is returned with
                'filepath' pointing to the audio file. None is returned (which
                stops the postprocessing chain) when the audio codec cannot be
                determined, when ffmpeg fails or when the downloaded file
                cannot be removed afterwards.
                """
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                acodec, extension, more_opts = self._conversion_settings(filecodec)

                (prefix, ext) = os.path.splitext(path)
                new_path = prefix + '.' + extension

                if new_path == path:
                        # Converting a file onto itself would make ffmpeg overwrite its
                        # own input, and the cleanup below would then delete the only
                        # remaining copy. Leave the downloaded file untouched instead.
                        self._downloader.to_screen(u'[ffmpeg] %s is already the destination file, skipping conversion' % new_path)
                        return information

                self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
                status = self.run_ffmpeg(path, new_path, acodec, more_opts)

                if not status:
                        self._downloader.to_stderr(u'WARNING: error running ffmpeg')
                        return None

                # Try to update the date time for extracted audio file.
                if information.get('filetime') is not None:
                        try:
                                os.utime(new_path, (time.time(), information['filetime']))
                        except Exception:
                                # Best effort only; KeyboardInterrupt and SystemExit still propagate.
                                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

                if not self._keepvideo:
                        try:
                                os.remove(path)
                        except (IOError, OSError):
                                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                                return None

                information['filepath'] = new_path
                return information
3820
3821
def updateSelf(downloader, filename):
        ''' Update the program file with the latest version from the repository

        The content found at UPDATE_URL is downloaded and written over
        filename, unless the version string it contains equals __version__.
        The program exits through sys.exit() with an error message if
        filename is not writable, the download fails or the file cannot be
        written.
        '''
        # Note: downloader is only used here to print status messages
        if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen('Updating to latest version...')

        try:
                # Open the URL before entering the try/finally: if opening fails
                # there is no handle to close, and the error must reach the
                # handler below instead of being masked by a NameError.
                urlh = urllib.urlopen(UPDATE_URL)
                try:
                        newcontent = urlh.read()
                finally:
                        urlh.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to download latest version')

        vmatch = re.search("__version__ = '([^']+)'", newcontent)
        if vmatch is not None and vmatch.group(1) == __version__:
                downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
                return

        try:
                outf = open(filename, 'wb')
                try:
                        outf.write(newcontent)
                finally:
                        outf.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to overwrite current version')

        downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3854
def parseOpts():
        """Build the command line parser and parse the program arguments.

        Returns a (parser, opts, args) tuple: the optparse.OptionParser
        object, the parsed option values and the list of remaining
        positional arguments.
        """
        # Deferred imports
        # NOTE(review): getpass is not used inside this function, and main()
        # calls getpass.getpass() although this import is local to parseOpts;
        # confirm that a module-level import of getpass exists.
        import getpass
        import optparse

        def _format_option_string(option):
                ''' ('-o', '--option') -> -o, --format METAVAR'''

                opts = []

                if option._short_opts: opts.append(option._short_opts[0])
                if option._long_opts: opts.append(option._long_opts[0])
                if len(opts) > 1: opts.insert(1, ', ')

                if option.takes_value(): opts.append(' %s' % option.metavar)

                return "".join(opts)

        def _find_term_columns():
                ''' Return the terminal width in columns, or None if unknown '''
                columns = os.environ.get('COLUMNS', None)
                if columns:
                        try:
                                return int(columns)
                        except ValueError:
                                # Unusable COLUMNS value: fall back to asking the terminal.
                                pass

                try:
                        sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        out,err = sp.communicate()
                        return int(out.split()[1])
                except Exception:
                        # Best effort only (stty missing, no terminal, unexpected
                        # output...); KeyboardInterrupt and SystemExit still propagate.
                        pass
                return None

        max_width = 80
        max_help_position = 80

        # No need to wrap help messages if we're on a wide console
        columns = _find_term_columns()
        if columns: max_width = columns

        fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
        fmt.format_option_strings = _format_option_string

        kw = {
                'version'   : __version__,
                'formatter' : fmt,
                'usage' : '%prog [options] url [url...]',
                'conflict_handler' : 'resolve',
        }

        parser = optparse.OptionParser(**kw)

        # option groups
        general        = optparse.OptionGroup(parser, 'General Options')
        selection      = optparse.OptionGroup(parser, 'Video Selection')
        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        video_format   = optparse.OptionGroup(parser, 'Video Format Options')
        postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
        filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
        verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

        general.add_option('-h', '--help',
                        action='help', help='print this help text and exit')
        general.add_option('-v', '--version',
                        action='version', help='print program version and exit')
        general.add_option('-U', '--update',
                        action='store_true', dest='update_self', help='update this program to latest version')
        general.add_option('-i', '--ignore-errors',
                        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        general.add_option('-r', '--rate-limit',
                        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        general.add_option('-R', '--retries',
                        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        general.add_option('--dump-user-agent',
                        action='store_true', dest='dump_user_agent',
                        help='display the current browser identification', default=False)
        general.add_option('--list-extractors',
                        action='store_true', dest='list_extractors',
                        help='List all supported extractors and the URLs they would handle', default=False)

        selection.add_option('--playlist-start',
                        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        selection.add_option('--playlist-end',
                        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
        selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
        selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

        authentication.add_option('-u', '--username',
                        dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                        dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


        video_format.add_option('-f', '--format',
                        action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('--all-formats',
                        action='store_const', dest='format', help='download all available video formats', const='all')
        video_format.add_option('--max-quality',
                        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-F', '--list-formats',
                        action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


        verbosity.add_option('-q', '--quiet',
                        action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                        action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
        verbosity.add_option('--skip-download',
                        action='store_true', dest='skip_download', help='do not download the video', default=False)
        verbosity.add_option('-g', '--get-url',
                        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                        action='store_true', dest='getthumbnail',
                        help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                        action='store_true', dest='getdescription',
                        help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--get-filename',
                        action='store_true', dest='getfilename',
                        help='simulate, quiet but print output filename', default=False)
        verbosity.add_option('--get-format',
                        action='store_true', dest='getformat',
                        help='simulate, quiet but print output format', default=False)
        verbosity.add_option('--no-progress',
                        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        verbosity.add_option('--console-title',
                        action='store_true', dest='consoletitle',
                        help='display progress in console titlebar', default=False)


        filesystem.add_option('-t', '--title',
                        action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                        action='store_true', dest='autonumber',
                        help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                        dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
        filesystem.add_option('-a', '--batch-file',
                        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        filesystem.add_option('--no-continue',
                        action='store_false', dest='continue_dl',
                        help='do not resume partially downloaded files (restart from beginning)')
        filesystem.add_option('--cookies',
                        dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
        filesystem.add_option('--no-part',
                        action='store_true', dest='nopart', help='do not use .part files', default=False)
        filesystem.add_option('--no-mtime',
                        action='store_false', dest='updatetime',
                        help='do not use the Last-modified header to set the file modification time', default=True)
        filesystem.add_option('--write-description',
                        action='store_true', dest='writedescription',
                        help='write video description to a .description file', default=False)
        filesystem.add_option('--write-info-json',
                        action='store_true', dest='writeinfojson',
                        help='write video metadata to a .info.json file', default=False)


        postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
                        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
        postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
                        help='"best", "aac", "vorbis" or "mp3"; best by default')
        postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
                        help='ffmpeg audio bitrate specification, 128k by default')
        postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
                        help='keeps the video file on disk after the post-processing; the video is erased by default')


        # The order below is the order in which the groups are registered
        # with the parser.
        parser.add_option_group(general)
        parser.add_option_group(selection)
        parser.add_option_group(filesystem)
        parser.add_option_group(verbosity)
        parser.add_option_group(video_format)
        parser.add_option_group(authentication)
        parser.add_option_group(postproc)

        opts, args = parser.parse_args()

        return parser, opts, args
4041
def gen_extractors():
        """ Build the list of supported extractors, one instance of each.

        The position in the list is significant: a URL is handled by the
        first extractor of the list that matches it.
        """
        # These three instances are listed themselves and are also passed to
        # the constructors of other extractors below.
        youtube = YoutubeIE()
        google = GoogleIE()
        yahoo = YahooIE()

        extractors = [
                YoutubePlaylistIE(youtube),
                YoutubeUserIE(youtube),
                YoutubeSearchIE(youtube),
                youtube,
                MetacafeIE(youtube),
                DailymotionIE(),
                google,
                GoogleSearchIE(google),
                PhotobucketIE(),
                yahoo,
                YahooSearchIE(yahoo),
                DepositFilesIE(),
                FacebookIE(),
                BlipTVIE(),
                VimeoIE(),
                MyVideoIE(),
                ComedyCentralIE(),
                EscapistIE(),
                CollegeHumorIE(),
                XVideosIE(),
                SoundcloudIE(),
                InfoQIE(),

                GenericIE()
        ]
        return extractors
4075
4076 def main():
4077         parser, opts, args = parseOpts()
4078
4079         # Open appropriate CookieJar
4080         if opts.cookiefile is None:
4081                 jar = cookielib.CookieJar()
4082         else:
4083                 try:
4084                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4085                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4086                                 jar.load()
4087                 except (IOError, OSError), err:
4088                         sys.exit(u'ERROR: unable to open cookie file')
4089
4090         # Dump user agent
4091         if opts.dump_user_agent:
4092                 print std_headers['User-Agent']
4093                 sys.exit(0)
4094
4095         # Batch file verification
4096         batchurls = []
4097         if opts.batchfile is not None:
4098                 try:
4099                         if opts.batchfile == '-':
4100                                 batchfd = sys.stdin
4101                         else:
4102                                 batchfd = open(opts.batchfile, 'r')
4103                         batchurls = batchfd.readlines()
4104                         batchurls = [x.strip() for x in batchurls]
4105                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4106                 except IOError:
4107                         sys.exit(u'ERROR: batch file could not be read')
4108         all_urls = batchurls + args
4109
4110         # General configuration
4111         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4112         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4113         urllib2.install_opener(opener)
4114         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4115
4116         extractors = gen_extractors()
4117
4118         if opts.list_extractors:
4119                 for ie in extractors:
4120                         print(ie.IE_NAME)
4121                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4122                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4123                         for mu in matchedUrls:
4124                                 print(u'  ' + mu)
4125                 sys.exit(0)
4126
4127         # Conflicting, missing and erroneous options
4128         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4129                 parser.error(u'using .netrc conflicts with giving username/password')
4130         if opts.password is not None and opts.username is None:
4131                 parser.error(u'account username missing')
4132         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4133                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4134         if opts.usetitle and opts.useliteral:
4135                 parser.error(u'using title conflicts with using literal title')
4136         if opts.username is not None and opts.password is None:
4137                 opts.password = getpass.getpass(u'Type account password and press return:')
4138         if opts.ratelimit is not None:
4139                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4140                 if numeric_limit is None:
4141                         parser.error(u'invalid rate limit specified')
4142                 opts.ratelimit = numeric_limit
4143         if opts.retries is not None:
4144                 try:
4145                         opts.retries = long(opts.retries)
4146                 except (TypeError, ValueError), err:
4147                         parser.error(u'invalid retry count specified')
4148         try:
4149                 opts.playliststart = int(opts.playliststart)
4150                 if opts.playliststart <= 0:
4151                         raise ValueError(u'Playlist start must be positive')
4152         except (TypeError, ValueError), err:
4153                 parser.error(u'invalid playlist start number specified')
4154         try:
4155                 opts.playlistend = int(opts.playlistend)
4156                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4157                         raise ValueError(u'Playlist end must be greater than playlist start')
4158         except (TypeError, ValueError), err:
4159                 parser.error(u'invalid playlist end number specified')
4160         if opts.extractaudio:
4161                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4162                         parser.error(u'invalid audio format specified')
4163
4164         # File downloader
4165         fd = FileDownloader({
4166                 'usenetrc': opts.usenetrc,
4167                 'username': opts.username,
4168                 'password': opts.password,
4169                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4170                 'forceurl': opts.geturl,
4171                 'forcetitle': opts.gettitle,
4172                 'forcethumbnail': opts.getthumbnail,
4173                 'forcedescription': opts.getdescription,
4174                 'forcefilename': opts.getfilename,
4175                 'forceformat': opts.getformat,
4176                 'simulate': opts.simulate,
4177                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4178                 'format': opts.format,
4179                 'format_limit': opts.format_limit,
4180                 'listformats': opts.listformats,
4181                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4182                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4183                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4184                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4185                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4186                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4187                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4188                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4189                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4190                         or u'%(id)s.%(ext)s'),
4191                 'ignoreerrors': opts.ignoreerrors,
4192                 'ratelimit': opts.ratelimit,
4193                 'nooverwrites': opts.nooverwrites,
4194                 'retries': opts.retries,
4195                 'continuedl': opts.continue_dl,
4196                 'noprogress': opts.noprogress,
4197                 'playliststart': opts.playliststart,
4198                 'playlistend': opts.playlistend,
4199                 'logtostderr': opts.outtmpl == '-',
4200                 'consoletitle': opts.consoletitle,
4201                 'nopart': opts.nopart,
4202                 'updatetime': opts.updatetime,
4203                 'writedescription': opts.writedescription,
4204                 'writeinfojson': opts.writeinfojson,
4205                 'matchtitle': opts.matchtitle,
4206                 'rejecttitle': opts.rejecttitle,
4207                 })
4208         for extractor in extractors:
4209                 fd.add_info_extractor(extractor)
4210
4211         # PostProcessors
4212         if opts.extractaudio:
4213                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4214
4215         # Update version
4216         if opts.update_self:
4217                 updateSelf(fd, sys.argv[0])
4218
4219         # Maybe do nothing
4220         if len(all_urls) < 1:
4221                 if not opts.update_self:
4222                         parser.error(u'you must provide at least one URL')
4223                 else:
4224                         sys.exit()
4225         retcode = fd.download(all_urls)
4226
4227         # Dump cookie jar if requested
4228         if opts.cookiefile is not None:
4229                 try:
4230                         jar.save()
4231                 except (IOError, OSError), err:
4232                         sys.exit(u'ERROR: unable to save cookie jar')
4233
4234         sys.exit(retcode)
4235
4236
def _run_as_script():
        """Run main() and convert the failures handled here into a process exit.

        DownloadError ends the process with exit status 1. SameFileError and
        KeyboardInterrupt end it through sys.exit() with an error message.
        Exceptions other than these three are not caught here and propagate
        to the caller unchanged.
        """
        try:
                main()
        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                # The message starts with a newline, presumably so that it begins
                # on a fresh line after any partially written progress output.
                sys.exit(u'\nERROR: Interrupted by user')


# Only act when executed as a script, not when imported as a module.
if __name__ == '__main__':
        _run_as_script()
4246
4247 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: