2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # License: Public domain code
32 # parse_qs was moved from the cgi module to the urlparse module recently.
34 from urlparse import parse_qs
36 from cgi import parse_qs
39 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
40 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
41 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 'Accept-Encoding': 'gzip, deflate',
43 'Accept-Language': 'en-us,en;q=0.5',
# Characters permitted in "simplified" titles: ASCII letters and digits as a
# unicode string (Python 2 str.decode; in Python 3 this call would not exist).
46 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# NOTE(review): the baked-in line numbers below jump (48, 49, 51, 52, ...);
# this listing is sampled and several original lines are missing. Do not
# assume the visible statements are contiguous.
48 def preferredencoding():
49 """Get preferred encoding.
51 Returns the best encoding scheme for the system, based on
52 locale.getpreferredencoding() and some further tweaks.
# Inner generator that yields the chosen encoding; only its first value is
# consumed by the .next() call below.
54 def yield_preferredencoding():
56 pref = locale.getpreferredencoding()
# NOTE(review): the fallback branch (presumably defaulting to 'UTF-8' when
# the locale probe fails, per upstream youtube-dl) is not visible here —
# confirm against the full source.
62 return yield_preferredencoding().next()
# NOTE(review): sampled listing — lines are missing between the numbered
# statements (e.g. the `if mobj is not None:` guard and the numeric-base
# selection between lines 80 and 85 are not shown).
64 def htmlentity_transform(matchobj):
65 """Transforms an HTML entity to a Unicode character.
67 This function receives a match object and is intended to be used with
68 the re.sub() function.
70 entity = matchobj.group(1)
72 # Known non-numeric HTML entity
73 if entity in htmlentitydefs.name2codepoint:
74 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: decimal (#123) or hexadecimal (#x7b).
77 mobj = re.match(ur'(?u)#(x?\d+)', entity)
79 numstr = mobj.group(1)
80 if numstr.startswith(u'x'):
# Prefix with '0' so long(numstr, 16) accepts the '0x...' form.
82 numstr = u'0%s' % numstr
# NOTE(review): `base` (16 for hex, 10 otherwise) is assigned on lines not
# visible in this listing.
85 return unichr(long(numstr, base))
87 # Unknown entity in name, return its literal representation
88 return (u'&%s;' % entity)
90 def sanitize_title(utitle):
91 """Sanitizes a video title so it could be used as part of a filename."""
92 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
93 return utitle.replace(unicode(os.sep), u'%')
# NOTE(review): sampled listing — the `try:` opening the except on line 113,
# the `if filename == u'-':` guard and the `import msvcrt` (lines 104-108)
# are among the lines not visible here.
95 def sanitize_open(filename, open_mode):
96 """Try to open the given filename, and slightly tweak it if this fails.
98 Attempts to open the given filename. If this fails, it tries to change
99 the filename slightly, step by step, until it's either able to open it
100 or it fails and raises a final exception, like the standard open()
103 It returns the tuple (stream, definitive_file_name).
# '-' means stdout; on Windows the stream must be switched to binary mode.
107 if sys.platform == 'win32':
109 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
110 return (sys.stdout, filename)
111 stream = open(filename, open_mode)
112 return (stream, filename)
113 except (IOError, OSError), err:
114 # In case of error, try to remove win32 forbidden chars
115 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
117 # An exception here should be caught in the caller
118 stream = open(filename, open_mode)
119 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the epoch timestamp (a number) on success, or None when
    *timestr* cannot be parsed as an RFC 2822 date.
    """
    # Restored from the truncated listing: initialize the result so the
    # function returns None instead of raising NameError on parse failure.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    # Restored from the truncated listing: the docstring terminator and the
    # (empty) class body were missing.
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    # Restored from the truncated listing: the docstring terminator and the
    # (empty) class body were missing.
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    # Restored from the truncated listing: the docstring terminator and the
    # (empty) class body were missing.
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    # Restored from the truncated listing: the docstring terminator and the
    # (empty) class body were missing.
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both attributes are byte counts. Restored from the truncated listing:
    # the docstring terminator and these class-level defaults were missing.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
# NOTE(review): sampled listing — missing lines include the `deflate(data)`
# method header with its @staticmethod decorator (before line 198), the
# try/except that falls back between the two zlib.decompress calls, the
# `old_resp = resp` binding used in http_response, and the return statements
# of http_request/http_response.
177 class YoutubeDLHandler(urllib2.HTTPHandler):
178 """Handler for HTTP requests and responses.
180 This class, when installed with an OpenerDirector, automatically adds
181 the standard headers to every HTTP request and handles gzipped and
182 deflated responses from web servers. If compression is to be avoided in
183 a particular request, the original request in the program code only has
184 to include the HTTP header "Youtubedl-No-Compression", which will be
185 removed before making the real request.
187 Part of this code was copied from:
189 http://techknack.net/python-urllib2-handlers/
191 Andrew Rowls, the author of that code, agreed to release it to the
# Raw-deflate first (negative wbits = no zlib header), then plain zlib.
198 return zlib.decompress(data, -zlib.MAX_WBITS)
200 return zlib.decompress(data)
# Compatibility shim: older urllib2.addinfourl lacked the `code` argument.
203 def addinfourl_wrapper(stream, headers, url, code):
204 if hasattr(urllib2.addinfourl, 'getcode'):
205 return urllib2.addinfourl(stream, headers, url, code)
206 ret = urllib2.addinfourl(stream, headers, url)
# Add the standard headers unless the caller set them, and strip the
# internal no-compression marker before the request goes on the wire.
210 def http_request(self, req):
211 for h in std_headers:
214 req.add_header(h, std_headers[h])
215 if 'Youtubedl-no-compression' in req.headers:
216 if 'Accept-encoding' in req.headers:
217 del req.headers['Accept-encoding']
218 del req.headers['Youtubedl-no-compression']
# Transparently decompress gzip/deflate response bodies, preserving the
# original status message on the rewrapped response.
221 def http_response(self, req, resp):
224 if resp.headers.get('Content-encoding', '') == 'gzip':
225 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
226 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
227 resp.msg = old_resp.msg
229 if resp.headers.get('Content-encoding', '') == 'deflate':
230 gz = StringIO.StringIO(self.deflate(resp.read()))
231 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
232 resp.msg = old_resp.msg
# NOTE(review): this listing is sampled — the baked-in original line numbers
# jump (e.g. 286 -> 292, 312 -> 316), so the statements below are NOT
# contiguous. Missing lines include @staticmethod decorators, try:/return
# statements, loop headers and local initializations; comments below flag
# the gaps that matter for reading.
235 class FileDownloader(object):
236 """File Downloader class.
238 File downloader objects are the ones responsible of downloading the
239 actual video file and writing it to disk if the user has requested
240 it, among some other tasks. In most cases there should be one per
241 program. As, given a video URL, the downloader doesn't know how to
242 extract all the needed information, task that InfoExtractors do, it
243 has to pass the URL to one of them.
245 For this, file downloader objects have a method that allows
246 InfoExtractors to be registered in a given order. When it is passed
247 a URL, the file downloader handles it to the first InfoExtractor it
248 finds that reports being able to handle it. The InfoExtractor extracts
249 all the information about the video or videos the URL refers to, and
250 asks the FileDownloader to process the video information, possibly
251 downloading the video.
253 File downloaders accept a lot of parameters. In order not to saturate
254 the object constructor with arguments, it receives a dictionary of
255 options instead. These options are available through the params
256 attribute for the InfoExtractors to use. The FileDownloader also
257 registers itself as the downloader in charge for the InfoExtractors
258 that are added to it, so this is a "mutual registration".
262 username: Username for authentication purposes.
263 password: Password for authentication purposes.
264 usenetrc: Use netrc for authentication instead.
265 quiet: Do not print messages to stdout.
266 forceurl: Force printing final URL.
267 forcetitle: Force printing title.
268 forcethumbnail: Force printing thumbnail URL.
269 forcedescription: Force printing description.
270 forcefilename: Force printing final filename.
271 simulate: Do not download the video files.
272 format: Video format code.
273 format_limit: Highest quality format to try.
274 outtmpl: Template for output names.
275 ignoreerrors: Do not stop on download errors.
276 ratelimit: Download speed limit, in bytes/sec.
277 nooverwrites: Prevent overwriting files.
278 retries: Number of times to retry for HTTP error 5xx
279 continuedl: Try to continue downloads if possible.
280 noprogress: Do not print the progress bar.
281 playliststart: Playlist item to start at.
282 playlistend: Playlist item to end at.
283 logtostderr: Log messages to stderr instead of stdout.
284 consoletitle: Display progress in console window's titlebar.
285 nopart: Do not use temporary .part files.
286 updatetime: Use the Last-modified header to set output file timestamps.
# Class-level defaults; the per-instance values are set in __init__.
292 _download_retcode = None
293 _num_downloads = None
296 def __init__(self, params):
297 """Create a FileDownloader object with the given options."""
# NOTE(review): lines 298-299 (presumably the _ies/_pps list and params
# assignment) are not visible in this listing.
300 self._download_retcode = 0
301 self._num_downloads = 0
# Index a two-element list with a bool: False -> stdout, True -> stderr.
302 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# The @staticmethod decorator (original line 305) is not shown here.
306 def pmkdir(filename):
307 """Create directory components in filename. Similar to Unix "mkdir -p"."""
308 components = filename.split(os.sep)
309 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
310 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
311 for dir in aggregate:
312 if not os.path.exists(dir):
# Format a byte count as a human-readable string, e.g. '1.23M'; the guard
# branches for str input and zero bytes are only partially visible here.
316 def format_bytes(bytes):
319 if type(bytes) is str:
324 exponent = long(math.log(bytes, 1024.0))
325 suffix = 'bkMGTPEZY'[exponent]
326 converted = float(bytes) / float(1024**exponent)
327 return '%.2f%s' % (converted, suffix)
# Right-aligned percentage string; the unknown-length branch is not shown.
330 def calc_percent(byte_counter, data_len):
333 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# ETA string 'MM:SS'; the `dif` computation line is not visible here.
336 def calc_eta(start, now, total, current):
340 if current == 0 or dif < 0.001: # One millisecond
342 rate = float(current) / dif
343 eta = long((float(total) - float(current)) / rate)
344 (eta_mins, eta_secs) = divmod(eta, 60)
347 return '%02d:%02d' % (eta_mins, eta_secs)
# Average speed string; delegates the unit formatting to format_bytes.
350 def calc_speed(start, now, bytes):
352 if bytes == 0 or dif < 0.001: # One millisecond
353 return '%10s' % '---b/s'
354 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Choose the next read size from observed throughput, clamped between half
# the previous block and 4 MiB.
357 def best_block_size(elapsed_time, bytes):
358 new_min = max(bytes / 2.0, 1.0)
359 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
360 if elapsed_time < 0.001:
362 rate = bytes / elapsed_time
370 def parse_bytes(bytestr):
371 """Parse a string indicating a byte quantity into a long integer."""
372 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
# An empty suffix group matches 'b' via .lower() indexing -> multiplier 1.
375 number = float(matchobj.group(1))
376 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
377 return long(round(number * multiplier))
# Mutual registration: the downloader keeps the IE and the IE gets a back-
# reference to this downloader.
379 def add_info_extractor(self, ie):
380 """Add an InfoExtractor object to the end of the list."""
382 ie.set_downloader(self)
384 def add_post_processor(self, pp):
385 """Add a PostProcessor object to the end of the chain."""
387 pp.set_downloader(self)
# All console output funnels through here so 'quiet' mode and encoding
# errors are handled in a single place. NOTE(review): the `try:` matching
# the except on line 396 is not visible in this listing.
389 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
390 """Print message to stdout if not in quiet mode."""
392 if not self.params.get('quiet', False):
393 terminator = [u'\n', u''][skip_eol]
394 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
395 self._screen_file.flush()
396 except (UnicodeEncodeError), err:
397 if not ignore_encoding_errors:
400 def to_stderr(self, message):
401 """Print message to stderr."""
402 print >>sys.stderr, message.encode(preferredencoding())
404 def to_cons_title(self, message):
405 """Set console/terminal window title to message."""
406 if not self.params.get('consoletitle', False):
408 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
409 # c_wchar_p() might not be necessary if `message` is
410 # already of type unicode()
411 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
412 elif 'TERM' in os.environ:
413 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
# "Fixed" = the output template contains no %(field)s placeholders, so
# every download would map to the same file.
415 def fixed_template(self):
416 """Checks if the output template is fixed."""
417 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
419 def trouble(self, message=None):
420 """Determine action to take when a download problem appears.
422 Depending on if the downloader has been configured to ignore
423 download errors or not, this method may throw an exception or
424 not when errors are found, after printing the message.
426 if message is not None:
427 self.to_stderr(message)
428 if not self.params.get('ignoreerrors', False):
429 raise DownloadError(message)
430 self._download_retcode = 1
432 def slow_down(self, start_time, byte_counter):
433 """Sleep if the download speed is over the rate limit."""
434 rate_limit = self.params.get('ratelimit', None)
435 if rate_limit is None or byte_counter == 0:
438 elapsed = now - start_time
441 speed = float(byte_counter) / elapsed
442 if speed > rate_limit:
# Sleep just long enough that the average speed drops back to the limit.
443 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
# '.part' temporary-file naming helpers. '-' (stdout) and non-regular
# existing paths never get a .part suffix.
445 def temp_name(self, filename):
446 """Returns a temporary filename for the given filename."""
447 if self.params.get('nopart', False) or filename == u'-' or \
448 (os.path.exists(filename) and not os.path.isfile(filename)):
450 return filename + u'.part'
452 def undo_temp_name(self, filename):
453 if filename.endswith(u'.part'):
454 return filename[:-len(u'.part')]
# NOTE(review): the `try:` matching the except on line 462 is not visible.
457 def try_rename(self, old_filename, new_filename):
459 if old_filename == new_filename:
461 os.rename(old_filename, new_filename)
462 except (IOError, OSError), err:
463 self.trouble(u'ERROR: unable to rename file')
465 def try_utime(self, filename, last_modified_hdr):
466 """Try to set the last-modified time of the given file."""
467 if last_modified_hdr is None:
469 if not os.path.isfile(filename):
471 timestr = last_modified_hdr
474 filetime = timeconvert(timestr)
# Keep atime = now, set mtime = server-reported Last-modified.
478 os.utime(filename,(time.time(), filetime))
# --- user-facing progress/report helpers ---
482 def report_destination(self, filename):
483 """Report destination filename."""
484 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
486 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
487 """Report download progress."""
488 if self.params.get('noprogress', False):
490 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
491 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
492 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
493 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
495 def report_resuming_byte(self, resume_len):
496 """Report attempt to resume at given byte."""
497 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
499 def report_retry(self, count, retries):
500 """Report retry in case of HTTP error 5xx"""
501 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
# NOTE(review): the `try:` matching the except on line 507 is not visible.
503 def report_file_already_downloaded(self, file_name):
504 """Report file has already been fully downloaded."""
506 self.to_screen(u'[download] %s has already been downloaded' % file_name)
507 except (UnicodeEncodeError), err:
508 self.to_screen(u'[download] The file has already been downloaded')
510 def report_unable_to_resume(self):
511 """Report it was impossible to resume download."""
512 self.to_screen(u'[download] Unable to resume')
514 def report_finish(self):
515 """Report download finished."""
516 if self.params.get('noprogress', False):
517 self.to_screen(u'[download] Download completed')
521 def increment_downloads(self):
522 """Increment the ordinal that assigns a number to each file."""
523 self._num_downloads += 1
# Expand the outtmpl template; NOTE(review): the `try:` matching the except
# on line 533 and the success-path return are not visible here.
525 def prepare_filename(self, info_dict):
526 """Generate the output filename."""
528 template_dict = dict(info_dict)
529 template_dict['epoch'] = unicode(long(time.time()))
530 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
531 filename = self.params['outtmpl'] % template_dict
533 except (ValueError, KeyError), err:
534 self.trouble(u'ERROR: invalid system charset or erroneous output template')
# Main per-video entry point, called with one InfoExtractor result dict.
# NOTE(review): several try:/return lines are missing between the numbered
# statements below (e.g. around lines 553-557, 560-562, 566-568).
537 def process_info(self, info_dict):
538 """Process a single dictionary returned by an InfoExtractor."""
539 filename = self.prepare_filename(info_dict)
540 # Do nothing else if in simulate mode
541 if self.params.get('simulate', False):
543 if self.params.get('forcetitle', False):
544 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
545 if self.params.get('forceurl', False):
546 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
547 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
548 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
549 if self.params.get('forcedescription', False) and 'description' in info_dict:
550 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
551 if self.params.get('forcefilename', False) and filename is not None:
552 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
558 if self.params.get('nooverwrites', False) and os.path.exists(filename):
559 self.to_stderr(u'WARNING: file exists and will be skipped')
563 self.pmkdir(filename)
564 except (OSError, IOError), err:
565 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
569 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
570 except (OSError, IOError), err:
571 raise UnavailableVideoError
572 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
573 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
575 except (ContentTooShortError, ), err:
576 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
581 self.post_process(filename, info_dict)
582 except (PostProcessingError), err:
583 self.trouble(u'ERROR: postprocessing: %s' % str(err))
# NOTE(review): the `for url in url_list:` / `for ie in self._ies:` loop
# headers are among the lines missing below.
586 def download(self, url_list):
587 """Download a given list of URLs."""
588 if len(url_list) > 1 and self.fixed_template():
589 raise SameFileError(self.params['outtmpl'])
592 suitable_found = False
594 # Go to next InfoExtractor if not suitable
595 if not ie.suitable(url):
598 # Suitable InfoExtractor found
599 suitable_found = True
601 # Extract information from URL and process it
604 # Suitable InfoExtractor had been found; go to next URL
607 if not suitable_found:
608 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
610 return self._download_retcode
# NOTE(review): the `info = dict(ie_info)` copy and the loop over self._pps
# are not visible here.
612 def post_process(self, filename, ie_info):
613 """Run the postprocessing chain on the given file."""
615 info['filepath'] = filename
# RTMP download path delegated to the external rtmpdump binary.
621 def _download_with_rtmpdump(self, filename, url, player_url):
622 self.report_destination(filename)
623 tmpfilename = self.temp_name(filename)
625 # Check for rtmpdump first
627 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
628 except (OSError, IOError):
629 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
632 # Download using rtmpdump. rtmpdump returns exit code 2 when
633 # the connection was interrumpted and resuming appears to be
634 # possible. This is part of rtmpdump's normal usage, AFAIK.
635 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
636 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
637 while retval == 2 or retval == 1:
638 prevsize = os.path.getsize(tmpfilename)
639 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
640 time.sleep(5.0) # This seems to be needed
641 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
642 cursize = os.path.getsize(tmpfilename)
643 if prevsize == cursize and retval == 1:
646 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
647 self.try_rename(tmpfilename, filename)
650 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
# Plain-HTTP download with resume (Range header) and 5xx retry support.
# NOTE(review): missing lines include the `count = 0` initialization, the
# `open_mode` selection, the before/after timestamps around the read, and
# several try:/break/return statements.
653 def _do_download(self, filename, url, player_url):
654 # Check file already present
655 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
656 self.report_file_already_downloaded(filename)
659 # Attempt to download using rtmpdump
660 if url.startswith('rtmp'):
661 return self._download_with_rtmpdump(filename, url, player_url)
663 tmpfilename = self.temp_name(filename)
667 # Do not include the Accept-Encoding header
668 headers = {'Youtubedl-no-compression': 'True'}
669 basic_request = urllib2.Request(url, None, headers)
670 request = urllib2.Request(url, None, headers)
672 # Establish possible resume length
673 if os.path.isfile(tmpfilename):
674 resume_len = os.path.getsize(tmpfilename)
678 # Request parameters in case of being able to resume
679 if self.params.get('continuedl', False) and resume_len != 0:
680 self.report_resuming_byte(resume_len)
681 request.add_header('Range','bytes=%d-' % resume_len)
685 retries = self.params.get('retries', 0)
686 while count <= retries:
687 # Establish connection
689 data = urllib2.urlopen(request)
691 except (urllib2.HTTPError, ), err:
692 if (err.code < 500 or err.code >= 600) and err.code != 416:
693 # Unexpected HTTP error
695 elif err.code == 416:
696 # Unable to resume (requested range not satisfiable)
698 # Open the connection again without the range header
699 data = urllib2.urlopen(basic_request)
700 content_length = data.info()['Content-Length']
701 except (urllib2.HTTPError, ), err:
702 if err.code < 500 or err.code >= 600:
705 # Examine the reported length
706 if (content_length is not None and
707 (resume_len - 100 < long(content_length) < resume_len + 100)):
708 # The file had already been fully downloaded.
709 # Explanation to the above condition: in issue #175 it was revealed that
710 # YouTube sometimes adds or removes a few bytes from the end of the file,
711 # changing the file size slightly and causing problems for some users. So
712 # I decided to implement a suggested change and consider the file
713 # completely downloaded if the file size differs less than 100 bytes from
714 # the one in the hard drive.
715 self.report_file_already_downloaded(filename)
716 self.try_rename(tmpfilename, filename)
719 # The length does not match, we start the download over
720 self.report_unable_to_resume()
726 self.report_retry(count, retries)
729 self.trouble(u'ERROR: giving up after %s retries' % retries)
732 data_len = data.info().get('Content-length', None)
733 if data_len is not None:
734 data_len = long(data_len) + resume_len
735 data_len_str = self.format_bytes(data_len)
736 byte_counter = 0 + resume_len
742 data_block = data.read(block_size)
744 if len(data_block) == 0:
746 byte_counter += len(data_block)
748 # Open file just in time
751 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
752 filename = self.undo_temp_name(tmpfilename)
753 self.report_destination(filename)
754 except (OSError, IOError), err:
755 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
758 stream.write(data_block)
759 except (IOError, OSError), err:
760 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
762 block_size = self.best_block_size(after - before, len(data_block))
765 percent_str = self.calc_percent(byte_counter, data_len)
766 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
767 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
768 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
771 self.slow_down(start, byte_counter - resume_len)
775 if data_len is not None and byte_counter != data_len:
776 raise ContentTooShortError(byte_counter, long(data_len))
777 self.try_rename(tmpfilename, filename)
779 # Update file modification time
780 if self.params.get('updatetime', True):
781 self.try_utime(filename, data.info().get('last-modified', None))
# NOTE(review): sampled listing — missing lines include the `suitable(url)`
# method header with its @staticmethod decorator (only its docstring on line
# 830 is visible), the `_ready` flag handling in __init__/initialize, and
# the `pass` bodies of the _real_* stubs.
785 class InfoExtractor(object):
786 """Information Extractor class.
788 Information extractors are the classes that, given a URL, extract
789 information from the video (or videos) the URL refers to. This
790 information includes the real video URL, the video title and simplified
791 title, author and others. The information is stored in a dictionary
792 which is then passed to the FileDownloader. The FileDownloader
793 processes this information possibly downloading the video to the file
794 system, among other possible outcomes. The dictionaries must include
795 the following fields:
797 id: Video identifier.
798 url: Final video URL.
799 uploader: Nickname of the video uploader.
800 title: Literal title.
801 stitle: Simplified title.
802 ext: Video filename extension.
803 format: Video format.
804 player_url: SWF Player URL (may be None).
806 The following fields are optional. Their primary purpose is to allow
807 youtube-dl to serve as the backend for a video search function, such
808 as the one in youtube2mp3. They are only used when their respective
809 forced printing functions are called:
811 thumbnail: Full URL to a video thumbnail image.
812 description: One-line video description.
814 Subclasses of this one should re-define the _real_initialize() and
815 _real_extract() methods, as well as the suitable() static method.
816 Probably, they should also be instantiated and added to the main
823 def __init__(self, downloader=None):
824 """Constructor. Receives an optional downloader."""
826 self.set_downloader(downloader)
# Static predicate (header not visible here): can this IE handle the URL?
830 """Receives a URL and returns True if suitable for this IE."""
833 def initialize(self):
834 """Initializes an instance (authentication, etc)."""
836 self._real_initialize()
839 def extract(self, url):
840 """Extracts URL information and returns it in list of dicts."""
842 return self._real_extract(url)
# Back-reference half of the "mutual registration" with FileDownloader.
844 def set_downloader(self, downloader):
845 """Sets the downloader for this IE."""
846 self._downloader = downloader
# Template-method hooks for subclasses.
848 def _real_initialize(self):
849 """Real initialization process. Redefine in subclasses."""
852 def _real_extract(self, url):
853 """Real extraction process. Redefine in subclasses."""
856 class YoutubeIE(InfoExtractor):
857 """Information extractor for youtube.com."""
859 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
860 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
861 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
862 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
863 _NETRC_MACHINE = 'youtube'
864 # Listed in order of quality
865 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
866 _video_extensions = {
872 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
879 return (re.match(YoutubeIE._VALID_URL, url) is not None)
881 def report_lang(self):
882 """Report attempt to set language."""
883 self._downloader.to_screen(u'[youtube] Setting language')
885 def report_login(self):
886 """Report attempt to log in."""
887 self._downloader.to_screen(u'[youtube] Logging in')
889 def report_age_confirmation(self):
890 """Report attempt to confirm age."""
891 self._downloader.to_screen(u'[youtube] Confirming age')
893 def report_video_webpage_download(self, video_id):
894 """Report attempt to download video webpage."""
895 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
897 def report_video_info_webpage_download(self, video_id):
898 """Report attempt to download video info webpage."""
899 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
901 def report_information_extraction(self, video_id):
902 """Report attempt to extract video information."""
903 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
905 def report_unavailable_format(self, video_id, format):
906 """Report extracted video URL."""
907 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
909 def report_rtmp_download(self):
910 """Indicate the download will use the RTMP protocol."""
911 self._downloader.to_screen(u'[youtube] RTMP download detected')
913 def _real_initialize(self):
914 if self._downloader is None:
919 downloader_params = self._downloader.params
921 # Attempt to use provided username and password or .netrc data
922 if downloader_params.get('username', None) is not None:
923 username = downloader_params['username']
924 password = downloader_params['password']
925 elif downloader_params.get('usenetrc', False):
927 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
932 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
933 except (IOError, netrc.NetrcParseError), err:
934 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
938 request = urllib2.Request(self._LANG_URL)
941 urllib2.urlopen(request).read()
942 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
943 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
946 # No authentication to be performed
952 'current_form': 'loginForm',
954 'action_login': 'Log In',
955 'username': username,
956 'password': password,
958 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
961 login_results = urllib2.urlopen(request).read()
962 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
963 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
965 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
966 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
972 'action_confirm': 'Confirm',
974 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
976 self.report_age_confirmation()
977 age_results = urllib2.urlopen(request).read()
978 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
979 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
982 def _real_extract(self, url):
983 # Extract video id from URL
984 mobj = re.match(self._VALID_URL, url)
986 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
988 video_id = mobj.group(2)
991 self.report_video_webpage_download(video_id)
992 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
994 video_webpage = urllib2.urlopen(request).read()
995 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
996 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
999 # Attempt to extract SWF player URL
1000 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1001 if mobj is not None:
1002 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1007 self.report_video_info_webpage_download(video_id)
1008 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1009 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1010 % (video_id, el_type))
1011 request = urllib2.Request(video_info_url)
1013 video_info_webpage = urllib2.urlopen(request).read()
1014 video_info = parse_qs(video_info_webpage)
1015 if 'token' in video_info:
1017 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1018 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1020 if 'token' not in video_info:
1021 if 'reason' in video_info:
1022 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1024 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1027 # Start extracting information
1028 self.report_information_extraction(video_id)
1031 if 'author' not in video_info:
1032 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1034 video_uploader = urllib.unquote_plus(video_info['author'][0])
1037 if 'title' not in video_info:
1038 self._downloader.trouble(u'ERROR: unable to extract video title')
1040 video_title = urllib.unquote_plus(video_info['title'][0])
1041 video_title = video_title.decode('utf-8')
1042 video_title = sanitize_title(video_title)
1045 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1046 simple_title = simple_title.strip(ur'_')
1049 if 'thumbnail_url' not in video_info:
1050 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1051 video_thumbnail = ''
1052 else: # don't panic if we can't find it
1053 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1057 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
1058 if mobj is not None:
1059 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1060 format_expressions = ['%d %B %Y', '%B %d %Y']
1061 for expression in format_expressions:
1063 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1068 video_description = 'No description available.'
1069 if self._downloader.params.get('forcedescription', False):
1070 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1071 if mobj is not None:
1072 video_description = mobj.group(1)
1075 video_token = urllib.unquote_plus(video_info['token'][0])
1077 # Decide which formats to download
1078 req_format = self._downloader.params.get('format', None)
1080 if 'fmt_url_map' in video_info:
1081 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1082 format_limit = self._downloader.params.get('format_limit', None)
1083 if format_limit is not None and format_limit in self._available_formats:
1084 format_list = self._available_formats[self._available_formats.index(format_limit):]
1086 format_list = self._available_formats
1087 existing_formats = [x for x in format_list if x in url_map]
1088 if len(existing_formats) == 0:
1089 self._downloader.trouble(u'ERROR: no known formats available for video')
1091 if req_format is None:
1092 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1093 elif req_format == '-1':
1094 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1097 if req_format not in url_map:
1098 self._downloader.trouble(u'ERROR: requested format not available')
1100 video_url_list = [(req_format, url_map[req_format])] # Specific format
1102 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1103 self.report_rtmp_download()
1104 video_url_list = [(None, video_info['conn'][0])]
1107 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1110 for format_param, video_real_url in video_url_list:
1111 # At this point we have a new video
1112 self._downloader.increment_downloads()
1115 video_extension = self._video_extensions.get(format_param, 'flv')
1117 # Find the video URL in fmt_url_map or conn paramters
1119 # Process video information
1120 self._downloader.process_info({
1121 'id': video_id.decode('utf-8'),
1122 'url': video_real_url.decode('utf-8'),
1123 'uploader': video_uploader.decode('utf-8'),
1124 'upload_date': upload_date,
1125 'title': video_title,
1126 'stitle': simple_title,
1127 'ext': video_extension.decode('utf-8'),
1128 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1129 'thumbnail': video_thumbnail.decode('utf-8'),
1130 'description': video_description.decode('utf-8'),
1131 'player_url': player_url,
1133 except UnavailableVideoError, err:
1134 self._downloader.trouble(u'\nERROR: unable to download video')
1137 class MetacafeIE(InfoExtractor):
1138 """Information Extractor for metacafe.com."""
# NOTE(review): gap-ridden numbered listing — guard lines ('try:', 'if mobj is None:',
# 'return') are omitted between visible statements; do not infer control flow from
# adjacency. Python 2 idioms (urllib2, str/unicode .decode) throughout.
1140 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1141 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1142 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# Keeps a YoutubeIE instance so 'yt-<id>' Metacafe entries can be delegated to it.
1145 def __init__(self, youtube_ie, downloader=None):
1146 InfoExtractor.__init__(self, downloader)
1147 self._youtube_ie = youtube_ie
1151 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1153 def report_disclaimer(self):
1154 """Report disclaimer retrieval."""
1155 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1157 def report_age_confirmation(self):
1158 """Report attempt to confirm age."""
1159 self._downloader.to_screen(u'[metacafe] Confirming age')
1161 def report_download_webpage(self, video_id):
1162 """Report webpage download."""
1163 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1165 def report_extraction(self, video_id):
1166 """Report information extraction."""
1167 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# _real_initialize: fetch the family-filter disclaimer page, then POST the
# age-confirmation form so subsequent watch-page fetches are unfiltered.
1169 def _real_initialize(self):
1170 # Retrieve disclaimer
1171 request = urllib2.Request(self._DISCLAIMER)
1173 self.report_disclaimer()
1174 disclaimer = urllib2.urlopen(request).read()
1175 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1176 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# The rest of the disclaimer_form dict literal is omitted by the listing gaps.
1182 'submit': "Continue - I'm over 18",
1184 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1186 self.report_age_confirmation()
1187 disclaimer = urllib2.urlopen(request).read()
1188 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1189 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# _real_extract: delegates 'yt-<id>' videos to the YouTube extractor; otherwise
# scrapes the watch page for mediaURL (plus optional gdaKey), falling back to the
# flashvars 'mediaData' blob when no direct mediaURL is present.
1192 def _real_extract(self, url):
1193 # Extract id and simplified title from URL
1194 mobj = re.match(self._VALID_URL, url)
1196 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1199 video_id = mobj.group(1)
1201 # Check if video comes from YouTube
1202 mobj2 = re.match(r'^yt-(.*)$', video_id)
1203 if mobj2 is not None:
1204 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1207 # At this point we have a new video
1208 self._downloader.increment_downloads()
# Second URL path component doubles as the human-readable "simple title".
1210 simple_title = mobj.group(2).decode('utf-8')
1212 # Retrieve video webpage to extract further information
1213 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1215 self.report_download_webpage(video_id)
1216 webpage = urllib2.urlopen(request).read()
1217 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1218 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1221 # Extract URL, uploader and title from webpage
1222 self.report_extraction(video_id)
1223 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1224 if mobj is not None:
1225 mediaURL = urllib.unquote(mobj.group(1))
# Extension is guessed from the URL's last three characters.
1226 video_extension = mediaURL[-3:]
1228 # Extract gdaKey if available
1229 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1231 video_url = mediaURL
1233 gdaKey = mobj.group(1)
1234 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: pull the media URL/key pair out of the flashvars query string.
1236 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1238 self._downloader.trouble(u'ERROR: unable to extract media URL')
1240 vardict = parse_qs(mobj.group(1))
1241 if 'mediaData' not in vardict:
1242 self._downloader.trouble(u'ERROR: unable to extract media URL')
1244 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1246 self._downloader.trouble(u'ERROR: unable to extract media URL')
1248 mediaURL = mobj.group(1).replace('\\/', '/')
1249 video_extension = mediaURL[-3:]
1250 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1252 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1254 self._downloader.trouble(u'ERROR: unable to extract title')
1256 video_title = mobj.group(1).decode('utf-8')
1257 video_title = sanitize_title(video_title)
1259 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1261 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1263 video_uploader = mobj.group(1)
1266 # Process video information
1267 self._downloader.process_info({
1268 'id': video_id.decode('utf-8'),
1269 'url': video_url.decode('utf-8'),
1270 'uploader': video_uploader.decode('utf-8'),
1271 'upload_date': u'NA',
1272 'title': video_title,
1273 'stitle': simple_title,
1274 'ext': video_extension.decode('utf-8'),
1278 except UnavailableVideoError:
1279 self._downloader.trouble(u'\nERROR: unable to download video')
1282 class DailymotionIE(InfoExtractor):
1283 """Information Extractor for Dailymotion"""
# NOTE(review): gap-ridden numbered listing — 'try:'/'if mobj is None:'/'return'
# guard lines are omitted between visible statements; Python 2 source.
1285 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1287 def __init__(self, downloader=None):
1288 InfoExtractor.__init__(self, downloader)
1292 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1294 def report_download_webpage(self, video_id):
1295 """Report webpage download."""
1296 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1298 def report_extraction(self, video_id):
1299 """Report information extraction."""
1300 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1302 def _real_initialize(self):
# _real_extract: scrapes the addVariable("video", ...) flash parameter for the
# media URL and the '<title>Dailymotion - ...' tag for the title.
1305 def _real_extract(self, url):
1306 # Extract id and simplified title from URL
1307 mobj = re.match(self._VALID_URL, url)
1309 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1312 # At this point we have a new video
1313 self._downloader.increment_downloads()
1314 video_id = mobj.group(1)
# URL slug after the underscore is used as the simple title.
1316 simple_title = mobj.group(2).decode('utf-8')
1317 video_extension = 'flv'
1319 # Retrieve video webpage to extract further information
1320 request = urllib2.Request(url)
1322 self.report_download_webpage(video_id)
1323 webpage = urllib2.urlopen(request).read()
1324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1325 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1328 # Extract URL, uploader and title from webpage
1329 self.report_extraction(video_id)
1330 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1332 self._downloader.trouble(u'ERROR: unable to extract media URL')
1334 mediaURL = urllib.unquote(mobj.group(1))
1336 # if needed add http://www.dailymotion.com/ if relative URL
1338 video_url = mediaURL
1340 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1341 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1343 self._downloader.trouble(u'ERROR: unable to extract title')
1345 video_title = mobj.group(1).decode('utf-8')
1346 video_title = sanitize_title(video_title)
1348 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1350 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1352 video_uploader = mobj.group(1)
1355 # Process video information
1356 self._downloader.process_info({
1357 'id': video_id.decode('utf-8'),
1358 'url': video_url.decode('utf-8'),
1359 'uploader': video_uploader.decode('utf-8'),
1360 'upload_date': u'NA',
1361 'title': video_title,
1362 'stitle': simple_title,
1363 'ext': video_extension.decode('utf-8'),
1367 except UnavailableVideoError:
1368 self._downloader.trouble(u'\nERROR: unable to download video')
1370 class GoogleIE(InfoExtractor):
1371 """Information extractor for video.google.com."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1373 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1375 def __init__(self, downloader=None):
1376 InfoExtractor.__init__(self, downloader)
1380 return (re.match(GoogleIE._VALID_URL, url) is not None)
1382 def report_download_webpage(self, video_id):
1383 """Report webpage download."""
1384 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1386 def report_extraction(self, video_id):
1387 """Report information extraction."""
1388 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1390 def _real_initialize(self):
1393 def _real_extract(self, url):
1394 # Extract id from URL
1395 mobj = re.match(self._VALID_URL, url)
1397 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1400 # At this point we have a new video
1401 self._downloader.increment_downloads()
1402 video_id = mobj.group(1)
1404 video_extension = 'mp4'
1406 # Retrieve video webpage to extract further information
1407 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1409 self.report_download_webpage(video_id)
1410 webpage = urllib2.urlopen(request).read()
1411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1415 # Extract URL, uploader, and title from webpage
1416 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the escaped flv videoUrl
# (\x3d / \x26 sequences are un-escaped below).
1417 mobj = re.search(r"download_url:'([^']+)'", webpage)
1419 video_extension = 'flv'
1420 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1422 self._downloader.trouble(u'ERROR: unable to extract media URL')
1424 mediaURL = urllib.unquote(mobj.group(1))
1425 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1426 mediaURL = mediaURL.replace('\\x26', '\x26')
1428 video_url = mediaURL
1430 mobj = re.search(r'<title>(.*)</title>', webpage)
1432 self._downloader.trouble(u'ERROR: unable to extract title')
1434 video_title = mobj.group(1).decode('utf-8')
1435 video_title = sanitize_title(video_title)
1436 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1438 # Extract video description
1439 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1441 self._downloader.trouble(u'ERROR: unable to extract video description')
1443 video_description = mobj.group(1).decode('utf-8')
1444 if not video_description:
1445 video_description = 'No description available.'
1447 # Extract video thumbnail
# Thumbnail requires a second (search-results) page fetch, so it is only done on demand.
1448 if self._downloader.params.get('forcethumbnail', False):
1449 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1451 webpage = urllib2.urlopen(request).read()
1452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1455 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1457 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1459 video_thumbnail = mobj.group(1)
1460 else: # we need something to pass to process_info
1461 video_thumbnail = ''
1465 # Process video information
# NOTE(review): no 'uploader' key is visible in this dict literal — presumably
# dropped by the listing gaps; confirm against the full source.
1466 self._downloader.process_info({
1467 'id': video_id.decode('utf-8'),
1468 'url': video_url.decode('utf-8'),
1470 'upload_date': u'NA',
1471 'title': video_title,
1472 'stitle': simple_title,
1473 'ext': video_extension.decode('utf-8'),
1477 except UnavailableVideoError:
1478 self._downloader.trouble(u'\nERROR: unable to download video')
1481 class PhotobucketIE(InfoExtractor):
1482 """Information extractor for photobucket.com."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1484 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1486 def __init__(self, downloader=None):
1487 InfoExtractor.__init__(self, downloader)
1491 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1493 def report_download_webpage(self, video_id):
1494 """Report webpage download."""
1495 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1497 def report_extraction(self, video_id):
1498 """Report information extraction."""
1499 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1501 def _real_initialize(self):
1504 def _real_extract(self, url):
1505 # Extract id from URL
1506 mobj = re.match(self._VALID_URL, url)
1508 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1511 # At this point we have a new video
1512 self._downloader.increment_downloads()
# The 'current=...flv' query value serves as the video id.
1513 video_id = mobj.group(1)
1515 video_extension = 'flv'
1517 # Retrieve video webpage to extract further information
1518 request = urllib2.Request(url)
1520 self.report_download_webpage(video_id)
1521 webpage = urllib2.urlopen(request).read()
1522 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1523 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1526 # Extract URL, uploader, and title from webpage
1527 self.report_extraction(video_id)
1528 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1530 self._downloader.trouble(u'ERROR: unable to extract media URL')
1532 mediaURL = urllib.unquote(mobj.group(1))
1534 video_url = mediaURL
# Title regex doubles as uploader source: group(1) = title, group(2) = uploader.
1536 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1538 self._downloader.trouble(u'ERROR: unable to extract title')
1540 video_title = mobj.group(1).decode('utf-8')
1541 video_title = sanitize_title(video_title)
1542 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1544 video_uploader = mobj.group(2).decode('utf-8')
1547 # Process video information
1548 self._downloader.process_info({
1549 'id': video_id.decode('utf-8'),
1550 'url': video_url.decode('utf-8'),
1551 'uploader': video_uploader,
1552 'upload_date': u'NA',
1553 'title': video_title,
1554 'stitle': simple_title,
1555 'ext': video_extension.decode('utf-8'),
1559 except UnavailableVideoError:
1560 self._downloader.trouble(u'\nERROR: unable to download video')
1563 class YahooIE(InfoExtractor):
1564 """Information extractor for video.yahoo.com."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1566 # _VALID_URL matches all Yahoo! Video URLs
1567 # _VPAGE_URL matches only the extractable '/watch/' URLs
1568 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1569 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1571 def __init__(self, downloader=None):
1572 InfoExtractor.__init__(self, downloader)
1576 return (re.match(YahooIE._VALID_URL, url) is not None)
1578 def report_download_webpage(self, video_id):
1579 """Report webpage download."""
1580 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1582 def report_extraction(self, video_id):
1583 """Report information extraction."""
1584 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1586 def _real_initialize(self):
# _real_extract: non-'/watch/' URLs are first rewritten to the canonical watch URL
# (one recursive call with new_video=False), then the watch page plus the
# getPlaylistFOP playlist XML are scraped for metadata and the media URL.
1589 def _real_extract(self, url, new_video=True):
1590 # Extract ID from URL
1591 mobj = re.match(self._VALID_URL, url)
1593 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1596 # At this point we have a new video
1597 self._downloader.increment_downloads()
1598 video_id = mobj.group(2)
1599 video_extension = 'flv'
1601 # Rewrite valid but non-extractable URLs as
1602 # extractable English language /watch/ URLs
1603 if re.match(self._VPAGE_URL, url) is None:
1604 request = urllib2.Request(url)
1606 webpage = urllib2.urlopen(request).read()
1607 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1608 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1611 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1613 self._downloader.trouble(u'ERROR: Unable to extract id field')
1615 yahoo_id = mobj.group(1)
1617 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1619 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1621 yahoo_vid = mobj.group(1)
1623 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1624 return self._real_extract(url, new_video=False)
1626 # Retrieve video webpage to extract further information
1627 request = urllib2.Request(url)
1629 self.report_download_webpage(video_id)
1630 webpage = urllib2.urlopen(request).read()
1631 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1632 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1635 # Extract uploader and title from webpage
1636 self.report_extraction(video_id)
1637 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1639 self._downloader.trouble(u'ERROR: unable to extract video title')
1641 video_title = mobj.group(1).decode('utf-8')
1642 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# NOTE(review): group(1) of this regex captures '(people|profile)', while the
# display name is group(2) — looks like a latent bug; confirm against live pages
# before changing (doc-only pass).
1644 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1646 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1648 video_uploader = mobj.group(1).decode('utf-8')
1650 # Extract video thumbnail
1651 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1653 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1655 video_thumbnail = mobj.group(1).decode('utf-8')
1657 # Extract video description
1658 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1660 self._downloader.trouble(u'ERROR: unable to extract video description')
1662 video_description = mobj.group(1).decode('utf-8')
1663 if not video_description: video_description = 'No description available.'
1665 # Extract video height and width
1666 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1668 self._downloader.trouble(u'ERROR: unable to extract video height')
1670 yv_video_height = mobj.group(1)
1672 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1674 self._downloader.trouble(u'ERROR: unable to extract video width')
1676 yv_video_width = mobj.group(1)
1678 # Retrieve video playlist to extract media URL
1679 # I'm not completely sure what all these options are, but we
1680 # seem to need most of them, otherwise the server sends a 401.
1681 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1682 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1683 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1684 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1685 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1687 self.report_download_webpage(video_id)
1688 webpage = urllib2.urlopen(request).read()
1689 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1690 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1693 # Extract media URL from playlist XML
1694 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1696 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1698 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1699 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1702 # Process video information
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict literal;
# the later entries silently overwrite the earlier ones.
1703 self._downloader.process_info({
1704 'id': video_id.decode('utf-8'),
1706 'uploader': video_uploader,
1707 'upload_date': u'NA',
1708 'title': video_title,
1709 'stitle': simple_title,
1710 'ext': video_extension.decode('utf-8'),
1711 'thumbnail': video_thumbnail.decode('utf-8'),
1712 'description': video_description,
1713 'thumbnail': video_thumbnail,
1714 'description': video_description,
1717 except UnavailableVideoError:
1718 self._downloader.trouble(u'\nERROR: unable to download video')
1721 class VimeoIE(InfoExtractor):
1722 """Information extractor for vimeo.com."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1724 # _VALID_URL matches Vimeo URLs
1725 _VALID_URL = r'(?:http://)?vimeo\.com/([0-9]+)'
1727 def __init__(self, downloader=None):
1728 InfoExtractor.__init__(self, downloader)
1732 return (re.match(VimeoIE._VALID_URL, url) is not None)
1734 def report_download_webpage(self, video_id):
1735 """Report webpage download."""
1736 self._downloader.to_screen(u'[video.vimeo] %s: Downloading webpage' % video_id)
1738 def report_extraction(self, video_id):
1739 """Report information extraction."""
1740 self._downloader.to_screen(u'[video.vimeo] %s: Extracting information' % video_id)
1742 def _real_initialize(self):
# _real_extract: scrapes the moogaloop clip-config XML for metadata plus a
# signature/expiry pair, then assembles the moogaloop 'play' URL from them.
1745 def _real_extract(self, url, new_video=True):
1746 # Extract ID from URL
1747 mobj = re.match(self._VALID_URL, url)
1749 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1752 # At this point we have a new video
1753 self._downloader.increment_downloads()
1754 video_id = mobj.group(1)
1755 video_extension = 'flv' # FIXME
1757 # Retrieve video webpage to extract further information
1758 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1760 self.report_download_webpage(video_id)
1761 webpage = urllib2.urlopen(request).read()
1762 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1763 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1766 # Extract uploader and title from webpage
1767 self.report_extraction(video_id)
1768 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1770 self._downloader.trouble(u'ERROR: unable to extract video title')
1772 video_title = mobj.group(1).decode('utf-8')
1773 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1775 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1777 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1779 video_uploader = mobj.group(1).decode('utf-8')
1781 # Extract video thumbnail
1782 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1784 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1786 video_thumbnail = mobj.group(1).decode('utf-8')
# NOTE(review): description extraction is commented out; the hard-coded
# placeholder 'Foo.' below is what actually gets reported.
1788 # # Extract video description
1789 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1791 # self._downloader.trouble(u'ERROR: unable to extract video description')
1793 # video_description = mobj.group(1).decode('utf-8')
1794 # if not video_description: video_description = 'No description available.'
1795 video_description = 'Foo.'
1797 # Extract request signature
1798 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1800 self._downloader.trouble(u'ERROR: unable to extract request signature')
1802 sig = mobj.group(1).decode('utf-8')
1804 # Extract request signature expiration
1805 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
1807 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
1809 sig_exp = mobj.group(1).decode('utf-8')
1811 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
1814 # Process video information
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict literal;
# the later entries silently overwrite the earlier ones.
1815 self._downloader.process_info({
1816 'id': video_id.decode('utf-8'),
1818 'uploader': video_uploader,
1819 'upload_date': u'NA',
1820 'title': video_title,
1821 'stitle': simple_title,
1822 'ext': video_extension.decode('utf-8'),
1823 'thumbnail': video_thumbnail.decode('utf-8'),
1824 'description': video_description,
1825 'thumbnail': video_thumbnail,
1826 'description': video_description,
1829 except UnavailableVideoError:
1830 self._downloader.trouble(u'ERROR: unable to download video')
1833 class GenericIE(InfoExtractor):
1834 """Generic last-resort information extractor."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1836 def __init__(self, downloader=None):
1837 InfoExtractor.__init__(self, downloader)
1843 def report_download_webpage(self, video_id):
1844 """Report webpage download."""
1845 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1846 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1848 def report_extraction(self, video_id):
1849 """Report information extraction."""
1850 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1852 def _real_initialize(self):
# _real_extract: last-resort scrape — guesses a JW-Player-style 'file=' /
# 'source=' URL out of an arbitrary page.
1855 def _real_extract(self, url):
1856 # At this point we have a new video
1857 self._downloader.increment_downloads()
1859 video_id = url.split('/')[-1]
1860 request = urllib2.Request(url)
1862 self.report_download_webpage(video_id)
1863 webpage = urllib2.urlopen(request).read()
1864 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1865 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1867 except ValueError, err:
1868 # since this is the last-resort InfoExtractor, if
1869 # this error is thrown, it'll be thrown here
1870 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1873 self.report_extraction(video_id)
1874 # Start with something easy: JW Player in SWFObject
1875 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1877 # Broaden the search a little bit
1878 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1880 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1883 # It's possible that one of the regexes
1884 # matched, but returned an empty group:
1885 if mobj.group(1) is None:
1886 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1889 video_url = urllib.unquote(mobj.group(1))
1890 video_id = os.path.basename(video_url)
1892 # here's a fun little line of code for you:
1893 video_extension = os.path.splitext(video_id)[1][1:]
1894 video_id = os.path.splitext(video_id)[0]
1896 # it's tempting to parse this further, but you would
1897 # have to take into account all the variations like
1898 # Video Title - Site Name
1899 # Site Name | Video Title
1900 # Video Title - Tagline | Site Name
1901 # and so on and so forth; it's just not practical
1902 mobj = re.search(r'<title>(.*)</title>', webpage)
1904 self._downloader.trouble(u'ERROR: unable to extract title')
1906 video_title = mobj.group(1).decode('utf-8')
1907 video_title = sanitize_title(video_title)
1908 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1910 # video uploader is domain name
# NOTE(review): the failure message below says 'unable to extract title' but this
# match is for the uploader/domain — looks copy-pasted; confirm before changing
# (doc-only pass).
1911 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1913 self._downloader.trouble(u'ERROR: unable to extract title')
1915 video_uploader = mobj.group(1).decode('utf-8')
1918 # Process video information
1919 self._downloader.process_info({
1920 'id': video_id.decode('utf-8'),
1921 'url': video_url.decode('utf-8'),
1922 'uploader': video_uploader,
1923 'upload_date': u'NA',
1924 'title': video_title,
1925 'stitle': simple_title,
1926 'ext': video_extension.decode('utf-8'),
1930 except UnavailableVideoError, err:
1931 self._downloader.trouble(u'\nERROR: unable to download video')
1934 class YoutubeSearchIE(InfoExtractor):
1935 """Information Extractor for YouTube search queries."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
# Query prefix grammar: 'ytsearch:' = 1 result, 'ytsearchall:' = max results,
# 'ytsearchN:' = N results (capped at _max_youtube_results).
1936 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1937 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1938 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1939 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1941 _max_youtube_results = 1000
1943 def __init__(self, youtube_ie, downloader=None):
1944 InfoExtractor.__init__(self, downloader)
1945 self._youtube_ie = youtube_ie
1949 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1951 def report_download_page(self, query, pagenum):
1952 """Report attempt to download playlist page with given number."""
1953 query = query.decode(preferredencoding())
1954 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1956 def _real_initialize(self):
1957 self._youtube_ie.initialize()
# _real_extract: parses the prefix to a result count and delegates to
# _download_n_results; an unparseable numeric prefix falls back to 1 result.
1959 def _real_extract(self, query):
1960 mobj = re.match(self._VALID_QUERY, query)
1962 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1965 prefix, query = query.split(':')
1967 query = query.encode('utf-8')
1969 self._download_n_results(query, 1)
1971 elif prefix == 'all':
1972 self._download_n_results(query, self._max_youtube_results)
1978 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1980 elif n > self._max_youtube_results:
1981 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1982 n = self._max_youtube_results
1983 self._download_n_results(query, n)
1985 except ValueError: # parsing prefix as integer fails
1986 self._download_n_results(query, 1)
1989 def _download_n_results(self, query, n):
1990 """Downloads a specified number of results for a query"""
1991 # Pages are fetched until n unique ids are collected or no 'Next' link remains;
1992 # each collected id is then handed to the YouTube extractor.
1993 already_seen = set()
1997 self.report_download_page(query, pagenum)
1998 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1999 request = urllib2.Request(result_url)
2001 page = urllib2.urlopen(request).read()
2002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2003 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2006 # Extract video identifiers
2007 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# The id is sliced out of href="/watch?v=..." — split on '=' and drop the trailing quote.
2008 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2009 if video_id not in already_seen:
2010 video_ids.append(video_id)
2011 already_seen.add(video_id)
2012 if len(video_ids) == n:
2013 # Specified n videos reached
2014 for id in video_ids:
2015 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2018 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2019 for id in video_ids:
2020 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2023 pagenum = pagenum + 1
2025 class GoogleSearchIE(InfoExtractor):
2026 """Information Extractor for Google Video search queries."""
2027 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2028 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2029 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2030 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2032 _max_google_results = 1000
2034 def __init__(self, google_ie, downloader=None):
2035 InfoExtractor.__init__(self, downloader)
2036 self._google_ie = google_ie
2040 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2042 def report_download_page(self, query, pagenum):
2043 """Report attempt to download playlist page with given number."""
2044 query = query.decode(preferredencoding())
2045 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2047 def _real_initialize(self):
2048 self._google_ie.initialize()
2050 def _real_extract(self, query):
2051 mobj = re.match(self._VALID_QUERY, query)
2053 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2056 prefix, query = query.split(':')
2058 query = query.encode('utf-8')
2060 self._download_n_results(query, 1)
2062 elif prefix == 'all':
2063 self._download_n_results(query, self._max_google_results)
2069 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2071 elif n > self._max_google_results:
2072 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2073 n = self._max_google_results
2074 self._download_n_results(query, n)
2076 except ValueError: # parsing prefix as integer fails
2077 self._download_n_results(query, 1)
2080 def _download_n_results(self, query, n):
2081 """Downloads a specified number of results for a query"""
2084 already_seen = set()
2088 self.report_download_page(query, pagenum)
2089 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2090 request = urllib2.Request(result_url)
2092 page = urllib2.urlopen(request).read()
2093 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2094 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2097 # Extract video identifiers
2098 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2099 video_id = mobj.group(1)
2100 if video_id not in already_seen:
2101 video_ids.append(video_id)
2102 already_seen.add(video_id)
2103 if len(video_ids) == n:
2104 # Specified n videos reached
2105 for id in video_ids:
2106 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2109 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2110 for id in video_ids:
2111 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2114 pagenum = pagenum + 1
2116 class YahooSearchIE(InfoExtractor):
2117 """Information Extractor for Yahoo! Video search queries."""
2118 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2119 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2120 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2121 _MORE_PAGES_INDICATOR = r'\s*Next'
2123 _max_yahoo_results = 1000
2125 def __init__(self, yahoo_ie, downloader=None):
2126 InfoExtractor.__init__(self, downloader)
2127 self._yahoo_ie = yahoo_ie
2131 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2133 def report_download_page(self, query, pagenum):
2134 """Report attempt to download playlist page with given number."""
2135 query = query.decode(preferredencoding())
2136 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2138 def _real_initialize(self):
2139 self._yahoo_ie.initialize()
2141 def _real_extract(self, query):
2142 mobj = re.match(self._VALID_QUERY, query)
2144 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2147 prefix, query = query.split(':')
2149 query = query.encode('utf-8')
2151 self._download_n_results(query, 1)
2153 elif prefix == 'all':
2154 self._download_n_results(query, self._max_yahoo_results)
2160 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2162 elif n > self._max_yahoo_results:
2163 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2164 n = self._max_yahoo_results
2165 self._download_n_results(query, n)
2167 except ValueError: # parsing prefix as integer fails
2168 self._download_n_results(query, 1)
2171 def _download_n_results(self, query, n):
2172 """Downloads a specified number of results for a query"""
2175 already_seen = set()
2179 self.report_download_page(query, pagenum)
2180 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2181 request = urllib2.Request(result_url)
2183 page = urllib2.urlopen(request).read()
2184 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2185 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2188 # Extract video identifiers
2189 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2190 video_id = mobj.group(1)
2191 if video_id not in already_seen:
2192 video_ids.append(video_id)
2193 already_seen.add(video_id)
2194 if len(video_ids) == n:
2195 # Specified n videos reached
2196 for id in video_ids:
2197 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2200 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2201 for id in video_ids:
2202 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2205 pagenum = pagenum + 1
2207 class YoutubePlaylistIE(InfoExtractor):
2208 """Information Extractor for YouTube playlists."""
2210 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
2211 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2212 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2213 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2216 def __init__(self, youtube_ie, downloader=None):
2217 InfoExtractor.__init__(self, downloader)
2218 self._youtube_ie = youtube_ie
2222 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2224 def report_download_page(self, playlist_id, pagenum):
2225 """Report attempt to download playlist page with given number."""
2226 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2228 def _real_initialize(self):
2229 self._youtube_ie.initialize()
2231 def _real_extract(self, url):
2232 # Extract playlist id
2233 mobj = re.match(self._VALID_URL, url)
2235 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2238 # Download playlist pages
2239 playlist_id = mobj.group(1)
2244 self.report_download_page(playlist_id, pagenum)
2245 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2247 page = urllib2.urlopen(request).read()
2248 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2249 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2252 # Extract video identifiers
2254 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2255 if mobj.group(1) not in ids_in_page:
2256 ids_in_page.append(mobj.group(1))
2257 video_ids.extend(ids_in_page)
2259 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2261 pagenum = pagenum + 1
2263 playliststart = self._downloader.params.get('playliststart', 1) - 1
2264 playlistend = self._downloader.params.get('playlistend', -1)
2265 video_ids = video_ids[playliststart:playlistend]
2267 for id in video_ids:
2268 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2271 class YoutubeUserIE(InfoExtractor):
2272 """Information Extractor for YouTube users."""
2274 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2275 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2276 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2279 def __init__(self, youtube_ie, downloader=None):
2280 InfoExtractor.__init__(self, downloader)
2281 self._youtube_ie = youtube_ie
2285 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2287 def report_download_page(self, username):
2288 """Report attempt to download user page."""
2289 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2291 def _real_initialize(self):
2292 self._youtube_ie.initialize()
2294 def _real_extract(self, url):
2296 mobj = re.match(self._VALID_URL, url)
2298 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2301 # Download user page
2302 username = mobj.group(1)
2306 self.report_download_page(username)
2307 request = urllib2.Request(self._TEMPLATE_URL % (username))
2309 page = urllib2.urlopen(request).read()
2310 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2311 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2314 # Extract video identifiers
2317 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2318 if mobj.group(1) not in ids_in_page:
2319 ids_in_page.append(mobj.group(1))
2320 video_ids.extend(ids_in_page)
2322 playliststart = self._downloader.params.get('playliststart', 1) - 1
2323 playlistend = self._downloader.params.get('playlistend', -1)
2324 video_ids = video_ids[playliststart:playlistend]
2326 for id in video_ids:
2327 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2330 class DepositFilesIE(InfoExtractor):
2331 """Information extractor for depositfiles.com"""
2333 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2335 def __init__(self, downloader=None):
2336 InfoExtractor.__init__(self, downloader)
2340 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2342 def report_download_webpage(self, file_id):
2343 """Report webpage download."""
2344 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2346 def report_extraction(self, file_id):
2347 """Report information extraction."""
2348 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2350 def _real_initialize(self):
2353 def _real_extract(self, url):
2354 # At this point we have a new file
2355 self._downloader.increment_downloads()
2357 file_id = url.split('/')[-1]
2358 # Rebuild url in english locale
2359 url = 'http://depositfiles.com/en/files/' + file_id
2361 # Retrieve file webpage with 'Free download' button pressed
2362 free_download_indication = { 'gateway_result' : '1' }
2363 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2365 self.report_download_webpage(file_id)
2366 webpage = urllib2.urlopen(request).read()
2367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2368 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2371 # Search for the real file URL
2372 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2373 if (mobj is None) or (mobj.group(1) is None):
2374 # Try to figure out reason of the error.
2375 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2376 if (mobj is not None) and (mobj.group(1) is not None):
2377 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2378 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2380 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2383 file_url = mobj.group(1)
2384 file_extension = os.path.splitext(file_url)[1][1:]
2386 # Search for file title
2387 mobj = re.search(r'<b title="(.*?)">', webpage)
2389 self._downloader.trouble(u'ERROR: unable to extract title')
2391 file_title = mobj.group(1).decode('utf-8')
2394 # Process file information
2395 self._downloader.process_info({
2396 'id': file_id.decode('utf-8'),
2397 'url': file_url.decode('utf-8'),
2399 'upload_date': u'NA',
2400 'title': file_title,
2401 'stitle': file_title,
2402 'ext': file_extension.decode('utf-8'),
2406 except UnavailableVideoError, err:
2407 self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	one.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		it was called from.
		"""
		return information # by default, do nothing
2455 ### MAIN PROGRAM ###
2456 if __name__ == '__main__':
2458 # Modules needed only when running the main program
2462 # Function to update the program file with the latest version from the repository.
2463 def update_self(downloader, filename):
2464 # Note: downloader only used for options
2465 if not os.access(filename, os.W_OK):
2466 sys.exit('ERROR: no write permissions on %s' % filename)
2468 downloader.to_screen('Updating to latest stable version...')
2470 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2471 latest_version = urllib.urlopen(latest_url).read().strip()
2472 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2473 newcontent = urllib.urlopen(prog_url).read()
2474 except (IOError, OSError), err:
2475 sys.exit('ERROR: unable to download latest version')
2477 stream = open(filename, 'w')
2478 stream.write(newcontent)
2480 except (IOError, OSError), err:
2481 sys.exit('ERROR: unable to overwrite current version')
2482 downloader.to_screen('Updated to version %s' % latest_version)
2484 # Parse command line
2485 parser = optparse.OptionParser(
2486 usage='Usage: %prog [options] url...',
2487 version='2010.12.09',
2488 conflict_handler='resolve',
2491 parser.add_option('-h', '--help',
2492 action='help', help='print this help text and exit')
2493 parser.add_option('-v', '--version',
2494 action='version', help='print program version and exit')
2495 parser.add_option('-U', '--update',
2496 action='store_true', dest='update_self', help='update this program to latest stable version')
2497 parser.add_option('-i', '--ignore-errors',
2498 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2499 parser.add_option('-r', '--rate-limit',
2500 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2501 parser.add_option('-R', '--retries',
2502 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2503 parser.add_option('--playlist-start',
2504 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2505 parser.add_option('--playlist-end',
2506 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2507 parser.add_option('--dump-user-agent',
2508 action='store_true', dest='dump_user_agent',
2509 help='display the current browser identification', default=False)
2511 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2512 authentication.add_option('-u', '--username',
2513 dest='username', metavar='USERNAME', help='account username')
2514 authentication.add_option('-p', '--password',
2515 dest='password', metavar='PASSWORD', help='account password')
2516 authentication.add_option('-n', '--netrc',
2517 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2518 parser.add_option_group(authentication)
2520 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2521 video_format.add_option('-f', '--format',
2522 action='store', dest='format', metavar='FORMAT', help='video format code')
2523 video_format.add_option('--all-formats',
2524 action='store_const', dest='format', help='download all available video formats', const='-1')
2525 video_format.add_option('--max-quality',
2526 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2527 parser.add_option_group(video_format)
2529 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2530 verbosity.add_option('-q', '--quiet',
2531 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2532 verbosity.add_option('-s', '--simulate',
2533 action='store_true', dest='simulate', help='do not download video', default=False)
2534 verbosity.add_option('-g', '--get-url',
2535 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2536 verbosity.add_option('-e', '--get-title',
2537 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2538 verbosity.add_option('--get-thumbnail',
2539 action='store_true', dest='getthumbnail',
2540 help='simulate, quiet but print thumbnail URL', default=False)
2541 verbosity.add_option('--get-description',
2542 action='store_true', dest='getdescription',
2543 help='simulate, quiet but print video description', default=False)
2544 verbosity.add_option('--get-filename',
2545 action='store_true', dest='getfilename',
2546 help='simulate, quiet but print output filename', default=False)
2547 verbosity.add_option('--no-progress',
2548 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2549 verbosity.add_option('--console-title',
2550 action='store_true', dest='consoletitle',
2551 help='display progress in console titlebar', default=False)
2552 parser.add_option_group(verbosity)
2554 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2555 filesystem.add_option('-t', '--title',
2556 action='store_true', dest='usetitle', help='use title in file name', default=False)
2557 filesystem.add_option('-l', '--literal',
2558 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2559 filesystem.add_option('-A', '--auto-number',
2560 action='store_true', dest='autonumber',
2561 help='number downloaded files starting from 00000', default=False)
2562 filesystem.add_option('-o', '--output',
2563 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2564 filesystem.add_option('-a', '--batch-file',
2565 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2566 filesystem.add_option('-w', '--no-overwrites',
2567 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2568 filesystem.add_option('-c', '--continue',
2569 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2570 filesystem.add_option('--cookies',
2571 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2572 filesystem.add_option('--no-part',
2573 action='store_true', dest='nopart', help='do not use .part files', default=False)
2574 filesystem.add_option('--no-mtime',
2575 action='store_false', dest='updatetime',
2576 help='do not use the Last-modified header to set the file modification time', default=True)
2577 parser.add_option_group(filesystem)
2579 (opts, args) = parser.parse_args()
2581 # Open appropriate CookieJar
2582 if opts.cookiefile is None:
2583 jar = cookielib.CookieJar()
2586 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2587 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2589 except (IOError, OSError), err:
2590 sys.exit(u'ERROR: unable to open cookie file')
2593 if opts.dump_user_agent:
2594 print std_headers['User-Agent']
2597 # General configuration
2598 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2599 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2600 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2602 # Batch file verification
2604 if opts.batchfile is not None:
2606 if opts.batchfile == '-':
2609 batchfd = open(opts.batchfile, 'r')
2610 batchurls = batchfd.readlines()
2611 batchurls = [x.strip() for x in batchurls]
2612 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2614 sys.exit(u'ERROR: batch file could not be read')
2615 all_urls = batchurls + args
2617 # Conflicting, missing and erroneous options
2618 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2619 parser.error(u'using .netrc conflicts with giving username/password')
2620 if opts.password is not None and opts.username is None:
2621 parser.error(u'account username missing')
2622 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2623 parser.error(u'using output template conflicts with using title, literal title or auto number')
2624 if opts.usetitle and opts.useliteral:
2625 parser.error(u'using title conflicts with using literal title')
2626 if opts.username is not None and opts.password is None:
2627 opts.password = getpass.getpass(u'Type account password and press return:')
2628 if opts.ratelimit is not None:
2629 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2630 if numeric_limit is None:
2631 parser.error(u'invalid rate limit specified')
2632 opts.ratelimit = numeric_limit
2633 if opts.retries is not None:
2635 opts.retries = long(opts.retries)
2636 except (TypeError, ValueError), err:
2637 parser.error(u'invalid retry count specified')
2639 opts.playliststart = long(opts.playliststart)
2640 if opts.playliststart <= 0:
2642 except (TypeError, ValueError), err:
2643 parser.error(u'invalid playlist start number specified')
2645 opts.playlistend = long(opts.playlistend)
2646 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2648 except (TypeError, ValueError), err:
2649 parser.error(u'invalid playlist end number specified')
2651 # Information extractors
2652 vimeo_ie = VimeoIE()
2653 youtube_ie = YoutubeIE()
2654 metacafe_ie = MetacafeIE(youtube_ie)
2655 dailymotion_ie = DailymotionIE()
2656 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2657 youtube_user_ie = YoutubeUserIE(youtube_ie)
2658 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2659 google_ie = GoogleIE()
2660 google_search_ie = GoogleSearchIE(google_ie)
2661 photobucket_ie = PhotobucketIE()
2662 yahoo_ie = YahooIE()
2663 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2664 deposit_files_ie = DepositFilesIE()
2665 generic_ie = GenericIE()
2668 fd = FileDownloader({
2669 'usenetrc': opts.usenetrc,
2670 'username': opts.username,
2671 'password': opts.password,
2672 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2673 'forceurl': opts.geturl,
2674 'forcetitle': opts.gettitle,
2675 'forcethumbnail': opts.getthumbnail,
2676 'forcedescription': opts.getdescription,
2677 'forcefilename': opts.getfilename,
2678 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2679 'format': opts.format,
2680 'format_limit': opts.format_limit,
2681 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2682 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2683 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2684 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2685 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2686 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2687 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2688 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2689 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2690 or u'%(id)s.%(ext)s'),
2691 'ignoreerrors': opts.ignoreerrors,
2692 'ratelimit': opts.ratelimit,
2693 'nooverwrites': opts.nooverwrites,
2694 'retries': opts.retries,
2695 'continuedl': opts.continue_dl,
2696 'noprogress': opts.noprogress,
2697 'playliststart': opts.playliststart,
2698 'playlistend': opts.playlistend,
2699 'logtostderr': opts.outtmpl == '-',
2700 'consoletitle': opts.consoletitle,
2701 'nopart': opts.nopart,
2702 'updatetime': opts.updatetime,
2704 fd.add_info_extractor(vimeo_ie)
2705 fd.add_info_extractor(youtube_search_ie)
2706 fd.add_info_extractor(youtube_pl_ie)
2707 fd.add_info_extractor(youtube_user_ie)
2708 fd.add_info_extractor(metacafe_ie)
2709 fd.add_info_extractor(dailymotion_ie)
2710 fd.add_info_extractor(youtube_ie)
2711 fd.add_info_extractor(google_ie)
2712 fd.add_info_extractor(google_search_ie)
2713 fd.add_info_extractor(photobucket_ie)
2714 fd.add_info_extractor(yahoo_ie)
2715 fd.add_info_extractor(yahoo_search_ie)
2716 fd.add_info_extractor(deposit_files_ie)
2718 # This must come last since it's the
2719 # fallback if none of the others work
2720 fd.add_info_extractor(generic_ie)
2723 if opts.update_self:
2724 update_self(fd, sys.argv[0])
2727 if len(all_urls) < 1:
2728 if not opts.update_self:
2729 parser.error(u'you must provide at least one URL')
2732 retcode = fd.download(all_urls)
2734 # Dump cookie jar if requested
2735 if opts.cookiefile is not None:
2738 except (IOError, OSError), err:
2739 sys.exit(u'ERROR: unable to save cookie jar')
2743 except DownloadError:
2745 except SameFileError:
2746 sys.exit(u'ERROR: fixed output name but more than one file to download')
2747 except KeyboardInterrupt:
2748 sys.exit(u'\nERROR: Interrupted by user')