2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
23 # parse_qs was moved from the cgi module to the urlparse module recently.
25 from urlparse import parse_qs
27 from cgi import parse_qs
30 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.7) Gecko/20100720 Firefox/3.6.7',
31 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
32 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
33 'Accept-Language': 'en-us,en;q=0.5',
36 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# NOTE(review): body partially reconstructed; the original wrapped this
	# in a one-shot generator — flattened here, the returned value is the same.
	try:
		pref = locale.getpreferredencoding()
		# Make sure the preferred encoding actually works for unicode text
		u'TEST'.encode(pref)
	except:
		pref = 'UTF-8'
	return pref
54 def htmlentity_transform(matchobj):
55 """Transforms an HTML entity to a Unicode character.
57 This function receives a match object and is intended to be used with
58 the re.sub() function.
60 entity = matchobj.group(1)
62 # Known non-numeric HTML entity
63 if entity in htmlentitydefs.name2codepoint:
64 return unichr(htmlentitydefs.name2codepoint[entity])
67 mobj = re.match(ur'(?u)#(x?\d+)', entity)
69 numstr = mobj.group(1)
70 if numstr.startswith(u'x'):
72 numstr = u'0%s' % numstr
75 return unichr(long(numstr, base))
77 # Unknown entity in name, return its literal representation
78 return (u'&%s;' % entity)
80 def sanitize_title(utitle):
81 """Sanitizes a video title so it could be used as part of a filename."""
82 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83 return utitle.replace(unicode(os.sep), u'%')
85 def sanitize_open(filename, open_mode):
86 """Try to open the given filename, and slightly tweak it if this fails.
88 Attempts to open the given filename. If this fails, it tries to change
89 the filename slightly, step by step, until it's either able to open it
90 or it fails and raises a final exception, like the standard open()
93 It returns the tuple (stream, definitive_file_name).
97 return (sys.stdout, filename)
98 stream = open(filename, open_mode)
99 return (stream, filename)
100 except (IOError, OSError), err:
101 # In case of error, try to remove win32 forbidden chars
102 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
104 # An exception here should be caught in the caller
105 stream = open(filename, open_mode)
106 return (stream, filename)
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both attributes are byte counts
	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
157 class FileDownloader(object):
158 """File Downloader class.
160 File downloader objects are the ones responsible of downloading the
161 actual video file and writing it to disk if the user has requested
162 it, among some other tasks. In most cases there should be one per
163 program. As, given a video URL, the downloader doesn't know how to
164 extract all the needed information, task that InfoExtractors do, it
165 has to pass the URL to one of them.
167 For this, file downloader objects have a method that allows
168 InfoExtractors to be registered in a given order. When it is passed
169 a URL, the file downloader handles it to the first InfoExtractor it
170 finds that reports being able to handle it. The InfoExtractor extracts
171 all the information about the video or videos the URL refers to, and
172 asks the FileDownloader to process the video information, possibly
173 downloading the video.
175 File downloaders accept a lot of parameters. In order not to saturate
176 the object constructor with arguments, it receives a dictionary of
177 options instead. These options are available through the params
178 attribute for the InfoExtractors to use. The FileDownloader also
179 registers itself as the downloader in charge for the InfoExtractors
180 that are added to it, so this is a "mutual registration".
184 username: Username for authentication purposes.
185 password: Password for authentication purposes.
186 usenetrc: Use netrc for authentication instead.
187 quiet: Do not print messages to stdout.
188 forceurl: Force printing final URL.
189 forcetitle: Force printing title.
190 simulate: Do not download the video files.
191 format: Video format code.
192 format_limit: Highest quality format to try.
193 outtmpl: Template for output names.
194 ignoreerrors: Do not stop on download errors.
195 ratelimit: Download speed limit, in bytes/sec.
196 nooverwrites: Prevent overwriting files.
197 retries: Number of times to retry for HTTP error 503
198 continuedl: Try to continue downloads if possible.
199 noprogress: Do not print the progress bar.
205 _download_retcode = None
206 _num_downloads = None
208 def __init__(self, params):
209 """Create a FileDownloader object with the given options."""
212 self._download_retcode = 0
213 self._num_downloads = 0
@staticmethod
def pmkdir(filename):
	"""Create directory components in filename. Similar to Unix "mkdir -p"."""
	components = filename.split(os.sep)
	# Every ancestor path of the file, each one terminated with the separator
	ancestors = [os.sep.join(components[0:x]) + os.sep for x in xrange(1, len(components))]
	for dirname in ancestors:  # renamed: original shadowed the builtin 'dir'
		if not os.path.exists(dirname):
			os.mkdir(dirname)
@staticmethod
def format_bytes(bytes):
	"""Return a human-readable string (e.g. '5.50k') for a byte count."""
	# NOTE(review): the None/zero guards are reconstructed from a partially
	# elided original; confirm against the full file.
	if bytes is None:
		return 'N/A'
	if type(bytes) is str:
		bytes = float(bytes)
	if bytes == 0.0:
		exp = 0
	else:
		exp = long(math.log(bytes, 1024.0))
	converted = float(bytes) / float(1024 ** exp)
	return '%.2f%s' % (converted, 'bkMGTPEZY'[exp])
@staticmethod
def calc_percent(byte_counter, data_len):
	"""Return download progress as a right-aligned percentage string."""
	if data_len is None:
		return '---.-%'
	percent = float(byte_counter) / float(data_len) * 100.0
	return '%6s' % ('%3.1f%%' % percent)
@staticmethod
def calc_eta(start, now, total, current):
	"""Estimate remaining download time as 'MM:SS', or '--:--' if unknown."""
	# NOTE(review): the '--:--' fallback branches are reconstructed from a
	# partially elided original; confirm against the full file.
	if total is None:
		return '--:--'
	dif = now - start
	if current == 0 or dif < 0.001: # One millisecond
		return '--:--'
	rate = float(current) / dif
	eta = long((float(total) - float(current)) / rate)
	(eta_mins, eta_secs) = divmod(eta, 60)
	if eta_mins > 99:
		return '--:--'
	return '%02d:%02d' % (eta_mins, eta_secs)
@staticmethod
def calc_speed(start, now, bytes):
	"""Format the average speed since `start` as a right-aligned string."""
	dif = now - start
	if bytes == 0 or dif < 0.001: # One millisecond
		return '%10s' % '---b/s'
	speed = FileDownloader.format_bytes(float(bytes) / dif)
	return '%10s' % ('%s/s' % speed)
@staticmethod
def best_block_size(elapsed_time, bytes):
	"""Pick the next read size from the last chunk's observed rate.

	The result is clamped between half and double the previous chunk size,
	and never exceeds 4 MB.
	"""
	new_min = max(bytes / 2.0, 1.0)
	new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
	if elapsed_time < 0.001:
		return long(new_max)
	rate = bytes / elapsed_time
	if rate > new_max:
		return long(new_max)
	if rate < new_min:
		return long(new_min)
	return long(rate)
@staticmethod
def parse_bytes(bytestr):
	"""Parse a string indicating a byte quantity into a long integer."""
	matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
	if matchobj is None:
		return None
	number = float(matchobj.group(1))
	# An empty suffix yields str.index('') == 0, i.e. a multiplier of 1
	multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
	return long(round(number * multiplier))
def add_info_extractor(self, ie):
	"""Add an InfoExtractor object to the end of the list."""
	# Mutual registration: we keep the IE, the IE keeps us as its downloader
	self._ies.append(ie)
	ie.set_downloader(self)
def add_post_processor(self, pp):
	"""Add a PostProcessor object to the end of the chain."""
	self._pps.append(pp)
	pp.set_downloader(self)
300 def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
301 """Print message to stdout if not in quiet mode."""
303 if not self.params.get('quiet', False):
304 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
306 except (UnicodeEncodeError), err:
307 if not ignore_encoding_errors:
def to_stderr(self, message):
	"""Print message to stderr."""
	sys.stderr.write(message.encode(preferredencoding()) + '\n')
314 def fixed_template(self):
315 """Checks if the output template is fixed."""
316 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
	"""Determine action to take when a download problem appears.

	Depending on if the downloader has been configured to ignore
	download errors or not, this method may throw an exception or
	not when errors are found, after printing the message.
	"""
	if message is not None:
		self.to_stderr(message)
	ignore = self.params.get('ignoreerrors', False)
	if not ignore:
		raise DownloadError(message)
	# Errors are being ignored: remember a non-zero exit status instead
	self._download_retcode = 1
def slow_down(self, start_time, byte_counter):
	"""Sleep if the download speed is over the rate limit.

	Sleeps just long enough that, after waking, the average speed since
	start_time is back at the 'ratelimit' parameter (bytes/sec).
	"""
	rate_limit = self.params.get('ratelimit', None)
	if rate_limit is None or byte_counter == 0:
		return
	now = time.time()
	elapsed = now - start_time
	if elapsed <= 0.0:
		return
	speed = float(byte_counter) / elapsed
	if speed > rate_limit:
		# Reuse 'elapsed' instead of recomputing (now - start_time)
		time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)
def report_destination(self, filename):
	"""Report destination filename."""
	msg = u'[download] Destination: %s' % filename
	self.to_stdout(msg, ignore_encoding_errors=True)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
	"""Report download progress."""
	if self.params.get('noprogress', False):
		return
	msg = u'\r[download] %s of %s at %s ETA %s' % (percent_str, data_len_str, speed_str, eta_str)
	self.to_stdout(msg, skip_eol=True)
def report_resuming_byte(self, resume_len):
	"""Report attempt to resume at given byte."""
	# (docstring typo "attemtp" fixed)
	self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
def report_retry(self, count, retries):
	"""Report retry in case of HTTP error 503"""
	msg = u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_stdout(msg)
363 def report_file_already_downloaded(self, file_name):
364 """Report file has already been fully downloaded."""
366 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
367 except (UnicodeEncodeError), err:
368 self.to_stdout(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Report it was impossible to resume download."""
	self.to_stdout(u'[download] Unable to resume')
def report_finish(self):
	"""Report download finished."""
	# NOTE(review): the else branch (blank line ending the progress display)
	# is reconstructed from a partially elided original; confirm.
	if self.params.get('noprogress', False):
		self.to_stdout(u'[download] Download completed')
	else:
		self.to_stdout(u'')
def increment_downloads(self):
	"""Increment the ordinal that assigns a number to each file."""
	self._num_downloads = self._num_downloads + 1
385 def process_info(self, info_dict):
386 """Process a single dictionary returned by an InfoExtractor."""
387 # Do nothing else if in simulate mode
388 if self.params.get('simulate', False):
390 if self.params.get('forcetitle', False):
391 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
392 if self.params.get('forceurl', False):
393 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
394 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
395 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
396 if self.params.get('forcedescription', False) and 'description' in info_dict:
397 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
402 template_dict = dict(info_dict)
403 template_dict['epoch'] = unicode(long(time.time()))
404 template_dict['ord'] = unicode('%05d' % self._num_downloads)
405 filename = self.params['outtmpl'] % template_dict
406 except (ValueError, KeyError), err:
407 self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
408 if self.params.get('nooverwrites', False) and os.path.exists(filename):
409 self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
413 self.pmkdir(filename)
414 except (OSError, IOError), err:
415 self.trouble('ERROR: unable to create directories: %s' % str(err))
419 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
420 except (OSError, IOError), err:
421 raise UnavailableVideoError
422 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
423 self.trouble('ERROR: unable to download video data: %s' % str(err))
425 except (ContentTooShortError, ), err:
426 self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
431 self.post_process(filename, info_dict)
432 except (PostProcessingError), err:
433 self.trouble('ERROR: postprocessing: %s' % str(err))
436 def download(self, url_list):
437 """Download a given list of URLs."""
438 if len(url_list) > 1 and self.fixed_template():
439 raise SameFileError(self.params['outtmpl'])
442 suitable_found = False
444 # Go to next InfoExtractor if not suitable
445 if not ie.suitable(url):
448 # Suitable InfoExtractor found
449 suitable_found = True
451 # Extract information from URL and process it
454 # Suitable InfoExtractor had been found; go to next URL
457 if not suitable_found:
458 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
460 return self._download_retcode
462 def post_process(self, filename, ie_info):
463 """Run the postprocessing chain on the given file."""
465 info['filepath'] = filename
471 def _download_with_rtmpdump(self, filename, url, player_url):
472 self.report_destination(filename)
474 # Check for rtmpdump first
476 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
477 except (OSError, IOError):
478 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
481 # Download using rtmpdump. rtmpdump returns exit code 2 when
482 # the connection was interrumpted and resuming appears to be
483 # possible. This is part of rtmpdump's normal usage, AFAIK.
484 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
485 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
486 while retval == 2 or retval == 1:
487 prevsize = os.path.getsize(filename)
488 self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
489 time.sleep(5.0) # This seems to be needed
490 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
491 cursize = os.path.getsize(filename)
492 if prevsize == cursize and retval == 1:
495 self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
498 self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
501 def _do_download(self, filename, url, player_url):
502 # Attempt to download using rtmpdump
503 if url.startswith('rtmp'):
504 return self._download_with_rtmpdump(filename, url, player_url)
508 basic_request = urllib2.Request(url, None, std_headers)
509 request = urllib2.Request(url, None, std_headers)
511 # Establish possible resume length
512 if os.path.isfile(filename):
513 resume_len = os.path.getsize(filename)
517 # Request parameters in case of being able to resume
518 if self.params.get('continuedl', False) and resume_len != 0:
519 self.report_resuming_byte(resume_len)
520 request.add_header('Range','bytes=%d-' % resume_len)
524 retries = self.params.get('retries', 0)
525 while count <= retries:
526 # Establish connection
528 data = urllib2.urlopen(request)
530 except (urllib2.HTTPError, ), err:
531 if err.code != 503 and err.code != 416:
532 # Unexpected HTTP error
534 elif err.code == 416:
535 # Unable to resume (requested range not satisfiable)
537 # Open the connection again without the range header
538 data = urllib2.urlopen(basic_request)
539 content_length = data.info()['Content-Length']
540 except (urllib2.HTTPError, ), err:
544 # Examine the reported length
545 if content_length is not None and long(content_length) == resume_len:
546 # The file had already been fully downloaded
547 self.report_file_already_downloaded(filename)
550 # The length does not match, we start the download over
551 self.report_unable_to_resume()
557 self.report_retry(count, retries)
560 self.trouble(u'ERROR: giving up after %s retries' % retries)
563 data_len = data.info().get('Content-length', None)
564 data_len_str = self.format_bytes(data_len)
571 data_block = data.read(block_size)
573 data_block_len = len(data_block)
574 if data_block_len == 0:
576 byte_counter += data_block_len
578 # Open file just in time
581 (stream, filename) = sanitize_open(filename, open_mode)
582 self.report_destination(filename)
583 except (OSError, IOError), err:
584 self.trouble('ERROR: unable to open for writing: %s' % str(err))
587 stream.write(data_block)
588 except (IOError, OSError), err:
589 self.trouble('\nERROR: unable to write data: %s' % str(err))
590 block_size = self.best_block_size(after - before, data_block_len)
593 percent_str = self.calc_percent(byte_counter, data_len)
594 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
595 speed_str = self.calc_speed(start, time.time(), byte_counter)
596 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
599 self.slow_down(start, byte_counter)
602 if data_len is not None and str(byte_counter) != data_len:
603 raise ContentTooShortError(byte_counter, long(data_len))
606 class InfoExtractor(object):
607 """Information Extractor class.
609 Information extractors are the classes that, given a URL, extract
610 information from the video (or videos) the URL refers to. This
611 information includes the real video URL, the video title and simplified
612 title, author and others. The information is stored in a dictionary
613 which is then passed to the FileDownloader. The FileDownloader
614 processes this information possibly downloading the video to the file
615 system, among other possible outcomes. The dictionaries must include
616 the following fields:
618 id: Video identifier.
619 url: Final video URL.
620 uploader: Nickname of the video uploader.
621 title: Literal title.
622 stitle: Simplified title.
623 ext: Video filename extension.
624 format: Video format.
625 player_url: SWF Player URL (may be None).
627 The following fields are optional. Their primary purpose is to allow
628 youtube-dl to serve as the backend for a video search function, such
629 as the one in youtube2mp3. They are only used when their respective
630 forced printing functions are called:
632 thumbnail: Full URL to a video thumbnail image.
633 description: One-line video description.
635 Subclasses of this one should re-define the _real_initialize() and
636 _real_extract() methods, as well as the suitable() static method.
637 Probably, they should also be instantiated and added to the main
644 def __init__(self, downloader=None):
645 """Constructor. Receives an optional downloader."""
647 self.set_downloader(downloader)
651 """Receives a URL and returns True if suitable for this IE."""
654 def initialize(self):
655 """Initializes an instance (authentication, etc)."""
657 self._real_initialize()
660 def extract(self, url):
661 """Extracts URL information and returns it in list of dicts."""
663 return self._real_extract(url)
665 def set_downloader(self, downloader):
666 """Sets the downloader for this IE."""
667 self._downloader = downloader
669 def _real_initialize(self):
670 """Real initialization process. Redefine in subclasses."""
673 def _real_extract(self, url):
674 """Real extraction process. Redefine in subclasses."""
677 class YoutubeIE(InfoExtractor):
678 """Information extractor for youtube.com."""
680 _VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
681 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
682 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
683 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
684 _NETRC_MACHINE = 'youtube'
685 # Listed in order of quality
686 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
687 _video_extensions = {
693 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
700 return (re.match(YoutubeIE._VALID_URL, url) is not None)
702 def report_lang(self):
703 """Report attempt to set language."""
704 self._downloader.to_stdout(u'[youtube] Setting language')
706 def report_login(self):
707 """Report attempt to log in."""
708 self._downloader.to_stdout(u'[youtube] Logging in')
710 def report_age_confirmation(self):
711 """Report attempt to confirm age."""
712 self._downloader.to_stdout(u'[youtube] Confirming age')
714 def report_video_webpage_download(self, video_id):
715 """Report attempt to download video webpage."""
716 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
718 def report_video_info_webpage_download(self, video_id):
719 """Report attempt to download video info webpage."""
720 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
722 def report_information_extraction(self, video_id):
723 """Report attempt to extract video information."""
724 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
726 def report_unavailable_format(self, video_id, format):
727 """Report extracted video URL."""
728 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
730 def report_rtmp_download(self):
731 """Indicate the download will use the RTMP protocol."""
732 self._downloader.to_stdout(u'[youtube] RTMP download detected')
734 def _real_initialize(self):
735 if self._downloader is None:
740 downloader_params = self._downloader.params
742 # Attempt to use provided username and password or .netrc data
743 if downloader_params.get('username', None) is not None:
744 username = downloader_params['username']
745 password = downloader_params['password']
746 elif downloader_params.get('usenetrc', False):
748 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
753 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
754 except (IOError, netrc.NetrcParseError), err:
755 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
759 request = urllib2.Request(self._LANG_URL, None, std_headers)
762 urllib2.urlopen(request).read()
763 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
764 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
767 # No authentication to be performed
773 'current_form': 'loginForm',
775 'action_login': 'Log In',
776 'username': username,
777 'password': password,
779 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
782 login_results = urllib2.urlopen(request).read()
783 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
784 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
786 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
787 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
793 'action_confirm': 'Confirm',
795 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
797 self.report_age_confirmation()
798 age_results = urllib2.urlopen(request).read()
799 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
800 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
803 def _real_extract(self, url):
804 # Extract video id from URL
805 mobj = re.match(self._VALID_URL, url)
807 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
809 video_id = mobj.group(2)
812 self.report_video_webpage_download(video_id)
813 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
815 video_webpage = urllib2.urlopen(request).read()
816 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
817 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
820 # Attempt to extract SWF player URL
821 mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
823 player_url = mobj.group(1)
828 self.report_video_info_webpage_download(video_id)
829 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
830 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
831 % (video_id, el_type))
832 request = urllib2.Request(video_info_url, None, std_headers)
834 video_info_webpage = urllib2.urlopen(request).read()
835 video_info = parse_qs(video_info_webpage)
836 if 'token' in video_info:
838 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
839 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
841 if 'token' not in video_info:
842 if 'reason' in video_info:
843 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
845 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
848 # Start extracting information
849 self.report_information_extraction(video_id)
852 if 'author' not in video_info:
853 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
855 video_uploader = urllib.unquote_plus(video_info['author'][0])
858 if 'title' not in video_info:
859 self._downloader.trouble(u'ERROR: unable to extract video title')
861 video_title = urllib.unquote_plus(video_info['title'][0])
862 video_title = video_title.decode('utf-8')
863 video_title = sanitize_title(video_title)
866 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
867 simple_title = simple_title.strip(ur'_')
870 if 'thumbnail_url' not in video_info:
871 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
873 else: # don't panic if we can't find it
874 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
877 video_description = 'No description available.'
878 if self._downloader.params.get('forcedescription', False):
879 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
881 video_description = mobj.group(1)
884 video_token = urllib.unquote_plus(video_info['token'][0])
886 # Decide which formats to download
887 requested_format = self._downloader.params.get('format', None)
888 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
890 if 'fmt_url_map' in video_info:
891 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
892 format_limit = self._downloader.params.get('format_limit', None)
893 if format_limit is not None and format_limit in self._available_formats:
894 format_list = self._available_formats[self._available_formats.index(format_limit):]
896 format_list = self._available_formats
897 existing_formats = [x for x in format_list if x in url_map]
898 if len(existing_formats) == 0:
899 self._downloader.trouble(u'ERROR: no known formats available for video')
901 if requested_format is None:
902 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
903 elif requested_format == '-1':
904 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
906 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
908 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
909 self.report_rtmp_download()
910 video_url_list = [(None, video_info['conn'][0])]
913 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
916 for format_param, video_real_url in video_url_list:
917 # At this point we have a new video
918 self._downloader.increment_downloads()
921 video_extension = self._video_extensions.get(format_param, 'flv')
923 # Find the video URL in fmt_url_map or conn paramters
925 # Process video information
926 self._downloader.process_info({
927 'id': video_id.decode('utf-8'),
928 'url': video_real_url.decode('utf-8'),
929 'uploader': video_uploader.decode('utf-8'),
930 'title': video_title,
931 'stitle': simple_title,
932 'ext': video_extension.decode('utf-8'),
933 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
934 'thumbnail': video_thumbnail.decode('utf-8'),
935 'description': video_description.decode('utf-8'),
936 'player_url': player_url,
938 except UnavailableVideoError, err:
939 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
942 class MetacafeIE(InfoExtractor):
943 """Information Extractor for metacafe.com."""
945 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
946 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
947 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
950 def __init__(self, youtube_ie, downloader=None):
951 InfoExtractor.__init__(self, downloader)
952 self._youtube_ie = youtube_ie
956 return (re.match(MetacafeIE._VALID_URL, url) is not None)
958 def report_disclaimer(self):
959 """Report disclaimer retrieval."""
960 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
962 def report_age_confirmation(self):
963 """Report attempt to confirm age."""
964 self._downloader.to_stdout(u'[metacafe] Confirming age')
966 def report_download_webpage(self, video_id):
967 """Report webpage download."""
968 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
970 def report_extraction(self, video_id):
971 """Report information extraction."""
972 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
974 def _real_initialize(self):
975 # Retrieve disclaimer
976 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
978 self.report_disclaimer()
979 disclaimer = urllib2.urlopen(request).read()
980 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
981 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
987 'submit': "Continue - I'm over 18",
989 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
991 self.report_age_confirmation()
992 disclaimer = urllib2.urlopen(request).read()
993 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
994 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Extract a Metacafe video: parse id + simplified title from the URL,
# delegate "yt-<id>" videos to the YouTube extractor, otherwise download the
# watch page, scrape mediaURL/title/uploader, and hand the resulting info
# dict to the downloader. (Listing gaps: 'if mobj is None:'/'return' guard
# lines and the 'try:' lines are omitted from this view.)
997 def _real_extract(self, url):
998 # Extract id and simplified title from URL
999 mobj = re.match(self._VALID_URL, url)
1001 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1004 video_id = mobj.group(1)
1006 # Check if video comes from YouTube
1007 mobj2 = re.match(r'^yt-(.*)$', video_id)
1008 if mobj2 is not None:
1009 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1012 # At this point we have a new video
1013 self._downloader.increment_downloads()
1015 simple_title = mobj.group(2).decode('utf-8')
1016 video_extension = 'flv'
1018 # Retrieve video webpage to extract further information
1019 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1021 self.report_download_webpage(video_id)
1022 webpage = urllib2.urlopen(request).read()
1023 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message below reads "unable retrieve" -- missing "to".
1024 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1027 # Extract URL, uploader and title from webpage
1028 self.report_extraction(video_id)
1029 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1031 self._downloader.trouble(u'ERROR: unable to extract media URL')
1033 mediaURL = urllib.unquote(mobj.group(1))
# Dead code kept by the original author: the gdaKey query parameter is no
# longer appended to the media URL.
1035 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1037 # self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1039 #gdaKey = mobj.group(1)
1041 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1043 video_url = mediaURL
1045 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1047 self._downloader.trouble(u'ERROR: unable to extract title')
1049 video_title = mobj.group(1).decode('utf-8')
1050 video_title = sanitize_title(video_title)
1052 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1054 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1056 video_uploader = mobj.group(1)
1059 # Process video information
1060 self._downloader.process_info({
1061 'id': video_id.decode('utf-8'),
1062 'url': video_url.decode('utf-8'),
1063 'uploader': video_uploader.decode('utf-8'),
1064 'title': video_title,
1065 'stitle': simple_title,
1066 'ext': video_extension.decode('utf-8'),
1070 except UnavailableVideoError:
1071 self._downloader.trouble(u'ERROR: unable to download video')
# Extractor for dailymotion.com /video/<id>_<slug> pages: scrapes the flash
# "video" variable for the media URL plus title and uploader from the HTML.
# (Listing gaps: suitable()/try:/return guard lines are omitted from view.)
1074 class DailymotionIE(InfoExtractor):
1075 """Information Extractor for Dailymotion"""
1077 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1079 def __init__(self, downloader=None):
1080 InfoExtractor.__init__(self, downloader)
1084 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1086 def report_download_webpage(self, video_id):
1087 """Report webpage download."""
1088 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1090 def report_extraction(self, video_id):
1091 """Report information extraction."""
1092 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
# No site-wide initialization needed for Dailymotion.
1094 def _real_initialize(self):
1097 def _real_extract(self, url):
1098 # Extract id and simplified title from URL
1099 mobj = re.match(self._VALID_URL, url)
1101 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1104 # At this point we have a new video
1105 self._downloader.increment_downloads()
1106 video_id = mobj.group(1)
1108 simple_title = mobj.group(2).decode('utf-8')
1109 video_extension = 'flv'
1111 # Retrieve video webpage to extract further information
1112 request = urllib2.Request(url)
1114 self.report_download_webpage(video_id)
1115 webpage = urllib2.urlopen(request).read()
1116 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message below reads "unable retrieve" -- missing "to".
1117 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1120 # Extract URL, uploader and title from webpage
1121 self.report_extraction(video_id)
1122 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1124 self._downloader.trouble(u'ERROR: unable to extract media URL')
1126 mediaURL = urllib.unquote(mobj.group(1))
# NOTE(review): the relative-URL case mentioned below is not handled here.
1128 # if needed add http://www.dailymotion.com/ if relative URL
1130 video_url = mediaURL
1132 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1133 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1135 self._downloader.trouble(u'ERROR: unable to extract title')
1137 video_title = mobj.group(1).decode('utf-8')
1138 video_title = sanitize_title(video_title)
1140 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1142 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1144 video_uploader = mobj.group(1)
1147 # Process video information
1148 self._downloader.process_info({
1149 'id': video_id.decode('utf-8'),
1150 'url': video_url.decode('utf-8'),
1151 'uploader': video_uploader.decode('utf-8'),
1152 'title': video_title,
1153 'stitle': simple_title,
1154 'ext': video_extension.decode('utf-8'),
1158 except UnavailableVideoError:
1159 self._downloader.trouble(u'ERROR: unable to download video')
# Extractor for Google Video (video.google.com) across its country TLDs.
# Prefers the mp4 download_url; falls back to the escaped flv videoUrl.
# (Listing gaps: try:/return and 'if mobj is None:' guard lines omitted.)
1161 class GoogleIE(InfoExtractor):
1162 """Information extractor for video.google.com."""
1164 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1166 def __init__(self, downloader=None):
1167 InfoExtractor.__init__(self, downloader)
1171 return (re.match(GoogleIE._VALID_URL, url) is not None)
1173 def report_download_webpage(self, video_id):
1174 """Report webpage download."""
1175 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1177 def report_extraction(self, video_id):
1178 """Report information extraction."""
1179 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
# No site-wide initialization needed.
1181 def _real_initialize(self):
1184 def _real_extract(self, url):
1185 # Extract id from URL
1186 mobj = re.match(self._VALID_URL, url)
1188 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1191 # At this point we have a new video
1192 self._downloader.increment_downloads()
1193 video_id = mobj.group(1)
1195 video_extension = 'mp4'
1197 # Retrieve video webpage to extract further information
1198 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1200 self.report_download_webpage(video_id)
1201 webpage = urllib2.urlopen(request).read()
1202 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1203 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1206 # Extract URL, uploader, and title from webpage
1207 self.report_extraction(video_id)
# First try the direct mp4 download_url embedded in the page; if absent,
# fall back to the flv videoUrl, which is \xNN-escaped in the page source.
1208 mobj = re.search(r"download_url:'([^']+)'", webpage)
1210 video_extension = 'flv'
1211 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1213 self._downloader.trouble(u'ERROR: unable to extract media URL')
1215 mediaURL = urllib.unquote(mobj.group(1))
# Undo the page's literal "\x3d"/"\x26" escapes ('=' and '&').
1216 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1217 mediaURL = mediaURL.replace('\\x26', '\x26')
1219 video_url = mediaURL
1221 mobj = re.search(r'<title>(.*)</title>', webpage)
1223 self._downloader.trouble(u'ERROR: unable to extract title')
1225 video_title = mobj.group(1).decode('utf-8')
1226 video_title = sanitize_title(video_title)
1227 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1229 # Extract video description
1230 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1232 self._downloader.trouble(u'ERROR: unable to extract video description')
1234 video_description = mobj.group(1).decode('utf-8')
1235 if not video_description:
1236 video_description = 'No description available.'
# Thumbnail requires a second page fetch, so it is only done on demand.
1238 # Extract video thumbnail
1239 if self._downloader.params.get('forcethumbnail', False):
1240 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1242 webpage = urllib2.urlopen(request).read()
1243 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1244 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1246 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1248 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1250 video_thumbnail = mobj.group(1)
1251 else: # we need something to pass to process_info
1252 video_thumbnail = ''
1256 # Process video information
1257 self._downloader.process_info({
1258 'id': video_id.decode('utf-8'),
1259 'url': video_url.decode('utf-8'),
1261 'title': video_title,
1262 'stitle': simple_title,
1263 'ext': video_extension.decode('utf-8'),
1267 except UnavailableVideoError:
1268 self._downloader.trouble(u'ERROR: unable to download video')
# Extractor for photobucket.com flv links referenced via a "current=" query
# parameter; title and uploader both come from the page <title>.
# (Listing gaps: try:/return and guard lines omitted from view.)
1271 class PhotobucketIE(InfoExtractor):
1272 """Information extractor for photobucket.com."""
1274 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1276 def __init__(self, downloader=None):
1277 InfoExtractor.__init__(self, downloader)
1281 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1283 def report_download_webpage(self, video_id):
1284 """Report webpage download."""
1285 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1287 def report_extraction(self, video_id):
1288 """Report information extraction."""
1289 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
# No site-wide initialization needed.
1291 def _real_initialize(self):
1294 def _real_extract(self, url):
1295 # Extract id from URL
1296 mobj = re.match(self._VALID_URL, url)
1298 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1301 # At this point we have a new video
1302 self._downloader.increment_downloads()
1303 video_id = mobj.group(1)
1305 video_extension = 'flv'
1307 # Retrieve video webpage to extract further information
1308 request = urllib2.Request(url)
1310 self.report_download_webpage(video_id)
1311 webpage = urllib2.urlopen(request).read()
1312 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1313 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1316 # Extract URL, uploader, and title from webpage
1317 self.report_extraction(video_id)
1318 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1320 self._downloader.trouble(u'ERROR: unable to extract media URL')
1322 mediaURL = urllib.unquote(mobj.group(1))
1324 video_url = mediaURL
# The <title> encodes both the video title (group 1) and uploader (group 2).
1326 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1328 self._downloader.trouble(u'ERROR: unable to extract title')
1330 video_title = mobj.group(1).decode('utf-8')
1331 video_title = sanitize_title(video_title)
1332 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1334 video_uploader = mobj.group(2).decode('utf-8')
1337 # Process video information
1338 self._downloader.process_info({
1339 'id': video_id.decode('utf-8'),
1340 'url': video_url.decode('utf-8'),
1341 'uploader': video_uploader,
1342 'title': video_title,
1343 'stitle': simple_title,
1344 'ext': video_extension.decode('utf-8'),
1348 except UnavailableVideoError:
1349 self._downloader.trouble(u'ERROR: unable to download video')
# Extractor for Yahoo! Video. Non-/watch/ URLs are first rewritten to the
# canonical /watch/<vid>/<id> form (via a recursive _real_extract call),
# then metadata is scraped from the watch page and the media URL is taken
# from the playlist XML service. (Listing gaps: try:/return/guard lines
# omitted from this view.)
1352 class YahooIE(InfoExtractor):
1353 """Information extractor for video.yahoo.com."""
1355 # _VALID_URL matches all Yahoo! Video URLs
1356 # _VPAGE_URL matches only the extractable '/watch/' URLs
1357 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1358 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1360 def __init__(self, downloader=None):
1361 InfoExtractor.__init__(self, downloader)
1365 return (re.match(YahooIE._VALID_URL, url) is not None)
1367 def report_download_webpage(self, video_id):
1368 """Report webpage download."""
1369 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1371 def report_extraction(self, video_id):
1372 """Report information extraction."""
1373 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
# No site-wide initialization needed.
1375 def _real_initialize(self):
# new_video=False marks the recursive rewrite call so downloads are not
# double-counted.
1378 def _real_extract(self, url, new_video=True):
1379 # Extract ID from URL
1380 mobj = re.match(self._VALID_URL, url)
1382 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1385 # At this point we have a new video
1386 self._downloader.increment_downloads()
1387 video_id = mobj.group(2)
1388 video_extension = 'flv'
1390 # Rewrite valid but non-extractable URLs as
1391 # extractable English language /watch/ URLs
1392 if re.match(self._VPAGE_URL, url) is None:
1393 request = urllib2.Request(url)
1395 webpage = urllib2.urlopen(request).read()
1396 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1397 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1400 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1402 self._downloader.trouble(u'ERROR: Unable to extract id field')
1404 yahoo_id = mobj.group(1)
1406 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1408 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1410 yahoo_vid = mobj.group(1)
1412 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1413 return self._real_extract(url, new_video=False)
1415 # Retrieve video webpage to extract further information
1416 request = urllib2.Request(url)
1418 self.report_download_webpage(video_id)
1419 webpage = urllib2.urlopen(request).read()
1420 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1421 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1424 # Extract uploader and title from webpage
1425 self.report_extraction(video_id)
1426 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1428 self._downloader.trouble(u'ERROR: unable to extract video title')
1430 video_title = mobj.group(1).decode('utf-8')
1431 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1433 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1435 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) of the regex above is the literal '(people|profile)'
# path segment, not the uploader name -- the display name is group(2).
# This looks like a bug (uploader comes out as 'people'/'profile'); confirm
# against the live page format before changing.
1437 video_uploader = mobj.group(1).decode('utf-8')
1439 # Extract video thumbnail
1440 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1442 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1444 video_thumbnail = mobj.group(1).decode('utf-8')
1446 # Extract video description
1447 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1449 self._downloader.trouble(u'ERROR: unable to extract video description')
1451 video_description = mobj.group(1).decode('utf-8')
1452 if not video_description: video_description = 'No description available.'
1454 # Extract video height and width
1455 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1457 self._downloader.trouble(u'ERROR: unable to extract video height')
1459 yv_video_height = mobj.group(1)
1461 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1463 self._downloader.trouble(u'ERROR: unable to extract video width')
1465 yv_video_width = mobj.group(1)
1467 # Retrieve video playlist to extract media URL
1468 # I'm not completely sure what all these options are, but we
1469 # seem to need most of them, otherwise the server sends a 401.
1470 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1471 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1472 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1473 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1474 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1476 self.report_download_webpage(video_id)
1477 webpage = urllib2.urlopen(request).read()
1478 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1479 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1482 # Extract media URL from playlist XML
1483 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1485 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1487 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Decode any residual HTML entities in the playlist URL.
1488 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1491 # Process video information
1492 self._downloader.process_info({
1493 'id': video_id.decode('utf-8'),
1495 'uploader': video_uploader,
1496 'title': video_title,
1497 'stitle': simple_title,
1498 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' are given twice in this dict
# literal; the later (un-decoded) entries silently overwrite the earlier
# .decode('utf-8') ones. Likely a merge artifact -- one pair should go.
1499 'thumbnail': video_thumbnail.decode('utf-8'),
1500 'description': video_description,
1501 'thumbnail': video_thumbnail,
1502 'description': video_description,
1505 except UnavailableVideoError:
1506 self._downloader.trouble(u'ERROR: unable to download video')
# Last-resort extractor: fetches an arbitrary page and looks for a direct
# media URL via JW-Player-style 'file='/'source=' patterns. The uploader is
# approximated by the domain name. (Listing gaps: try:/return/guard lines
# omitted from this view.)
1509 class GenericIE(InfoExtractor):
1510 """Generic last-resort information extractor."""
1512 def __init__(self, downloader=None):
1513 InfoExtractor.__init__(self, downloader)
1519 def report_download_webpage(self, video_id):
1520 """Report webpage download."""
# Warn first: reaching this extractor means no site-specific IE matched.
1521 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1522 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1524 def report_extraction(self, video_id):
1525 """Report information extraction."""
1526 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
# No site-wide initialization needed.
1528 def _real_initialize(self):
1531 def _real_extract(self, url):
1532 # At this point we have a new video
1533 self._downloader.increment_downloads()
1535 video_id = url.split('/')[-1]
1536 request = urllib2.Request(url)
1538 self.report_download_webpage(video_id)
1539 webpage = urllib2.urlopen(request).read()
1540 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1541 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1543 except ValueError, err:
1544 # since this is the last-resort InfoExtractor, if
1545 # this error is thrown, it'll be thrown here
1546 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1549 # Start with something easy: JW Player in SWFObject
1550 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1552 # Broaden the search a little bit
1553 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1555 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1558 # It's possible that one of the regexes
1559 # matched, but returned an empty group:
1560 if mobj.group(1) is None:
1561 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1564 video_url = urllib.unquote(mobj.group(1))
1565 video_id = os.path.basename(video_url)
# Split "name.ext" into extension (without the dot) and bare id.
1567 # here's a fun little line of code for you:
1568 video_extension = os.path.splitext(video_id)[1][1:]
1569 video_id = os.path.splitext(video_id)[0]
1571 # it's tempting to parse this further, but you would
1572 # have to take into account all the variations like
1573 # Video Title - Site Name
1574 # Site Name | Video Title
1575 # Video Title - Tagline | Site Name
1576 # and so on and so forth; it's just not practical
1577 mobj = re.search(r'<title>(.*)</title>', webpage)
1579 self._downloader.trouble(u'ERROR: unable to extract title')
1581 video_title = mobj.group(1).decode('utf-8')
1582 video_title = sanitize_title(video_title)
1583 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1585 # video uploader is domain name
1586 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this failure path is about the uploader/domain, but the
# message below says "unable to extract title" -- copy-paste error.
1588 self._downloader.trouble(u'ERROR: unable to extract title')
1590 video_uploader = mobj.group(1).decode('utf-8')
1593 # Process video information
1594 self._downloader.process_info({
1595 'id': video_id.decode('utf-8'),
1596 'url': video_url.decode('utf-8'),
1597 'uploader': video_uploader,
1598 'title': video_title,
1599 'stitle': simple_title,
1600 'ext': video_extension.decode('utf-8'),
1604 except UnavailableVideoError, err:
1605 self._downloader.trouble(u'ERROR: unable to download video')
# Handles "ytsearch[N|all]:<query>" pseudo-URLs: pages through YouTube
# result pages, collects up to N video ids, and delegates each to the
# YouTube extractor. (Listing gaps: return/guard lines and the pagenum/
# video_ids initializers are omitted from this view.)
1608 class YoutubeSearchIE(InfoExtractor):
1609 """Information Extractor for YouTube search queries."""
1610 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1611 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1612 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1613 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on results; 'ytsearchall' is clamped to this.
1615 _max_youtube_results = 1000
1617 def __init__(self, youtube_ie, downloader=None):
1618 InfoExtractor.__init__(self, downloader)
1619 self._youtube_ie = youtube_ie
1623 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1625 def report_download_page(self, query, pagenum):
1626 """Report attempt to download playlist page with given number."""
1627 query = query.decode(preferredencoding())
1628 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1630 def _real_initialize(self):
1631 self._youtube_ie.initialize()
1633 def _real_extract(self, query):
1634 mobj = re.match(self._VALID_QUERY, query)
1636 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Prefix is '', a number, or 'all'; it selects how many results to fetch.
1639 prefix, query = query.split(':')
1641 query = query.encode('utf-8')
1643 self._download_n_results(query, 1)
1645 elif prefix == 'all':
1646 self._download_n_results(query, self._max_youtube_results)
1652 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1654 elif n > self._max_youtube_results:
1655 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1656 n = self._max_youtube_results
1657 self._download_n_results(query, n)
1659 except ValueError: # parsing prefix as integer fails
1660 self._download_n_results(query, 1)
1663 def _download_n_results(self, query, n):
1664 """Downloads a specified number of results for a query"""
1667 already_seen = set()
1671 self.report_download_page(query, pagenum)
1672 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1673 request = urllib2.Request(result_url, None, std_headers)
1675 page = urllib2.urlopen(request).read()
1676 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1677 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1680 # Extract video identifiers
1681 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# NOTE(review): id parsing below slices the raw href match
# ('href="/watch?v=<id>"') on '=' and drops the trailing quote;
# fragile -- any extra query parameter in the href would break it.
1682 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1683 if video_id not in already_seen:
1684 video_ids.append(video_id)
1685 already_seen.add(video_id)
1686 if len(video_ids) == n:
1687 # Specified n videos reached
1688 for id in video_ids:
1689 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link means the last results page; flush what we collected.
1692 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1693 for id in video_ids:
1694 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1697 pagenum = pagenum + 1
# Handles "gvsearch[N|all]:<query>" pseudo-URLs for Google Video search;
# same page-walking structure as YoutubeSearchIE, delegating to GoogleIE.
# (Listing gaps: return/guard lines and loop initializers omitted.)
1699 class GoogleSearchIE(InfoExtractor):
1700 """Information Extractor for Google Video search queries."""
1701 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1702 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1703 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1704 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
# Hard cap on results; 'gvsearchall' is clamped to this.
1706 _max_google_results = 1000
1708 def __init__(self, google_ie, downloader=None):
1709 InfoExtractor.__init__(self, downloader)
1710 self._google_ie = google_ie
1714 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1716 def report_download_page(self, query, pagenum):
1717 """Report attempt to download playlist page with given number."""
1718 query = query.decode(preferredencoding())
1719 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1721 def _real_initialize(self):
1722 self._google_ie.initialize()
1724 def _real_extract(self, query):
1725 mobj = re.match(self._VALID_QUERY, query)
1727 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1730 prefix, query = query.split(':')
1732 query = query.encode('utf-8')
1734 self._download_n_results(query, 1)
1736 elif prefix == 'all':
1737 self._download_n_results(query, self._max_google_results)
1743 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1745 elif n > self._max_google_results:
1746 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1747 n = self._max_google_results
1748 self._download_n_results(query, n)
1750 except ValueError: # parsing prefix as integer fails
1751 self._download_n_results(query, 1)
1754 def _download_n_results(self, query, n):
1755 """Downloads a specified number of results for a query"""
1758 already_seen = set()
1762 self.report_download_page(query, pagenum)
# Note: this template's second slot is a 'start=' result offset, not a
# page number as in the YouTube search template.
1763 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1764 request = urllib2.Request(result_url, None, std_headers)
1766 page = urllib2.urlopen(request).read()
1767 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1768 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1771 # Extract video identifiers
1772 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1773 video_id = mobj.group(1)
1774 if video_id not in already_seen:
1775 video_ids.append(video_id)
1776 already_seen.add(video_id)
1777 if len(video_ids) == n:
1778 # Specified n videos reached
1779 for id in video_ids:
1780 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker means the final page; flush collected ids.
1783 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1784 for id in video_ids:
1785 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1788 pagenum = pagenum + 1
# Handles "yvsearch[N|all]:<query>" pseudo-URLs for Yahoo! Video search;
# same page-walking structure as the other *SearchIE classes, delegating
# each hit to YahooIE. (Listing gaps: return/guard lines and loop
# initializers omitted from this view.)
1790 class YahooSearchIE(InfoExtractor):
1791 """Information Extractor for Yahoo! Video search queries."""
1792 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1793 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1794 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1795 _MORE_PAGES_INDICATOR = r'\s*Next'
# Hard cap on results; 'yvsearchall' is clamped to this.
1797 _max_yahoo_results = 1000
1799 def __init__(self, yahoo_ie, downloader=None):
1800 InfoExtractor.__init__(self, downloader)
1801 self._yahoo_ie = yahoo_ie
1805 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1807 def report_download_page(self, query, pagenum):
1808 """Report attempt to download playlist page with given number."""
1809 query = query.decode(preferredencoding())
1810 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1812 def _real_initialize(self):
1813 self._yahoo_ie.initialize()
1815 def _real_extract(self, query):
1816 mobj = re.match(self._VALID_QUERY, query)
1818 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1821 prefix, query = query.split(':')
1823 query = query.encode('utf-8')
1825 self._download_n_results(query, 1)
1827 elif prefix == 'all':
1828 self._download_n_results(query, self._max_yahoo_results)
1834 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1836 elif n > self._max_yahoo_results:
1837 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1838 n = self._max_yahoo_results
1839 self._download_n_results(query, n)
1841 except ValueError: # parsing prefix as integer fails
1842 self._download_n_results(query, 1)
1845 def _download_n_results(self, query, n):
1846 """Downloads a specified number of results for a query"""
1849 already_seen = set()
1853 self.report_download_page(query, pagenum)
1854 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1855 request = urllib2.Request(result_url, None, std_headers)
1857 page = urllib2.urlopen(request).read()
1858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1859 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1862 # Extract video identifiers
1863 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# The captured id is the '<group>/<vid>' pair from the /watch/ href.
1864 video_id = mobj.group(1)
1865 if video_id not in already_seen:
1866 video_ids.append(video_id)
1867 already_seen.add(video_id)
1868 if len(video_ids) == n:
1869 # Specified n videos reached
1870 for id in video_ids:
1871 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker means the final page; flush collected ids.
1874 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1875 for id in video_ids:
1876 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1879 pagenum = pagenum + 1
# Extractor for YouTube playlist / my_playlists / user-list URLs: walks the
# view_play_list pages until no "Next" link remains, collecting video ids,
# then delegates each to the YouTube extractor. (Listing gaps: return/guard
# lines and the pagenum/video_ids/ids_in_page initializers are omitted.)
1881 class YoutubePlaylistIE(InfoExtractor):
1882 """Information Extractor for YouTube playlists."""
1884 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1885 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1886 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1887 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1890 def __init__(self, youtube_ie, downloader=None):
1891 InfoExtractor.__init__(self, downloader)
1892 self._youtube_ie = youtube_ie
1896 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1898 def report_download_page(self, playlist_id, pagenum):
1899 """Report attempt to download playlist page with given number."""
1900 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1902 def _real_initialize(self):
1903 self._youtube_ie.initialize()
1905 def _real_extract(self, url):
1906 # Extract playlist id
1907 mobj = re.match(self._VALID_URL, url)
1909 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1912 # Download playlist pages
1913 playlist_id = mobj.group(1)
1918 self.report_download_page(playlist_id, pagenum)
1919 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1921 page = urllib2.urlopen(request).read()
1922 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1923 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1926 # Extract video identifiers
# ids_in_page de-duplicates within one page; video_ids accumulates
# across pages (both initialized on lines not visible in this listing).
1928 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1929 if mobj.group(1) not in ids_in_page:
1930 ids_in_page.append(mobj.group(1))
1931 video_ids.extend(ids_in_page)
# Stop paging once the playlist has no "Next" link.
1933 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1935 pagenum = pagenum + 1
1937 for id in video_ids:
1938 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Extractor for youtube.com/user/<name> pages via the GData user feed;
# collects the video ids from the feed and delegates each to the YouTube
# extractor. Note: a single feed fetch, no pagination -- the original
# author flagged _VIDEO_INDICATOR with "XXX Fix this." below. (Listing
# gaps: return/guard lines and the ids_in_page/video_ids initializers
# are omitted from this view.)
1941 class YoutubeUserIE(InfoExtractor):
1942 """Information Extractor for YouTube users."""
1944 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1945 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1946 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1949 def __init__(self, youtube_ie, downloader=None):
1950 InfoExtractor.__init__(self, downloader)
1951 self._youtube_ie = youtube_ie
1955 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1957 def report_download_page(self, username):
1958 """Report attempt to download user page."""
1959 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1961 def _real_initialize(self):
1962 self._youtube_ie.initialize()
1964 def _real_extract(self, url):
1966 mobj = re.match(self._VALID_URL, url)
1968 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1971 # Download user page
1972 username = mobj.group(1)
1976 self.report_download_page(username)
1977 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1979 page = urllib2.urlopen(request).read()
1980 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1981 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1984 # Extract video identifiers
1987 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1988 if mobj.group(1) not in ids_in_page:
1989 ids_in_page.append(mobj.group(1))
1990 video_ids.extend(ids_in_page)
1992 for id in video_ids:
1993 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Base class for post-download processing steps; concrete subclasses
# override run(). The default run() passes the info dict through unchanged.
1996 class PostProcessor(object):
1997 """Post Processor class.
1999 PostProcessor objects can be added to downloaders with their
2000 add_post_processor() method. When the downloader has finished a
2001 successful download, it will take its internal chain of PostProcessors
2002 and start calling the run() method on each one of them, first with
2003 an initial argument and then with the returned value of the previous
2006 The chain will be stopped if one of them ever returns None or the end
2007 of the chain is reached.
2009 PostProcessor objects follow a "mutual registration" process similar
2010 to InfoExtractor objects.
2015 def __init__(self, downloader=None):
2016 self._downloader = downloader
2018 def set_downloader(self, downloader):
2019 """Sets the downloader for this PP."""
2020 self._downloader = downloader
2022 def run(self, information):
2023 """Run the PostProcessor.
2025 The "information" argument is a dictionary like the ones
2026 composed by InfoExtractors. The only difference is that this
2027 one has an extra field called "filepath" that points to the
2030 When this method returns None, the postprocessing chain is
2031 stopped. However, this method may return an information
2032 dictionary that will be passed to the next postprocessing
2033 object in the chain. It can be the one it received after
2034 changing some fields.
2036 In addition, this method may raise a PostProcessingError
2037 exception that will be taken into account by the downloader
2040 return information # by default, do nothing
2042 ### MAIN PROGRAM ###
# Everything below only runs when the file is executed as a script, not
# when it is imported as a module.
2043 if __name__ == '__main__':
2045 # Modules needed only when running the main program
2049 # Function to update the program file with the latest version from bitbucket.org
def update_self(downloader, filename):
	"""Overwrite this program's file with the latest stable version.

	downloader is used only for its output options (to_stdout);
	filename is the path of the running script to replace. Exits the
	process via sys.exit() if the file is not writable.
	"""
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_stdout('Updating to latest stable version...')
	latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
	latest_version = urllib.urlopen(latest_url).read().strip()
	prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
	newcontent = urllib.urlopen(prog_url).read()
	# Close the file even if the write fails part-way, so the handle is
	# not leaked and whatever was written is flushed to disk.
	stream = open(filename, 'w')
	try:
		stream.write(newcontent)
	finally:
		stream.close()
	downloader.to_stdout('Updated to version %s' % latest_version)
# General configuration
# Build ONE global opener carrying both handlers: consecutive
# urllib2.install_opener() calls each replace the previously installed
# opener, so installing a ProxyHandler opener and then a separate cookie
# opener leaves only the cookie opener active. Passing both handlers to
# a single build_opener() keeps proxy AND cookie support installed.
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2070 # Parse command line
# Command-line interface definition.
# NOTE(review): the closing ")" of this OptionParser(...) call is among
# the lines elided from this dump.
2071 parser = optparse.OptionParser(
2072 usage='Usage: %prog [options] url...',
2073 version='2010.07.24',
2074 conflict_handler='resolve',
# General options.
2077 parser.add_option('-h', '--help',
2078 action='help', help='print this help text and exit')
2079 parser.add_option('-v', '--version',
2080 action='version', help='print program version and exit')
2081 parser.add_option('-U', '--update',
2082 action='store_true', dest='update_self', help='update this program to latest stable version')
2083 parser.add_option('-i', '--ignore-errors',
2084 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2085 parser.add_option('-r', '--rate-limit',
2086 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2087 parser.add_option('-R', '--retries',
2088 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
# Account/login options.
2090 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2091 authentication.add_option('-u', '--username',
2092 dest='username', metavar='USERNAME', help='account username')
2093 authentication.add_option('-p', '--password',
2094 dest='password', metavar='PASSWORD', help='account password')
2095 authentication.add_option('-n', '--netrc',
2096 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2097 parser.add_option_group(authentication)
# Format selection: -m and --all-formats are aliases that store fixed
# values into the same dest ('format') via store_const.
2099 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2100 video_format.add_option('-f', '--format',
2101 action='store', dest='format', metavar='FORMAT', help='video format code')
2102 video_format.add_option('-m', '--mobile-version',
2103 action='store_const', dest='format', help='alias for -f 17', const='17')
2104 video_format.add_option('--all-formats',
2105 action='store_const', dest='format', help='download all available video formats', const='-1')
2106 video_format.add_option('--max-quality',
2107 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2108 parser.add_option_group(video_format)
# Quiet/simulation options; each --get-* flag implies quiet + simulate
# (enforced later when the FileDownloader configuration is built).
2110 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2111 verbosity.add_option('-q', '--quiet',
2112 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2113 verbosity.add_option('-s', '--simulate',
2114 action='store_true', dest='simulate', help='do not download video', default=False)
2115 verbosity.add_option('-g', '--get-url',
2116 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2117 verbosity.add_option('-e', '--get-title',
2118 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2119 verbosity.add_option('--get-thumbnail',
2120 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2121 verbosity.add_option('--get-description',
2122 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2123 verbosity.add_option('--no-progress',
2124 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2125 parser.add_option_group(verbosity)
# Output naming / batch input / overwrite / resume options.
2127 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2128 filesystem.add_option('-t', '--title',
2129 action='store_true', dest='usetitle', help='use title in file name', default=False)
2130 filesystem.add_option('-l', '--literal',
2131 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2132 filesystem.add_option('-o', '--output',
2133 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2134 filesystem.add_option('-a', '--batch-file',
2135 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2136 filesystem.add_option('-w', '--no-overwrites',
2137 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2138 filesystem.add_option('-c', '--continue',
2139 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2140 parser.add_option_group(filesystem)
2142 (opts, args) = parser.parse_args()
2144 # Batch file verification
# Read extra URLs from the batch file (or stdin when the name is '-').
# NOTE(review): the stdin branch, the initial batchurls assignment, the
# try:, and the except that reaches the sys.exit below are all elided
# from this dump — confirm upstream.
2146 if opts.batchfile is not None:
2148 if opts.batchfile == '-':
2151 batchfd = open(opts.batchfile, 'r')
2152 batchurls = batchfd.readlines()
# Strip surrounding whitespace and drop empty lines.
2153 batchurls = [x.strip() for x in batchurls]
2154 batchurls = [x for x in batchurls if len(x) > 0]
2156 sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs come first, then positional command-line arguments.
2157 all_urls = batchurls + args
2159 # Conflicting, missing and erroneous options
2160 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2161 parser.error(u'using .netrc conflicts with giving username/password')
2162 if opts.password is not None and opts.username is None:
2163 parser.error(u'account username missing')
2164 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2165 parser.error(u'using output template conflicts with using title or literal title')
2166 if opts.usetitle and opts.useliteral:
2167 parser.error(u'using title conflicts with using literal title')
# Prompt interactively (no echo) when a username was given but no
# password.
2168 if opts.username is not None and opts.password is None:
2169 opts.password = getpass.getpass(u'Type account password and press return:')
# Convert the human-friendly rate limit (e.g. "50k") to a numeric value.
2170 if opts.ratelimit is not None:
2171 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2172 if numeric_limit is None:
2173 parser.error(u'invalid rate limit specified')
2174 opts.ratelimit = numeric_limit
# Normalize --retries to an integer.
# NOTE(review): the try: matching this except is elided from this dump.
2175 if opts.retries is not None:
2177 opts.retries = long(opts.retries)
2178 except (TypeError, ValueError), err:
2179 parser.error(u'invalid retry count specified')
2181 # Information extractors
# Instantiate one extractor per supported site. The playlist/user/search
# extractors wrap a plain site extractor and delegate individual videos
# to it (hence the constructor arguments).
2182 youtube_ie = YoutubeIE()
2183 metacafe_ie = MetacafeIE(youtube_ie)
2184 dailymotion_ie = DailymotionIE()
2185 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2186 youtube_user_ie = YoutubeUserIE(youtube_ie)
2187 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2188 google_ie = GoogleIE()
2189 google_search_ie = GoogleSearchIE(google_ie)
2190 photobucket_ie = PhotobucketIE()
2191 yahoo_ie = YahooIE()
2192 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2193 generic_ie = GenericIE()
# File downloader: translate the parsed options into the FileDownloader
# configuration dictionary.
# NOTE(review): the closing "})" of this call is among the lines elided
# from this dump.
2196 fd = FileDownloader({
2197 'usenetrc': opts.usenetrc,
2198 'username': opts.username,
2199 'password': opts.password,
# Any --get-* flag implies quiet mode...
2200 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2201 'forceurl': opts.geturl,
2202 'forcetitle': opts.gettitle,
2203 'forcethumbnail': opts.getthumbnail,
2204 'forcedescription': opts.getdescription,
# ...and also implies simulation (no actual download).
2205 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2206 'format': opts.format,
2207 'format_limit': opts.format_limit,
# Output template: an explicit -o wins; otherwise the and/or chain picks
# the first default template whose conditions are all truthy — including
# the format code when downloading all formats ('-1'), and a title-based
# name when -t/-l was given (%(stitle)s presumably being the sanitized
# title — see sanitize_title at the top of the file).
2208 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2209 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2210 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2211 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2212 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2213 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2214 or u'%(id)s.%(ext)s'),
2215 'ignoreerrors': opts.ignoreerrors,
2216 'ratelimit': opts.ratelimit,
2217 'nooverwrites': opts.nooverwrites,
2218 'retries': opts.retries,
2219 'continuedl': opts.continue_dl,
2220 'noprogress': opts.noprogress,
# Register the extractors with the downloader. The search/playlist/user
# wrappers are registered before the plain site extractors they delegate
# to; registration order matters — the generic fallback must come last
# (see the comment below).
2222 fd.add_info_extractor(youtube_search_ie)
2223 fd.add_info_extractor(youtube_pl_ie)
2224 fd.add_info_extractor(youtube_user_ie)
2225 fd.add_info_extractor(metacafe_ie)
2226 fd.add_info_extractor(dailymotion_ie)
2227 fd.add_info_extractor(youtube_ie)
2228 fd.add_info_extractor(google_ie)
2229 fd.add_info_extractor(google_search_ie)
2230 fd.add_info_extractor(photobucket_ie)
2231 fd.add_info_extractor(yahoo_ie)
2232 fd.add_info_extractor(yahoo_search_ie)
2234 # This must come last since it's the
2235 # fallback if none of the others work
2236 fd.add_info_extractor(generic_ie)
# Self-update when requested with -U/--update.
2239 if opts.update_self:
2240 update_self(fd, sys.argv[0])
# Having no URLs is only an error when we weren't asked to self-update.
2243 if len(all_urls) < 1:
2244 if not opts.update_self:
2245 parser.error(u'you must provide at least one URL')
# Run the downloads and exit with an appropriate status.
# NOTE(review): the try: matching these excepts, the handler body under
# DownloadError, and the final exit with retcode are elided from this
# dump — confirm upstream.
2248 retcode = fd.download(all_urls)
2251 except DownloadError:
2253 except SameFileError:
2254 sys.exit(u'ERROR: fixed output name but more than one file to download')
2255 except KeyboardInterrupt:
2256 sys.exit(u'\nERROR: Interrupted by user')