youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # License: Public domain code
   7 import htmlentitydefs
   8 import httplib
   9 import locale
  10 import math
  11 import netrc
  12 import os
  13 import os.path
  14 import re
  15 import socket
  16 import string
  17 import subprocess
  18 import sys
  19 import time
  20 import urllib
  21 import urllib2
  22
  23 # parse_qs was moved from the cgi module to the urlparse module recently.
  24 try:
  25         from urlparse import parse_qs
  26 except ImportError:
  27         from cgi import parse_qs
  28
# Default HTTP headers attached to the urllib2 requests built by
# FileDownloader (verify_url and _do_download); they present the client
# as a desktop Firefox 3.6 browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string made of the ASCII letters followed by the ASCII digits.
# NOTE(review): presumably the whitelist used to build "simplified" titles;
# the code using it is not visible in this part of the file -- confirm.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  37
  38 def preferredencoding():
  39         """Get preferred encoding.
  40
  41         Returns the best encoding scheme for the system, based on
  42         locale.getpreferredencoding() and some further tweaks.
  43         """
  44         def yield_preferredencoding():
  45                 try:
  46                         pref = locale.getpreferredencoding()
  47                         u'TEST'.encode(pref)
  48                 except:
  49                         pref = 'UTF-8'
  50                 while True:
  51                         yield pref
  52         return yield_preferredencoding().next()
  53
  54 def htmlentity_transform(matchobj):
  55         """Transforms an HTML entity to a Unicode character.
  56
  57         This function receives a match object and is intended to be used with
  58         the re.sub() function.
  59         """
  60         entity = matchobj.group(1)
  61
  62         # Known non-numeric HTML entity
  63         if entity in htmlentitydefs.name2codepoint:
  64                 return unichr(htmlentitydefs.name2codepoint[entity])
  65
  66         # Unicode character
  67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  68         if mobj is not None:
  69                 numstr = mobj.group(1)
  70                 if numstr.startswith(u'x'):
  71                         base = 16
  72                         numstr = u'0%s' % numstr
  73                 else:
  74                         base = 10
  75                 return unichr(long(numstr, base))
  76
  77         # Unknown entity in name, return its literal representation
  78         return (u'&%s;' % entity)
  79
  80 def sanitize_title(utitle):
  81         """Sanitizes a video title so it could be used as part of a filename."""
  82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  83         return utitle.replace(unicode(os.sep), u'%')
  84
  85 def sanitize_open(filename, open_mode):
  86         """Try to open the given filename, and slightly tweak it if this fails.
  87
  88         Attempts to open the given filename. If this fails, it tries to change
  89         the filename slightly, step by step, until it's either able to open it
  90         or it fails and raises a final exception, like the standard open()
  91         function.
  92
  93         It returns the tuple (stream, definitive_file_name).
  94         """
  95         try:
  96                 if filename == u'-':
  97                         return (sys.stdout, filename)
  98                 stream = open(filename, open_mode)
  99                 return (stream, filename)
 100         except (IOError, OSError), err:
 101                 # In case of error, try to remove win32 forbidden chars
 102                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
 103
 104                 # An exception here should be caught in the caller
 105                 stream = open(filename, open_mode)
 106                 return (stream, filename)
 107
 108
 109 class DownloadError(Exception):
 110         """Download Error exception.
 111
 112         This exception may be thrown by FileDownloader objects if they are not
 113         configured to continue on errors. They will contain the appropriate
 114         error message.
 115         """
 116         pass
 117
 118 class SameFileError(Exception):
 119         """Same File exception.
 120
 121         This exception will be thrown by FileDownloader objects if they detect
 122         multiple files would have to be downloaded to the same file on disk.
 123         """
 124         pass
 125
 126 class PostProcessingError(Exception):
 127         """Post Processing exception.
 128
 129         This exception may be raised by PostProcessor's .run() method to
 130         indicate an error in the postprocessing task.
 131         """
 132         pass
 133
 134 class UnavailableFormatError(Exception):
 135         """Unavailable Format exception.
 136
 137         This exception will be thrown when a video is requested
 138         in a format that is not available for that video.
 139         """
 140         pass
 141
 142 class ContentTooShortError(Exception):
 143         """Content Too Short exception.
 144
 145         This exception may be raised by FileDownloader objects when a file they
 146         download is too small for what the server announced first, indicating
 147         the connection was probably interrupted.
 148         """
 149         # Both in bytes
 150         downloaded = None
 151         expected = None
 152
 153         def __init__(self, downloaded, expected):
 154                 self.downloaded = downloaded
 155                 self.expected = expected
 156
 157 class FileDownloader(object):
 158         """File Downloader class.
 159
        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information (that is the InfoExtractors'
        task), so it has to pass the URL to one of them.
 166
        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.
 174
 175         File downloaders accept a lot of parameters. In order not to saturate
 176         the object constructor with arguments, it receives a dictionary of
 177         options instead. These options are available through the params
 178         attribute for the InfoExtractors to use. The FileDownloader also
 179         registers itself as the downloader in charge for the InfoExtractors
 180         that are added to it, so this is a "mutual registration".
 181
 182         Available options:
 183
 184         username:       Username for authentication purposes.
 185         password:       Password for authentication purposes.
 186         usenetrc:       Use netrc for authentication instead.
 187         quiet:          Do not print messages to stdout.
 188         forceurl:       Force printing final URL.
 189         forcetitle:     Force printing title.
 190         simulate:       Do not download the video files.
 191         format:         Video format code.
 192         outtmpl:        Template for output names.
 193         ignoreerrors:   Do not stop on download errors.
 194         ratelimit:      Download speed limit, in bytes/sec.
 195         nooverwrites:   Prevent overwriting files.
 196         continuedl:     Try to continue downloads if possible.
 197         noprogress:     Do not print the progress bar.
 198         """
 199
 200         params = None
 201         _ies = []
 202         _pps = []
 203         _download_retcode = None
 204
 205         def __init__(self, params):
 206                 """Create a FileDownloader object with the given options."""
 207                 self._ies = []
 208                 self._pps = []
 209                 self._download_retcode = 0
 210                 self.params = params
 211
 212         @staticmethod
 213         def pmkdir(filename):
 214                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 215                 components = filename.split(os.sep)
 216                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 217                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 218                 for dir in aggregate:
 219                         if not os.path.exists(dir):
 220                                 os.mkdir(dir)
 221
 222         @staticmethod
 223         def format_bytes(bytes):
 224                 if bytes is None:
 225                         return 'N/A'
 226                 if type(bytes) is str:
 227                         bytes = float(bytes)
 228                 if bytes == 0.0:
 229                         exponent = 0
 230                 else:
 231                         exponent = long(math.log(bytes, 1024.0))
 232                 suffix = 'bkMGTPEZY'[exponent]
 233                 converted = float(bytes) / float(1024**exponent)
 234                 return '%.2f%s' % (converted, suffix)
 235
 236         @staticmethod
 237         def calc_percent(byte_counter, data_len):
 238                 if data_len is None:
 239                         return '---.-%'
 240                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 241
 242         @staticmethod
 243         def calc_eta(start, now, total, current):
 244                 if total is None:
 245                         return '--:--'
 246                 dif = now - start
 247                 if current == 0 or dif < 0.001: # One millisecond
 248                         return '--:--'
 249                 rate = float(current) / dif
 250                 eta = long((float(total) - float(current)) / rate)
 251                 (eta_mins, eta_secs) = divmod(eta, 60)
 252                 if eta_mins > 99:
 253                         return '--:--'
 254                 return '%02d:%02d' % (eta_mins, eta_secs)
 255
 256         @staticmethod
 257         def calc_speed(start, now, bytes):
 258                 dif = now - start
 259                 if bytes == 0 or dif < 0.001: # One millisecond
 260                         return '%10s' % '---b/s'
 261                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 262
 263         @staticmethod
 264         def best_block_size(elapsed_time, bytes):
 265                 new_min = max(bytes / 2.0, 1.0)
 266                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 267                 if elapsed_time < 0.001:
 268                         return long(new_max)
 269                 rate = bytes / elapsed_time
 270                 if rate > new_max:
 271                         return long(new_max)
 272                 if rate < new_min:
 273                         return long(new_min)
 274                 return long(rate)
 275
 276         @staticmethod
 277         def parse_bytes(bytestr):
 278                 """Parse a string indicating a byte quantity into a long integer."""
 279                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 280                 if matchobj is None:
 281                         return None
 282                 number = float(matchobj.group(1))
 283                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 284                 return long(round(number * multiplier))
 285
 286         @staticmethod
 287         def verify_url(url):
 288                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
 289                 request = urllib2.Request(url, None, std_headers)
 290                 data = urllib2.urlopen(request)
 291                 data.read(1)
 292                 url = data.geturl()
 293                 data.close()
 294                 return url
 295
 296         def add_info_extractor(self, ie):
 297                 """Add an InfoExtractor object to the end of the list."""
 298                 self._ies.append(ie)
 299                 ie.set_downloader(self)
 300
 301         def add_post_processor(self, pp):
 302                 """Add a PostProcessor object to the end of the chain."""
 303                 self._pps.append(pp)
 304                 pp.set_downloader(self)
 305
 306         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
 307                 """Print message to stdout if not in quiet mode."""
 308                 try:
 309                         if not self.params.get('quiet', False):
 310                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
 311                         sys.stdout.flush()
 312                 except (UnicodeEncodeError), err:
 313                         if not ignore_encoding_errors:
 314                                 raise
 315
 316         def to_stderr(self, message):
 317                 """Print message to stderr."""
 318                 print >>sys.stderr, message.encode(preferredencoding())
 319
 320         def fixed_template(self):
 321                 """Checks if the output template is fixed."""
 322                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 323
 324         def trouble(self, message=None):
 325                 """Determine action to take when a download problem appears.
 326
 327                 Depending on if the downloader has been configured to ignore
 328                 download errors or not, this method may throw an exception or
 329                 not when errors are found, after printing the message.
 330                 """
 331                 if message is not None:
 332                         self.to_stderr(message)
 333                 if not self.params.get('ignoreerrors', False):
 334                         raise DownloadError(message)
 335                 self._download_retcode = 1
 336
 337         def slow_down(self, start_time, byte_counter):
 338                 """Sleep if the download speed is over the rate limit."""
 339                 rate_limit = self.params.get('ratelimit', None)
 340                 if rate_limit is None or byte_counter == 0:
 341                         return
 342                 now = time.time()
 343                 elapsed = now - start_time
 344                 if elapsed <= 0.0:
 345                         return
 346                 speed = float(byte_counter) / elapsed
 347                 if speed > rate_limit:
 348                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 349
 350         def report_destination(self, filename):
 351                 """Report destination filename."""
 352                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 353
 354         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 355                 """Report download progress."""
 356                 if self.params.get('noprogress', False):
 357                         return
 358                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 359                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 360
 361         def report_resuming_byte(self, resume_len):
 362                 """Report attemtp to resume at given byte."""
 363                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
 364
 365         def report_file_already_downloaded(self, file_name):
 366                 """Report file has already been fully downloaded."""
 367                 try:
 368                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
 369                 except (UnicodeEncodeError), err:
 370                         self.to_stdout(u'[download] The file has already been downloaded')
 371
 372         def report_unable_to_resume(self):
 373                 """Report it was impossible to resume download."""
 374                 self.to_stdout(u'[download] Unable to resume')
 375
 376         def report_finish(self):
 377                 """Report download finished."""
 378                 if self.params.get('noprogress', False):
 379                         self.to_stdout(u'[download] Download completed')
 380                 else:
 381                         self.to_stdout(u'')
 382
 383         def process_info(self, info_dict):
 384                 """Process a single dictionary returned by an InfoExtractor."""
 385                 # Do nothing else if in simulate mode
 386                 if self.params.get('simulate', False):
 387                         # Verify URL if it's an HTTP one
 388                         if info_dict['url'].startswith('http'):
 389                                 try:
 390                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
 391                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
 392                                         raise UnavailableFormatError
 393
 394                         # Forced printings
 395                         if self.params.get('forcetitle', False):
 396                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 397                         if self.params.get('forceurl', False):
 398                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 399
 400                         return
 401
 402                 try:
 403                         template_dict = dict(info_dict)
 404                         template_dict['epoch'] = unicode(long(time.time()))
 405                         filename = self.params['outtmpl'] % template_dict
 406                 except (ValueError, KeyError), err:
 407                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 408                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 409                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
 410                         return
 411
 412                 try:
 413                         self.pmkdir(filename)
 414                 except (OSError, IOError), err:
 415                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 416                         return
 417
 418                 try:
 419                         success = self._do_download(filename, info_dict['url'].encode('utf-8'))
 420                 except (OSError, IOError), err:
 421                         raise UnavailableFormatError
 422                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 423                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 424                         return
 425                 except (ContentTooShortError, ), err:
 426                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 427                         return
 428
 429                 if success:
 430                         try:
 431                                 self.post_process(filename, info_dict)
 432                         except (PostProcessingError), err:
 433                                 self.trouble('ERROR: postprocessing: %s' % str(err))
 434                                 return
 435
 436         def download(self, url_list):
 437                 """Download a given list of URLs."""
 438                 if len(url_list) > 1 and self.fixed_template():
 439                         raise SameFileError(self.params['outtmpl'])
 440
 441                 for url in url_list:
 442                         suitable_found = False
 443                         for ie in self._ies:
 444                                 # Go to next InfoExtractor if not suitable
 445                                 if not ie.suitable(url):
 446                                         continue
 447
 448                                 # Suitable InfoExtractor found
 449                                 suitable_found = True
 450
 451                                 # Extract information from URL and process it
 452                                 ie.extract(url)
 453
 454                                 # Suitable InfoExtractor had been found; go to next URL
 455                                 break
 456
 457                         if not suitable_found:
 458                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 459
 460                 return self._download_retcode
 461
 462         def post_process(self, filename, ie_info):
 463                 """Run the postprocessing chain on the given file."""
 464                 info = dict(ie_info)
 465                 info['filepath'] = filename
 466                 for pp in self._pps:
 467                         info = pp.run(info)
 468                         if info is None:
 469                                 break
 470
 471         def _download_with_rtmpdump(self, filename, url):
 472                 self.report_destination(filename)
 473
 474                 # Check for rtmpdump first
 475                 try:
 476                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 477                 except (OSError, IOError):
 478                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 479                         return False
 480
 481                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 482                 # the connection was interrumpted and resuming appears to be
 483                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 484                 basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
 485                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 486                 while retval == 2 or retval == 1:
 487                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
 488                         time.sleep(2.0) # This seems to be needed
 489                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 490                 if retval == 0:
 491                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
 492                         return True
 493                 else:
 494                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
 495                         return False
 496
 497         def _do_download(self, filename, url):
 498                 # Attempt to download using rtmpdump
 499                 if url.startswith('rtmp'):
 500                         return self._download_with_rtmpdump(filename, url)
 501
 502                 stream = None
 503                 open_mode = 'wb'
 504                 basic_request = urllib2.Request(url, None, std_headers)
 505                 request = urllib2.Request(url, None, std_headers)
 506
 507                 # Establish possible resume length
 508                 if os.path.isfile(filename):
 509                         resume_len = os.path.getsize(filename)
 510                 else:
 511                         resume_len = 0
 512
 513                 # Request parameters in case of being able to resume
 514                 if self.params.get('continuedl', False) and resume_len != 0:
 515                         self.report_resuming_byte(resume_len)
 516                         request.add_header('Range','bytes=%d-' % resume_len)
 517                         open_mode = 'ab'
 518
 519                 # Establish connection
 520                 try:
 521                         data = urllib2.urlopen(request)
 522                 except (urllib2.HTTPError, ), err:
 523                         if err.code != 416: #  416 is 'Requested range not satisfiable'
 524                                 raise
 525                         # Unable to resume
 526                         data = urllib2.urlopen(basic_request)
 527                         content_length = data.info()['Content-Length']
 528
 529                         if content_length is not None and long(content_length) == resume_len:
 530                                 # Because the file had already been fully downloaded
 531                                 self.report_file_already_downloaded(filename)
 532                                 return True
 533                         else:
 534                                 # Because the server didn't let us
 535                                 self.report_unable_to_resume()
 536                                 open_mode = 'wb'
 537
 538                 data_len = data.info().get('Content-length', None)
 539                 data_len_str = self.format_bytes(data_len)
 540                 byte_counter = 0
 541                 block_size = 1024
 542                 start = time.time()
 543                 while True:
 544                         # Download and write
 545                         before = time.time()
 546                         data_block = data.read(block_size)
 547                         after = time.time()
 548                         data_block_len = len(data_block)
 549                         if data_block_len == 0:
 550                                 break
 551                         byte_counter += data_block_len
 552
 553                         # Open file just in time
 554                         if stream is None:
 555                                 try:
 556                                         (stream, filename) = sanitize_open(filename, open_mode)
 557                                         self.report_destination(filename)
 558                                 except (OSError, IOError), err:
 559                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 560                                         return False
 561                         stream.write(data_block)
 562                         block_size = self.best_block_size(after - before, data_block_len)
 563
 564                         # Progress message
 565                         percent_str = self.calc_percent(byte_counter, data_len)
 566                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 567                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 568                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 569
 570                         # Apply rate limit
 571                         self.slow_down(start, byte_counter)
 572
 573                 self.report_finish()
 574                 if data_len is not None and str(byte_counter) != data_len:
 575                         raise ContentTooShortError(byte_counter, long(data_len))
 576                 return True
 577
 578 class InfoExtractor(object):
 579         """Information Extractor class.
 580
 581         Information extractors are the classes that, given a URL, extract
 582         information from the video (or videos) the URL refers to. This
 583         information includes the real video URL, the video title and simplified
 584         title, author and others. The information is stored in a dictionary
 585         which is then passed to the FileDownloader. The FileDownloader
 586         processes this information possibly downloading the video to the file
 587         system, among other possible outcomes. The dictionaries must include
 588         the following fields:
 589
 590         id:             Video identifier.
 591         url:            Final video URL.
 592         uploader:       Nickname of the video uploader.
 593         title:          Literal title.
 594         stitle:         Simplified title.
 595         ext:            Video filename extension.
 596         format:         Video format.
 597
 598         Subclasses of this one should re-define the _real_initialize() and
 599         _real_extract() methods, as well as the suitable() static method.
 600         Probably, they should also be instantiated and added to the main
 601         downloader.
 602         """
 603
 604         _ready = False
 605         _downloader = None
 606
 607         def __init__(self, downloader=None):
 608                 """Constructor. Receives an optional downloader."""
 609                 self._ready = False
 610                 self.set_downloader(downloader)
 611
 612         @staticmethod
 613         def suitable(url):
 614                 """Receives a URL and returns True if suitable for this IE."""
 615                 return False
 616
 617         def initialize(self):
 618                 """Initializes an instance (authentication, etc)."""
 619                 if not self._ready:
 620                         self._real_initialize()
 621                         self._ready = True
 622
 623         def extract(self, url):
 624                 """Extracts URL information and returns it in list of dicts."""
 625                 self.initialize()
 626                 return self._real_extract(url)
 627
 628         def set_downloader(self, downloader):
 629                 """Sets the downloader for this IE."""
 630                 self._downloader = downloader
 631
 632         def _real_initialize(self):
 633                 """Real initialization process. Redefine in subclasses."""
 634                 pass
 635
 636         def _real_extract(self, url):
 637                 """Real extraction process. Redefine in subclasses."""
 638                 pass
 639
 640 class YoutubeIE(InfoExtractor):
 641         """Information extractor for youtube.com."""
 642
 643         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 644         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 645         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 646         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 647         _NETRC_MACHINE = 'youtube'
 648         _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
 649         _video_extensions = {
 650                 '13': '3gp',
 651                 '17': 'mp4',
 652                 '18': 'mp4',
 653                 '22': 'mp4',
 654                 '37': 'mp4',
 655         }
 656
 657         @staticmethod
 658         def suitable(url):
 659                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 660
 661         def report_lang(self):
 662                 """Report attempt to set language."""
 663                 self._downloader.to_stdout(u'[youtube] Setting language')
 664
 665         def report_login(self):
 666                 """Report attempt to log in."""
 667                 self._downloader.to_stdout(u'[youtube] Logging in')
 668
 669         def report_age_confirmation(self):
 670                 """Report attempt to confirm age."""
 671                 self._downloader.to_stdout(u'[youtube] Confirming age')
 672
 673         def report_video_info_webpage_download(self, video_id):
 674                 """Report attempt to download video info webpage."""
 675                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
 676
 677         def report_information_extraction(self, video_id):
 678                 """Report attempt to extract video information."""
 679                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 680
 681         def report_unavailable_format(self, video_id, format):
 682                 """Report extracted video URL."""
 683                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 684
 685         def report_rtmp_download(self):
 686                 """Indicate the download will use the RTMP protocol."""
 687                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
 688
 689         def _real_initialize(self):
 690                 if self._downloader is None:
 691                         return
 692
 693                 username = None
 694                 password = None
 695                 downloader_params = self._downloader.params
 696
 697                 # Attempt to use provided username and password or .netrc data
 698                 if downloader_params.get('username', None) is not None:
 699                         username = downloader_params['username']
 700                         password = downloader_params['password']
 701                 elif downloader_params.get('usenetrc', False):
 702                         try:
 703                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 704                                 if info is not None:
 705                                         username = info[0]
 706                                         password = info[2]
 707                                 else:
 708                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 709                         except (IOError, netrc.NetrcParseError), err:
 710                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 711                                 return
 712
 713                 # Set language
 714                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 715                 try:
 716                         self.report_lang()
 717                         urllib2.urlopen(request).read()
 718                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 719                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 720                         return
 721
 722                 # No authentication to be performed
 723                 if username is None:
 724                         return
 725
 726                 # Log in
 727                 login_form = {
 728                                 'current_form': 'loginForm',
 729                                 'next':         '/',
 730                                 'action_login': 'Log In',
 731                                 'username':     username,
 732                                 'password':     password,
 733                                 }
 734                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 735                 try:
 736                         self.report_login()
 737                         login_results = urllib2.urlopen(request).read()
 738                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 739                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 740                                 return
 741                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 742                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 743                         return
 744
 745                 # Confirm age
 746                 age_form = {
 747                                 'next_url':             '/',
 748                                 'action_confirm':       'Confirm',
 749                                 }
 750                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 751                 try:
 752                         self.report_age_confirmation()
 753                         age_results = urllib2.urlopen(request).read()
 754                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 755                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 756                         return
 757
 758         def _real_extract(self, url):
 759                 # Extract video id from URL
 760                 mobj = re.match(self._VALID_URL, url)
 761                 if mobj is None:
 762                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 763                         return
 764                 video_id = mobj.group(2)
 765
 766                 # Downloader parameters
 767                 best_quality = False
 768                 all_formats = False
 769                 format_param = None
 770                 quality_index = 0
 771                 if self._downloader is not None:
 772                         params = self._downloader.params
 773                         format_param = params.get('format', None)
 774                         if format_param == '0':
 775                                 format_param = self._available_formats[quality_index]
 776                                 best_quality = True
 777                         elif format_param == '-1':
 778                                 format_param = self._available_formats[quality_index]
 779                                 all_formats = True
 780
 781                 while True:
 782                         # Extension
 783                         video_extension = self._video_extensions.get(format_param, 'flv')
 784
 785                         # Get video info
 786                         video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=embedded&ps=default&eurl=&gl=US&hl=en' % video_id
 787                         request = urllib2.Request(video_info_url, None, std_headers)
 788                         try:
 789                                 self.report_video_info_webpage_download(video_id)
 790                                 video_info_webpage = urllib2.urlopen(request).read()
 791                                 video_info = parse_qs(video_info_webpage)
 792                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 793                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 794                                 return
 795                         self.report_information_extraction(video_id)
 796
 797                         # "t" param
 798                         if 'token' not in video_info:
 799                                 # Attempt to see if YouTube has issued an error message
 800                                 if 'reason' not in video_info:
 801                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
 802                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
 803                                         stream.write(video_info_webpage)
 804                                         stream.close()
 805                                 else:
 806                                         reason = urllib.unquote_plus(video_info['reason'][0])
 807                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
 808                                 return
 809                         token = urllib.unquote_plus(video_info['token'][0])
 810                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
 811                         if format_param is not None:
 812                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 813
 814                         # Check possible RTMP download
 815                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 816                                 self.report_rtmp_download()
 817                                 video_real_url = video_info['conn'][0]
 818
 819                         # uploader
 820                         if 'author' not in video_info:
 821                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 822                                 return
 823                         video_uploader = urllib.unquote_plus(video_info['author'][0])
 824
 825                         # title
 826                         if 'title' not in video_info:
 827                                 self._downloader.trouble(u'ERROR: unable to extract video title')
 828                                 return
 829                         video_title = urllib.unquote_plus(video_info['title'][0])
 830                         video_title = video_title.decode('utf-8')
 831                         video_title = sanitize_title(video_title)
 832
 833                         # simplified title
 834                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 835                         simple_title = simple_title.strip(ur'_')
 836
 837                         try:
 838                                 # Process video information
 839                                 self._downloader.process_info({
 840                                         'id':           video_id.decode('utf-8'),
 841                                         'url':          video_real_url.decode('utf-8'),
 842                                         'uploader':     video_uploader.decode('utf-8'),
 843                                         'title':        video_title,
 844                                         'stitle':       simple_title,
 845                                         'ext':          video_extension.decode('utf-8'),
 846                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 847                                 })
 848
 849                                 if all_formats:
 850                                         if quality_index == len(self._available_formats) - 1:
 851                                                 # None left to get
 852                                                 return
 853                                         else:
 854                                                 quality_index += 1
 855                                                 format_param = self._available_formats[quality_index]
 856                                                 if format_param == None:
 857                                                         return
 858                                                 continue
 859
 860                                 return
 861
 862                         except UnavailableFormatError, err:
 863                                 if best_quality or all_formats:
 864                                         if quality_index == len(self._available_formats) - 1:
 865                                                 # I don't ever expect this to happen
 866                                                 if not all_formats:
 867                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
 868                                                 return
 869                                         else:
 870                                                 self.report_unavailable_format(video_id, format_param)
 871                                                 quality_index += 1
 872                                                 format_param = self._available_formats[quality_index]
 873                                                 if format_param == None:
 874                                                         return
 875                                                 continue
 876                                 else:
 877                                         self._downloader.trouble('ERROR: format not available for video')
 878                                         return
 879
 880
 881 class MetacafeIE(InfoExtractor):
 882         """Information Extractor for metacafe.com."""
 883
 884         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 885         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 886         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 887         _youtube_ie = None
 888
 889         def __init__(self, youtube_ie, downloader=None):
 890                 InfoExtractor.__init__(self, downloader)
 891                 self._youtube_ie = youtube_ie
 892
 893         @staticmethod
 894         def suitable(url):
 895                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 896
 897         def report_disclaimer(self):
 898                 """Report disclaimer retrieval."""
 899                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 900
 901         def report_age_confirmation(self):
 902                 """Report attempt to confirm age."""
 903                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 904
 905         def report_download_webpage(self, video_id):
 906                 """Report webpage download."""
 907                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 908
 909         def report_extraction(self, video_id):
 910                 """Report information extraction."""
 911                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 912
 913         def _real_initialize(self):
 914                 # Retrieve disclaimer
 915                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 916                 try:
 917                         self.report_disclaimer()
 918                         disclaimer = urllib2.urlopen(request).read()
 919                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 920                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 921                         return
 922
 923                 # Confirm age
 924                 disclaimer_form = {
 925                         'filters': '0',
 926                         'submit': "Continue - I'm over 18",
 927                         }
 928                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 929                 try:
 930                         self.report_age_confirmation()
 931                         disclaimer = urllib2.urlopen(request).read()
 932                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 933                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 934                         return
 935
 936         def _real_extract(self, url):
 937                 # Extract id and simplified title from URL
 938                 mobj = re.match(self._VALID_URL, url)
 939                 if mobj is None:
 940                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 941                         return
 942
 943                 video_id = mobj.group(1)
 944
 945                 # Check if video comes from YouTube
 946                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 947                 if mobj2 is not None:
 948                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 949                         return
 950
 951                 simple_title = mobj.group(2).decode('utf-8')
 952                 video_extension = 'flv'
 953
 954                 # Retrieve video webpage to extract further information
 955                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 956                 try:
 957                         self.report_download_webpage(video_id)
 958                         webpage = urllib2.urlopen(request).read()
 959                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 960                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 961                         return
 962
 963                 # Extract URL, uploader and title from webpage
 964                 self.report_extraction(video_id)
 965                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 966                 if mobj is None:
 967                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 968                         return
 969                 mediaURL = urllib.unquote(mobj.group(1))
 970
 971                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 972                 #if mobj is None:
 973                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
 974                 #       return
 975                 #gdaKey = mobj.group(1)
 976                 #
 977                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 978
 979                 video_url = mediaURL
 980
 981                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 982                 if mobj is None:
 983                         self._downloader.trouble(u'ERROR: unable to extract title')
 984                         return
 985                 video_title = mobj.group(1).decode('utf-8')
 986                 video_title = sanitize_title(video_title)
 987
 988                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 989                 if mobj is None:
 990                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 991                         return
 992                 video_uploader = mobj.group(1)
 993
 994                 try:
 995                         # Process video information
 996                         self._downloader.process_info({
 997                                 'id':           video_id.decode('utf-8'),
 998                                 'url':          video_url.decode('utf-8'),
 999                                 'uploader':     video_uploader.decode('utf-8'),
1000                                 'title':        video_title,
1001                                 'stitle':       simple_title,
1002                                 'ext':          video_extension.decode('utf-8'),
1003                                 'format':       u'NA',
1004                         })
1005                 except UnavailableFormatError:
1006                         self._downloader.trouble(u'ERROR: format not available for video')
1007
1008
1009 class GoogleIE(InfoExtractor):
1010         """Information extractor for video.google.com."""
1011
1012         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1013
1014         def __init__(self, downloader=None):
1015                 InfoExtractor.__init__(self, downloader)
1016
1017         @staticmethod
1018         def suitable(url):
1019                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1020
1021         def report_download_webpage(self, video_id):
1022                 """Report webpage download."""
1023                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1024
1025         def report_extraction(self, video_id):
1026                 """Report information extraction."""
1027                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1028
1029         def _real_initialize(self):
1030                 return
1031
1032         def _real_extract(self, url):
1033                 # Extract id from URL
1034                 mobj = re.match(self._VALID_URL, url)
1035                 if mobj is None:
1036                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1037                         return
1038
1039                 video_id = mobj.group(1)
1040
1041                 video_extension = 'mp4'
1042
1043                 # Retrieve video webpage to extract further information
1044                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1045                 try:
1046                         self.report_download_webpage(video_id)
1047                         webpage = urllib2.urlopen(request).read()
1048                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1049                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1050                         return
1051
1052                 # Extract URL, uploader, and title from webpage
1053                 self.report_extraction(video_id)
1054                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1055                 if mobj is None:
1056                         video_extension = 'flv'
1057                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1058                 if mobj is None:
1059                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1060                         return
1061                 mediaURL = urllib.unquote(mobj.group(1))
1062                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1063                 mediaURL = mediaURL.replace('\\x26', '\x26')
1064
1065                 video_url = mediaURL
1066
1067                 mobj = re.search(r'<title>(.*)</title>', webpage)
1068                 if mobj is None:
1069                         self._downloader.trouble(u'ERROR: unable to extract title')
1070                         return
1071                 video_title = mobj.group(1).decode('utf-8')
1072                 video_title = sanitize_title(video_title)
1073                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1074
1075                 try:
1076                         # Process video information
1077                         self._downloader.process_info({
1078                                 'id':           video_id.decode('utf-8'),
1079                                 'url':          video_url.decode('utf-8'),
1080                                 'uploader':     u'NA',
1081                                 'title':        video_title,
1082                                 'stitle':       simple_title,
1083                                 'ext':          video_extension.decode('utf-8'),
1084                                 'format':       u'NA',
1085                         })
1086                 except UnavailableFormatError:
1087                         self._downloader.trouble(u'ERROR: format not available for video')
1088
1089
1090 class PhotobucketIE(InfoExtractor):
1091         """Information extractor for photobucket.com."""
1092
1093         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1094
1095         def __init__(self, downloader=None):
1096                 InfoExtractor.__init__(self, downloader)
1097
1098         @staticmethod
1099         def suitable(url):
1100                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1101
1102         def report_download_webpage(self, video_id):
1103                 """Report webpage download."""
1104                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1105
1106         def report_extraction(self, video_id):
1107                 """Report information extraction."""
1108                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1109
1110         def _real_initialize(self):
1111                 return
1112
1113         def _real_extract(self, url):
1114                 # Extract id from URL
1115                 mobj = re.match(self._VALID_URL, url)
1116                 if mobj is None:
1117                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1118                         return
1119
1120                 video_id = mobj.group(1)
1121
1122                 video_extension = 'flv'
1123
1124                 # Retrieve video webpage to extract further information
1125                 request = urllib2.Request(url)
1126                 try:
1127                         self.report_download_webpage(video_id)
1128                         webpage = urllib2.urlopen(request).read()
1129                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1130                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1131                         return
1132
1133                 # Extract URL, uploader, and title from webpage
1134                 self.report_extraction(video_id)
1135                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1136                 if mobj is None:
1137                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1138                         return
1139                 mediaURL = urllib.unquote(mobj.group(1))
1140
1141                 video_url = mediaURL
1142
1143                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1144                 if mobj is None:
1145                         self._downloader.trouble(u'ERROR: unable to extract title')
1146                         return
1147                 video_title = mobj.group(1).decode('utf-8')
1148                 video_title = sanitize_title(video_title)
1149                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1150
1151                 video_uploader = mobj.group(2).decode('utf-8')
1152
1153                 try:
1154                         # Process video information
1155                         self._downloader.process_info({
1156                                 'id':           video_id.decode('utf-8'),
1157                                 'url':          video_url.decode('utf-8'),
1158                                 'uploader':     video_uploader,
1159                                 'title':        video_title,
1160                                 'stitle':       simple_title,
1161                                 'ext':          video_extension.decode('utf-8'),
1162                                 'format':       u'NA',
1163                         })
1164                 except UnavailableFormatError:
1165                         self._downloader.trouble(u'ERROR: format not available for video')
1166
1167
1168 class GenericIE(InfoExtractor):
1169         """Generic last-resort information extractor."""
1170
1171         def __init__(self, downloader=None):
1172                 InfoExtractor.__init__(self, downloader)
1173
1174         @staticmethod
1175         def suitable(url):
1176                 return True
1177
1178         def report_download_webpage(self, video_id):
1179                 """Report webpage download."""
1180                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1181                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1182
1183         def report_extraction(self, video_id):
1184                 """Report information extraction."""
1185                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1186
1187         def _real_initialize(self):
1188                 return
1189
1190         def _real_extract(self, url):
1191                 video_id = url.split('/')[-1]
1192                 request = urllib2.Request(url)
1193                 try:
1194                         self.report_download_webpage(video_id)
1195                         webpage = urllib2.urlopen(request).read()
1196                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1197                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1198                         return
1199                 except ValueError, err:
1200                         # since this is the last-resort InfoExtractor, if
1201                         # this error is thrown, it'll be thrown here
1202                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1203                         return
1204
1205                 # Start with something easy: JW Player in SWFObject
1206                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1207                 if mobj is None:
1208                         # Broaden the search a little bit
1209                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1210                 if mobj is None:
1211                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1212                         return
1213
1214                 # It's possible that one of the regexes
1215                 # matched, but returned an empty group:
1216                 if mobj.group(1) is None:
1217                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1218                         return
1219
1220                 video_url = urllib.unquote(mobj.group(1))
1221                 video_id  = os.path.basename(video_url)
1222
1223                 # here's a fun little line of code for you:
1224                 video_extension = os.path.splitext(video_id)[1][1:]
1225                 video_id        = os.path.splitext(video_id)[0]
1226
1227                 # it's tempting to parse this further, but you would
1228                 # have to take into account all the variations like
1229                 #   Video Title - Site Name
1230                 #   Site Name | Video Title
1231                 #   Video Title - Tagline | Site Name
1232                 # and so on and so forth; it's just not practical
1233                 mobj = re.search(r'<title>(.*)</title>', webpage)
1234                 if mobj is None:
1235                         self._downloader.trouble(u'ERROR: unable to extract title')
1236                         return
1237                 video_title = mobj.group(1).decode('utf-8')
1238                 video_title = sanitize_title(video_title)
1239                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1240
1241                 # video uploader is domain name
1242                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1243                 if mobj is None:
1244                         self._downloader.trouble(u'ERROR: unable to extract title')
1245                         return
1246                 video_uploader = mobj.group(1).decode('utf-8')
1247
1248                 try:
1249                         # Process video information
1250                         self._downloader.process_info({
1251                                 'id':           video_id.decode('utf-8'),
1252                                 'url':          video_url.decode('utf-8'),
1253                                 'uploader':     video_uploader,
1254                                 'title':        video_title,
1255                                 'stitle':       simple_title,
1256                                 'ext':          video_extension.decode('utf-8'),
1257                                 'format':       u'NA',
1258                         })
1259                 except UnavailableFormatError:
1260                         self._downloader.trouble(u'ERROR: format not available for video')
1261
1262
1263 class YoutubeSearchIE(InfoExtractor):
1264         """Information Extractor for YouTube search queries."""
1265         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1266         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1267         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1268         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1269         _youtube_ie = None
1270         _max_youtube_results = 1000
1271
1272         def __init__(self, youtube_ie, downloader=None):
1273                 InfoExtractor.__init__(self, downloader)
1274                 self._youtube_ie = youtube_ie
1275
1276         @staticmethod
1277         def suitable(url):
1278                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1279
1280         def report_download_page(self, query, pagenum):
1281                 """Report attempt to download playlist page with given number."""
1282                 query = query.decode(preferredencoding())
1283                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1284
1285         def _real_initialize(self):
1286                 self._youtube_ie.initialize()
1287
1288         def _real_extract(self, query):
1289                 mobj = re.match(self._VALID_QUERY, query)
1290                 if mobj is None:
1291                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1292                         return
1293
1294                 prefix, query = query.split(':')
1295                 prefix = prefix[8:]
1296                 query  = query.encode('utf-8')
1297                 if prefix == '':
1298                         self._download_n_results(query, 1)
1299                         return
1300                 elif prefix == 'all':
1301                         self._download_n_results(query, self._max_youtube_results)
1302                         return
1303                 else:
1304                         try:
1305                                 n = long(prefix)
1306                                 if n <= 0:
1307                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1308                                         return
1309                                 elif n > self._max_youtube_results:
1310                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1311                                         n = self._max_youtube_results
1312                                 self._download_n_results(query, n)
1313                                 return
1314                         except ValueError: # parsing prefix as integer fails
1315                                 self._download_n_results(query, 1)
1316                                 return
1317
1318         def _download_n_results(self, query, n):
1319                 """Downloads a specified number of results for a query"""
1320
1321                 video_ids = []
1322                 already_seen = set()
1323                 pagenum = 1
1324
1325                 while True:
1326                         self.report_download_page(query, pagenum)
1327                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1328                         request = urllib2.Request(result_url, None, std_headers)
1329                         try:
1330                                 page = urllib2.urlopen(request).read()
1331                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1332                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1333                                 return
1334
1335                         # Extract video identifiers
1336                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1337                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1338                                 if video_id not in already_seen:
1339                                         video_ids.append(video_id)
1340                                         already_seen.add(video_id)
1341                                         if len(video_ids) == n:
1342                                                 # Specified n videos reached
1343                                                 for id in video_ids:
1344                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1345                                                 return
1346
1347                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1348                                 for id in video_ids:
1349                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1350                                 return
1351
1352                         pagenum = pagenum + 1
1353
1354 class YoutubePlaylistIE(InfoExtractor):
1355         """Information Extractor for YouTube playlists."""
1356
1357         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1358         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1359         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1360         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1361         _youtube_ie = None
1362
1363         def __init__(self, youtube_ie, downloader=None):
1364                 InfoExtractor.__init__(self, downloader)
1365                 self._youtube_ie = youtube_ie
1366
1367         @staticmethod
1368         def suitable(url):
1369                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1370
1371         def report_download_page(self, playlist_id, pagenum):
1372                 """Report attempt to download playlist page with given number."""
1373                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1374
1375         def _real_initialize(self):
1376                 self._youtube_ie.initialize()
1377
1378         def _real_extract(self, url):
1379                 # Extract playlist id
1380                 mobj = re.match(self._VALID_URL, url)
1381                 if mobj is None:
1382                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1383                         return
1384
1385                 # Download playlist pages
1386                 playlist_id = mobj.group(1)
1387                 video_ids = []
1388                 pagenum = 1
1389
1390                 while True:
1391                         self.report_download_page(playlist_id, pagenum)
1392                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1393                         try:
1394                                 page = urllib2.urlopen(request).read()
1395                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1396                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1397                                 return
1398
1399                         # Extract video identifiers
1400                         ids_in_page = []
1401                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1402                                 if mobj.group(1) not in ids_in_page:
1403                                         ids_in_page.append(mobj.group(1))
1404                         video_ids.extend(ids_in_page)
1405
1406                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1407                                 break
1408                         pagenum = pagenum + 1
1409
1410                 for id in video_ids:
1411                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1412                 return
1413
1414 class YoutubeUserIE(InfoExtractor):
1415         """Information Extractor for YouTube users."""
1416
1417         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1418         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1419         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1420         _youtube_ie = None
1421
1422         def __init__(self, youtube_ie, downloader=None):
1423                 InfoExtractor.__init__(self, downloader)
1424                 self._youtube_ie = youtube_ie
1425
1426         @staticmethod
1427         def suitable(url):
1428                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1429
1430         def report_download_page(self, username):
1431                 """Report attempt to download user page."""
1432                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1433
1434         def _real_initialize(self):
1435                 self._youtube_ie.initialize()
1436
1437         def _real_extract(self, url):
1438                 # Extract username
1439                 mobj = re.match(self._VALID_URL, url)
1440                 if mobj is None:
1441                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1442                         return
1443
1444                 # Download user page
1445                 username = mobj.group(1)
1446                 video_ids = []
1447                 pagenum = 1
1448
1449                 self.report_download_page(username)
1450                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1451                 try:
1452                         page = urllib2.urlopen(request).read()
1453                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1454                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1455                         return
1456
1457                 # Extract video identifiers
1458                 ids_in_page = []
1459
1460                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1461                         if mobj.group(1) not in ids_in_page:
1462                                 ids_in_page.append(mobj.group(1))
1463                 video_ids.extend(ids_in_page)
1464
1465                 for id in video_ids:
1466                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1467                 return
1468
class PostProcessor(object):
        """Base class for post-processing steps.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. After a successful
        download it calls run() on each of them in turn: the first one
        receives the initial information and every following one receives
        whatever the previous one returned.

        The chain stops when a PostProcessor returns None or when there
        are no more objects left in it.

        As with InfoExtractor objects, registration is mutual: the
        PostProcessor also keeps a reference to its downloader.
        """

        # Downloader this object is attached to, if any
        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this PP to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Perform the post-processing step.

                "information" is a dictionary like the ones built by the
                InfoExtractors, with an additional "filepath" field holding
                the location of the downloaded file.

                Return None to stop the postprocessing chain, or an
                information dictionary (possibly the received one with some
                fields changed) to be given to the next object in the chain.

                A PostProcessingError may be raised as well; the calling
                downloader takes it into account.
                """
                # The base implementation passes the data through untouched
                return information
1514
### MAIN PROGRAM ###
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # Function to update the program file with the latest version from bitbucket.org
                def update_self(downloader, filename):
                        """Replace the program file with the latest stable version.

                        The name of the latest version is read from the
                        LATEST_VERSION file in the repository, then the program
                        at that revision is downloaded and written over
                        "filename". The process exits with an error message if
                        the file is not writable or if the download or the
                        write fails.
                        """
                        # Note: downloader only used for options
                        if not os.access (filename, os.W_OK):
                                sys.exit('ERROR: no write permissions on %s' % filename)

                        downloader.to_stdout('Updating to latest stable version...')
                        latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'

                        # Fetch everything before touching the program file, so
                        # that a failed download leaves it intact.
                        try:
                                urlh = urllib.urlopen(latest_url)
                                try:
                                        latest_version = urlh.read().strip()
                                finally:
                                        urlh.close()
                                prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
                                urlh = urllib.urlopen(prog_url)
                                try:
                                        newcontent = urlh.read()
                                finally:
                                        urlh.close()
                        except (IOError, OSError):
                                sys.exit('ERROR: unable to download latest version')

                        # Binary mode: the downloaded bytes are written verbatim
                        try:
                                stream = open(filename, 'wb')
                                try:
                                        stream.write(newcontent)
                                finally:
                                        stream.close()
                        except (IOError, OSError):
                                sys.exit('ERROR: unable to overwrite current version')
                        downloader.to_stdout('Updated to version %s' % latest_version)

                # General configuration
                # A single opener carries both handlers; installing two openers
                # in a row would just make the second one replace the first.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='2010.03.13',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-U', '--update',
                                action='store_true', dest='update_self', help='update this program to latest stable version')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                # All the format shortcuts store into the same "format" option
                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='store', dest='format', metavar='FMT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                video_format.add_option('--all-formats',
                                action='store_const', dest='format', help='download all available video formats', const='-1')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                verbosity.add_option('--no-progress',
                                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                filesystem.add_option('-c', '--continue',
                                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = batchfd.readlines()
                                finally:
                                        # Do not leave the file open for the whole run
                                        batchfd.close()
                                batchurls = [x.strip() for x in batchurls]
                                batchurls = [x for x in batchurls if len(x) > 0]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                # URLs from the batch file go before the ones on the command line
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_user_ie = YoutubeUserIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)
                google_ie = GoogleIE()
                photobucket_ie = PhotobucketIE()
                generic_ie = GenericIE()

                # File downloader
                # The output template is the first alternative of the and/or
                # chain below that applies: explicit template, then the
                # all-formats variants, then title/literal, then the plain id.
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        'continuedl': opts.continue_dl,
                        'noprogress': opts.noprogress,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(youtube_user_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                fd.add_info_extractor(google_ie)
                fd.add_info_extractor(photobucket_ie)

                # This must come last since it's the
                # fallback if none of the others work
                fd.add_info_extractor(generic_ie)

                # Update version
                if opts.update_self:
                        update_self(fd, sys.argv[0])

                # Maybe do nothing
                if len(all_urls) < 1:
                        if not opts.update_self:
                                parser.error(u'you must provide at least one URL')
                        else:
                                sys.exit()
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')