]> gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube-dl
b79ab02943c5c1593139f8d8028a5907bd13a677
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request. A real desktop Firefox User-Agent
# is used so the sites serve the same pages they would serve to a browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe in "simplified" titles: ASCII letters and
# digits, as unicode objects (hence the .decode('ascii') calls).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Sanity check: make sure the reported encoding is actually
		# usable before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare "except:", which would also have
		# swallowed KeyboardInterrupt and SystemExit. Any failure
		# (locale error, unknown codec) falls back to UTF-8.
		pref = 'UTF-8'
	# The previous implementation wrapped this in a one-shot generator
	# and called .next() on it; a plain return is equivalent.
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities first, then replace the OS path separator so the
	# title cannot introduce extra directory components into the filename.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# Special case: '-' means "write to standard output"
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	# Raised from FileDownloader.trouble() when 'ignoreerrors' is not set.
	pass
117
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	# Raised from FileDownloader.download() when several URLs are given but
	# the output template contains no substitution fields.
	pass
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	# Caught in FileDownloader.process_info() and reported via trouble().
	pass
133
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	# Raised from FileDownloader.process_info() when URL verification or
	# the actual download fails with an OS/IO error.
	pass
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Signals that a downloaded file is smaller than the size the server
	announced, which usually means the connection was interrupted. Carries
	both the actual and the announced sizes so callers can report them.
	"""
	# Both attributes are byte counts.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		"""Store the actual (downloaded) and announced (expected) sizes."""
		self.expected = expected
		self.downloaded = downloaded
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	continuedl:     Try to continue downloads if possible.
	noprogress:     Do not print the progress bar.
	"""

	params = None			# Options dictionary (see class docstring)
	_ies = []			# Registered InfoExtractors, tried in order
	_pps = []			# Registered PostProcessors, run as a chain
	_download_retcode = None	# Return code for download(): 0 OK, 1 trouble seen

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self.params = params
	
	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		# Build the list of partial paths ('a', 'a/b', 'a/b/c', ...) leading
		# up to, but not including, the final path component (the file name).
		components = filename.split(os.sep)
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)
	
	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string (e.g. '1.21M')."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Power of 1024 selects the suffix: b, k, M, G, ...
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return the download percentage as a fixed-width string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate remaining time as 'MM:SS', or '--:--' if unknown/too long."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average download speed as a fixed-width string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read block size, adapting to the observed rate.

		The new size is clamped between half and double the previous block
		(never below 1 byte, never above 4 MB).
		"""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix indexes 'b' (exponent 0), i.e. plain bytes.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Read a single byte to make sure the server actually serves data.
		data.read(1)
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)
	
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)
	
	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# [u'\n', u''][skip_eol] selects the line terminator by
				# indexing with the boolean; the trailing comma on the
				# print statement suppresses print's own newline.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
	
	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())
	
	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# Fixed means: no %(...)s substitution fields at all.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough to bring the average speed back
			# down to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
	
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
	
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			self.to_stdout(u'[download] The file has already been downloaded')
	
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')
	
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# End the in-place progress line started by report_progress().
			self.to_stdout(u'')

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableFormatError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')

			return
			
		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		# NOTE(review): if the template fails AND 'ignoreerrors' is set,
		# trouble() returns and 'filename' is unbound here, so the next
		# line raises NameError — TODO confirm and add an early return.
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'))
		except (OSError, IOError), err:
			raise UnavailableFormatError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed template would map every URL to the same file on disk.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			# A postprocessor returning None stops the chain.
			info = pp.run(info)
			if info is None:
				break
	
	def _download_with_rtmpdump(self, filename, url):
		"""Download an rtmp:// URL by shelling out to the rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
		# [[], [...]][bool] selects extra arguments by indexing with a boolean.
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
			time.sleep(2.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('ERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url):
		"""Download url to filename over HTTP (or delegate rtmp URLs).

		Handles resuming partial downloads, adaptive block sizing, progress
		reporting and rate limiting. Returns True on success.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url)

		stream = None
		open_mode = 'wb'
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		# Establish connection
		try:
			data = urllib2.urlopen(request)
		except (urllib2.HTTPError, ), err:
			if err.code != 416: #  416 is 'Requested range not satisfiable'
				raise
			# Unable to resume
			data = urllib2.urlopen(basic_request)
			content_length = data.info()['Content-Length']

			if content_length is not None and long(content_length) == resume_len:
				# Because the file had already been fully downloaded
				self.report_file_already_downloaded(filename)
				return True
			else:
				# Because the server didn't let us
				self.report_unable_to_resume()
				open_mode = 'wb'

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			stream.write(data_block)
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# NOTE(review): data_len is the raw header string, so the counter is
		# compared via str(); leading zeros or whitespace in the header would
		# defeat this comparison — TODO confirm and compare as integers.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
577
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	_ready = False		# True once _real_initialize() has run
	_downloader = None	# FileDownloader in charge of this extractor

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		# The base class handles nothing; subclasses override this.
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
638
639 class YoutubeIE(InfoExtractor):
640         """Information extractor for youtube.com."""
641
642         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
643         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
644         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
645         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
646         _NETRC_MACHINE = 'youtube'
647         _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
648         _video_extensions = {
649                 '13': '3gp',
650                 '17': 'mp4',
651                 '18': 'mp4',
652                 '22': 'mp4',
653                 '37': 'mp4',
654         }
655
	@staticmethod
	def suitable(url):
		"""Return True if url matches the YouTube URL pattern (_VALID_URL)."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)
659
	# The following report_* helpers print status messages through the
	# registered downloader's to_stdout().

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')
	
	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')
	
	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
	
	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
	
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested video format is not available."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
	
	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')
687         
688         def _real_initialize(self):
689                 if self._downloader is None:
690                         return
691
692                 username = None
693                 password = None
694                 downloader_params = self._downloader.params
695
696                 # Attempt to use provided username and password or .netrc data
697                 if downloader_params.get('username', None) is not None:
698                         username = downloader_params['username']
699                         password = downloader_params['password']
700                 elif downloader_params.get('usenetrc', False):
701                         try:
702                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
703                                 if info is not None:
704                                         username = info[0]
705                                         password = info[2]
706                                 else:
707                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
708                         except (IOError, netrc.NetrcParseError), err:
709                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
710                                 return
711
712                 # Set language
713                 request = urllib2.Request(self._LANG_URL, None, std_headers)
714                 try:
715                         self.report_lang()
716                         urllib2.urlopen(request).read()
717                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
718                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
719                         return
720
721                 # No authentication to be performed
722                 if username is None:
723                         return
724
725                 # Log in
726                 login_form = {
727                                 'current_form': 'loginForm',
728                                 'next':         '/',
729                                 'action_login': 'Log In',
730                                 'username':     username,
731                                 'password':     password,
732                                 }
733                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
734                 try:
735                         self.report_login()
736                         login_results = urllib2.urlopen(request).read()
737                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
738                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
739                                 return
740                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
741                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
742                         return
743         
744                 # Confirm age
745                 age_form = {
746                                 'next_url':             '/',
747                                 'action_confirm':       'Confirm',
748                                 }
749                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
750                 try:
751                         self.report_age_confirmation()
752                         age_results = urllib2.urlopen(request).read()
753                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
754                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
755                         return
756
757         def _real_extract(self, url):
758                 # Extract video id from URL
759                 mobj = re.match(self._VALID_URL, url)
760                 if mobj is None:
761                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
762                         return
763                 video_id = mobj.group(2)
764
765                 # Downloader parameters
766                 best_quality = False
767                 format_param = None
768                 quality_index = 0
769                 if self._downloader is not None:
770                         params = self._downloader.params
771                         format_param = params.get('format', None)
772                         if format_param == '0':
773                                 format_param = self._available_formats[quality_index]
774                                 best_quality = True
775
776                 while True:
777                         # Extension
778                         video_extension = self._video_extensions.get(format_param, 'flv')
779
780                         # Get video info
781                         video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
782                         request = urllib2.Request(video_info_url, None, std_headers)
783                         try:
784                                 self.report_video_info_webpage_download(video_id)
785                                 video_info_webpage = urllib2.urlopen(request).read()
786                                 video_info = parse_qs(video_info_webpage)
787                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
788                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
789                                 return
790                         self.report_information_extraction(video_id)
791
792                         # "t" param
793                         if 'token' not in video_info:
794                                 # Attempt to see if YouTube has issued an error message
795                                 if 'reason' not in video_info:
796                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
797                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
798                                         stream.write(video_info_webpage)
799                                         stream.close()
800                                 else:
801                                         reason = urllib.unquote_plus(video_info['reason'][0])
802                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
803                                 return
804                         token = urllib.unquote_plus(video_info['token'][0])
805                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
806                         if format_param is not None:
807                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
808
809                         # Check possible RTMP download
810                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
811                                 self.report_rtmp_download()
812                                 video_real_url = video_info['conn'][0]
813
814                         # uploader
815                         if 'author' not in video_info:
816                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
817                                 return
818                         video_uploader = urllib.unquote_plus(video_info['author'][0])
819
820                         # title
821                         if 'title' not in video_info:
822                                 self._downloader.trouble(u'ERROR: unable to extract video title')
823                                 return
824                         video_title = urllib.unquote_plus(video_info['title'][0])
825                         video_title = video_title.decode('utf-8')
826                         video_title = sanitize_title(video_title)
827
828                         # simplified title
829                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
830                         simple_title = simple_title.strip(ur'_')
831
832                         try:
833                                 # Process video information
834                                 self._downloader.process_info({
835                                         'id':           video_id.decode('utf-8'),
836                                         'url':          video_real_url.decode('utf-8'),
837                                         'uploader':     video_uploader.decode('utf-8'),
838                                         'title':        video_title,
839                                         'stitle':       simple_title,
840                                         'ext':          video_extension.decode('utf-8'),
841                                 })
842
843                                 return
844
845                         except UnavailableFormatError, err:
846                                 if best_quality:
847                                         if quality_index == len(self._available_formats) - 1:
848                                                 # I don't ever expect this to happen
849                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
850                                                 return
851                                         else:
852                                                 self.report_unavailable_format(video_id, format_param)
853                                                 quality_index += 1
854                                                 format_param = self._available_formats[quality_index]
855                                                 continue
856                                 else: 
857                                         self._downloader.trouble('ERROR: format not available for video')
858                                         return
859
860
861 class MetacafeIE(InfoExtractor):
862         """Information Extractor for metacafe.com."""
863
864         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
865         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
866         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
867         _youtube_ie = None
868
869         def __init__(self, youtube_ie, downloader=None):
870                 InfoExtractor.__init__(self, downloader)
871                 self._youtube_ie = youtube_ie
872
873         @staticmethod
874         def suitable(url):
875                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
876
877         def report_disclaimer(self):
878                 """Report disclaimer retrieval."""
879                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
880
881         def report_age_confirmation(self):
882                 """Report attempt to confirm age."""
883                 self._downloader.to_stdout(u'[metacafe] Confirming age')
884         
885         def report_download_webpage(self, video_id):
886                 """Report webpage download."""
887                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
888         
889         def report_extraction(self, video_id):
890                 """Report information extraction."""
891                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
892
893         def _real_initialize(self):
894                 # Retrieve disclaimer
895                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
896                 try:
897                         self.report_disclaimer()
898                         disclaimer = urllib2.urlopen(request).read()
899                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
900                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
901                         return
902
903                 # Confirm age
904                 disclaimer_form = {
905                         'filters': '0',
906                         'submit': "Continue - I'm over 18",
907                         }
908                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
909                 try:
910                         self.report_age_confirmation()
911                         disclaimer = urllib2.urlopen(request).read()
912                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
913                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
914                         return
915         
916         def _real_extract(self, url):
917                 # Extract id and simplified title from URL
918                 mobj = re.match(self._VALID_URL, url)
919                 if mobj is None:
920                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
921                         return
922
923                 video_id = mobj.group(1)
924
925                 # Check if video comes from YouTube
926                 mobj2 = re.match(r'^yt-(.*)$', video_id)
927                 if mobj2 is not None:
928                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
929                         return
930
931                 simple_title = mobj.group(2).decode('utf-8')
932                 video_extension = 'flv'
933
934                 # Retrieve video webpage to extract further information
935                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
936                 try:
937                         self.report_download_webpage(video_id)
938                         webpage = urllib2.urlopen(request).read()
939                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
940                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
941                         return
942
943                 # Extract URL, uploader and title from webpage
944                 self.report_extraction(video_id)
945                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
946                 if mobj is None:
947                         self._downloader.trouble(u'ERROR: unable to extract media URL')
948                         return
949                 mediaURL = urllib.unquote(mobj.group(1))
950
951                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
952                 #if mobj is None:
953                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
954                 #       return
955                 #gdaKey = mobj.group(1)
956                 #
957                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
958
959                 video_url = mediaURL
960
961                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
962                 if mobj is None:
963                         self._downloader.trouble(u'ERROR: unable to extract title')
964                         return
965                 video_title = mobj.group(1).decode('utf-8')
966                 video_title = sanitize_title(video_title)
967
968                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
969                 if mobj is None:
970                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
971                         return
972                 video_uploader = mobj.group(1)
973
974                 try:
975                         # Process video information
976                         self._downloader.process_info({
977                                 'id':           video_id.decode('utf-8'),
978                                 'url':          video_url.decode('utf-8'),
979                                 'uploader':     video_uploader.decode('utf-8'),
980                                 'title':        video_title,
981                                 'stitle':       simple_title,
982                                 'ext':          video_extension.decode('utf-8'),
983                         })
984                 except UnavailableFormatError:
985                         self._downloader.trouble(u'ERROR: format not available for video')
986
987
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Matches "videoplay" URLs on the various national Google Video domains;
	# group 1 captures the docid.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		"""Initialize with an optional downloader to report results to."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		# True when the URL is a Google Video playback page.
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No setup (login, age confirmation, ...) is needed for Google Video.
		return

	def _real_extract(self, url):
		"""Scrape the media URL and title from the playback page and hand the
		video information to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Assume mp4; downgraded to flv below if only the Flash URL is found
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct download link; fall back to the escaped Flash URL
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo JavaScript hex escapes: \x3d is '=' and \x26 is '&'
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Filesystem-safe title: non-alphanumeric runs collapse to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Google Video doesn't show uploader nicknames?
		video_uploader = 'NA'

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
			})
		except UnavailableFormatError:
			self._downloader.trouble(u'ERROR: format not available for video')
1069
1070
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# Matches pages referencing an .flv file via the "current" query
	# parameter; group 1 captures the .flv path used as the video id.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

	def __init__(self, downloader=None):
		"""Initialize with an optional downloader to report results to."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		# True when the URL points at a Photobucket-hosted .flv video.
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No setup (login, age confirmation, ...) is needed for Photobucket.
		return

	def _real_extract(self, url):
		"""Scrape the media URL, title and uploader from the page and hand
		the video information to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# URL pattern only matches .flv files
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# The media URL is advertised in a <link rel="video_src"> tag
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# The page title carries both the video title (group 1) and the
		# uploader name (group 2)
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Filesystem-safe title: non-alphanumeric runs collapse to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader,
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
			})
		except UnavailableFormatError:
			self._downloader.trouble(u'ERROR: format not available for video')
1146
1147
1148 class GenericIE(InfoExtractor):
1149         """Generic last-resort information extractor."""
1150
1151         def __init__(self, downloader=None):
1152                 InfoExtractor.__init__(self, downloader)
1153
1154         @staticmethod
1155         def suitable(url):
1156                 return True
1157
1158         def report_download_webpage(self, video_id):
1159                 """Report webpage download."""
1160                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1161                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1162
1163         def report_extraction(self, video_id):
1164                 """Report information extraction."""
1165                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1166
1167         def _real_initialize(self):
1168                 return
1169
1170         def _real_extract(self, url):
1171                 video_id = url.split('/')[-1]
1172                 request = urllib2.Request(url)
1173                 try:
1174                         self.report_download_webpage(video_id)
1175                         webpage = urllib2.urlopen(request).read()
1176                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1177                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1178                         return
1179                 except ValueError, err:
1180                         # since this is the last-resort InfoExtractor, if
1181                         # this error is thrown, it'll be thrown here
1182                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1183                         return
1184
1185                 # Start with something easy: JW Player in SWFObject
1186                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1187                 if mobj is None:
1188                         # Broaden the search a little bit
1189                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1190                 if mobj is None:
1191                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1192                         return
1193
1194                 # It's possible that one of the regexes
1195                 # matched, but returned an empty group:
1196                 if mobj.group(1) is None:
1197                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1198                         return
1199
1200                 video_url = urllib.unquote(mobj.group(1))
1201                 video_id  = os.path.basename(video_url)
1202
1203                 # here's a fun little line of code for you:
1204                 video_extension = os.path.splitext(video_id)[1][1:]
1205                 video_id        = os.path.splitext(video_id)[0]
1206
1207                 # it's tempting to parse this further, but you would
1208                 # have to take into account all the variations like
1209                 #   Video Title - Site Name
1210                 #   Site Name | Video Title
1211                 #   Video Title - Tagline | Site Name
1212                 # and so on and so forth; it's just not practical
1213                 mobj = re.search(r'<title>(.*)</title>', webpage)
1214                 if mobj is None:
1215                         self._downloader.trouble(u'ERROR: unable to extract title')
1216                         return
1217                 video_title = mobj.group(1).decode('utf-8')
1218                 video_title = sanitize_title(video_title)
1219                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1220
1221                 # video uploader is domain name
1222                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1223                 if mobj is None:
1224                         self._downloader.trouble(u'ERROR: unable to extract title')
1225                         return
1226                 video_uploader = mobj.group(1).decode('utf-8')
1227
1228                 try:
1229                         # Process video information
1230                         self._downloader.process_info({
1231                                 'id':           video_id.decode('utf-8'),
1232                                 'url':          video_url.decode('utf-8'),
1233                                 'uploader':     video_uploader,
1234                                 'title':        video_title,
1235                                 'stitle':       simple_title,
1236                                 'ext':          video_extension.decode('utf-8'),
1237                         })
1238                 except UnavailableFormatError:
1239                         self._downloader.trouble(u'ERROR: format not available for video')
1240
1241
1242 class YoutubeSearchIE(InfoExtractor):
1243         """Information Extractor for YouTube search queries."""
1244         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1245         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1246         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1247         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1248         _youtube_ie = None
1249         _max_youtube_results = 1000
1250
	def __init__(self, youtube_ie, downloader=None):
		"""Store the YouTube extractor used to download each search result."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
1254         
1255         @staticmethod
1256         def suitable(url):
1257                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1258
1259         def report_download_page(self, query, pagenum):
1260                 """Report attempt to download playlist page with given number."""
1261                 query = query.decode(preferredencoding())
1262                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1263
	def _real_initialize(self):
		"""Initialization consists of initializing the delegate YouTube extractor."""
		self._youtube_ie.initialize()
1266         
	def _real_extract(self, query):
		"""Parse a "ytsearch[N|all]:terms" query and download the matching results."""
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		# Drop the literal "ytsearch", keeping only the optional count suffix
		prefix = prefix[8:]
		query  = query.encode('utf-8')
		if prefix == '':
			# Bare "ytsearch:" downloads the first result only
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_youtube_results:
					# Clamp to the maximum the site will return
					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
					n = self._max_youtube_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return
1296
1297         def _download_n_results(self, query, n):
1298                 """Downloads a specified number of results for a query"""
1299
1300                 video_ids = []
1301                 already_seen = set()
1302                 pagenum = 1
1303
1304                 while True:
1305                         self.report_download_page(query, pagenum)
1306                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1307                         request = urllib2.Request(result_url, None, std_headers)
1308                         try:
1309                                 page = urllib2.urlopen(request).read()
1310                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1311                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1312                                 return
1313
1314                         # Extract video identifiers
1315                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1316                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1317                                 if video_id not in already_seen:
1318                                         video_ids.append(video_id)
1319                                         already_seen.add(video_id)
1320                                         if len(video_ids) == n:
1321                                                 # Specified n videos reached
1322                                                 for id in video_ids:
1323                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1324                                                 return
1325
1326                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1327                                 for id in video_ids:
1328                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1329                                 return
1330
1331                         pagenum = pagenum + 1
1332
1333 class YoutubePlaylistIE(InfoExtractor):
1334         """Information Extractor for YouTube playlists."""
1335
1336         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1337         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1338         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1339         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1340         _youtube_ie = None
1341
1342         def __init__(self, youtube_ie, downloader=None):
1343                 InfoExtractor.__init__(self, downloader)
1344                 self._youtube_ie = youtube_ie
1345         
1346         @staticmethod
1347         def suitable(url):
1348                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1349
1350         def report_download_page(self, playlist_id, pagenum):
1351                 """Report attempt to download playlist page with given number."""
1352                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1353
1354         def _real_initialize(self):
1355                 self._youtube_ie.initialize()
1356         
1357         def _real_extract(self, url):
1358                 # Extract playlist id
1359                 mobj = re.match(self._VALID_URL, url)
1360                 if mobj is None:
1361                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1362                         return
1363
1364                 # Download playlist pages
1365                 playlist_id = mobj.group(1)
1366                 video_ids = []
1367                 pagenum = 1
1368
1369                 while True:
1370                         self.report_download_page(playlist_id, pagenum)
1371                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1372                         try:
1373                                 page = urllib2.urlopen(request).read()
1374                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1375                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1376                                 return
1377
1378                         # Extract video identifiers
1379                         ids_in_page = []
1380                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1381                                 if mobj.group(1) not in ids_in_page:
1382                                         ids_in_page.append(mobj.group(1))
1383                         video_ids.extend(ids_in_page)
1384
1385                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1386                                 break
1387                         pagenum = pagenum + 1
1388
1389                 for id in video_ids:
1390                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1391                 return
1392
1393 class YoutubeUserIE(InfoExtractor):
1394         """Information Extractor for YouTube users."""
1395
1396         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1397         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1398         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1399         _youtube_ie = None
1400
1401         def __init__(self, youtube_ie, downloader=None):
1402                 InfoExtractor.__init__(self, downloader)
1403                 self._youtube_ie = youtube_ie
1404         
1405         @staticmethod
1406         def suitable(url):
1407                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1408
1409         def report_download_page(self, username):
1410                 """Report attempt to download user page."""
1411                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1412
1413         def _real_initialize(self):
1414                 self._youtube_ie.initialize()
1415         
1416         def _real_extract(self, url):
1417                 # Extract username
1418                 mobj = re.match(self._VALID_URL, url)
1419                 if mobj is None:
1420                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1421                         return
1422
1423                 # Download user page
1424                 username = mobj.group(1)
1425                 video_ids = []
1426                 pagenum = 1
1427
1428                 self.report_download_page(username)
1429                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1430                 try:
1431                         page = urllib2.urlopen(request).read()
1432                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1433                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1434                         return
1435
1436                 # Extract video identifiers
1437                 ids_in_page = []
1438
1439                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1440                         if mobj.group(1) not in ids_in_page:
1441                                 ids_in_page.append(mobj.group(1))
1442                 video_ids.extend(ids_in_page)
1443
1444                 for id in video_ids:
1445                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1446                 return
1447
class PostProcessor(object):
	"""Base class for all post-processing steps.

	Instances are registered with a downloader via its
	add_post_processor() method. Once a download completes successfully,
	the downloader walks its processor chain, calling run() on each
	element: the first call receives an initial argument, every later
	call receives the previous processor's return value. The chain stops
	as soon as a processor returns None (or when it is exhausted).

	Like InfoExtractor, this class follows a "mutual registration"
	scheme with its downloader.
	"""

	# Downloader this processor is attached to (set via the constructor
	# or set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, extended with a "filepath" key naming the
		downloaded file.

		Return None to stop the postprocessing chain, or an information
		dictionary (possibly the received one with some fields changed)
		to hand to the next processor in the chain.

		This method may also raise a PostProcessingError, which the
		calling downloader takes into account.
		"""
		return information # the base class is a deliberate no-op
1493         
### MAIN PROGRAM ###
# Entry point: parse command-line options, build the FileDownloader with
# all information extractors registered, then download every given URL.
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			# NOTE(review): replaces this script with content fetched over
			# plain HTTP and performs no integrity check on it.
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): install_opener installs a single global opener, so the
		# second call replaces the ProxyHandler opener with the cookie one —
		# confirm the proxy handler is meant to be dropped.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.03.13',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# -b/-m/-d are aliases that store fixed format codes into 'format'.
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		# Non-empty, stripped lines from the batch file are prepended to the
		# URLs given on the command line.
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = open(opts.batchfile, 'r').readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username without password: prompt interactively (not echoed).
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Convert the human-readable limit ("50k", "44.6m") to bytes/s.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		# The search/playlist/user extractors wrap youtube_ie and delegate
		# individual videos to it.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		photobucket_ie = PhotobucketIE()
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			# and/or chain: explicit -o template wins, then -t / -l
			# derived templates, then the plain "<id>.<ext>" default.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		# Registration order matters: more specific extractors first.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(photobucket_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				# -U with no URLs is a valid, complete invocation.
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')