youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # License: Public domain code
   6 import htmlentitydefs
   7 import httplib
   8 import locale
   9 import math
  10 import netrc
  11 import os
  12 import os.path
  13 import re
  14 import socket
  15 import string
  16 import sys
  17 import time
  18 import urllib
  19 import urllib2
  20
# HTTP headers passed along with the urllib2.Request objects built in the
# visible code; the User-Agent string imitates a desktop Firefox 3 browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string holding the ASCII letters and digits. YoutubeIE uses it to
# build the "simplified" title: every run of characters outside this set is
# replaced by an underscore.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  29
  30 class DownloadError(Exception):
  31         """Download Error exception.
  32
  33         This exception may be thrown by FileDownloader objects if they are not
  34         configured to continue on errors. They will contain the appropriate
  35         error message.
  36         """
  37         pass
  38
  39 class SameFileError(Exception):
  40         """Same File exception.
  41
  42         This exception will be thrown by FileDownloader objects if they detect
  43         multiple files would have to be downloaded to the same file on disk.
  44         """
  45         pass
  46
  47 class PostProcessingError(Exception):
  48         """Post Processing exception.
  49
  50         This exception may be raised by PostProcessor's .run() method to
  51         indicate an error in the postprocessing task.
  52         """
  53         pass
  54
  55 class UnavailableFormatError(Exception):
  56         """Unavailable Format exception.
  57
  58         This exception will be thrown when a video is requested
  59         in a format that is not available for that video.
  60         """
  61         pass
  62
  63 class ContentTooShortError(Exception):
  64         """Content Too Short exception.
  65
  66         This exception may be raised by FileDownloader objects when a file they
  67         download is too small for what the server announced first, indicating
  68         the connection was probably interrupted.
  69         """
  70         # Both in bytes
  71         downloaded = None
  72         expected = None
  73
  74         def __init__(self, downloaded, expected):
  75                 self.downloaded = downloaded
  76                 self.expected = expected
  77
  78 class FileDownloader(object):
  79         """File Downloader class.
  80
  81         File downloader objects are the ones responsible of downloading the
  82         actual video file and writing it to disk if the user has requested
  83         it, among some other tasks. In most cases there should be one per
  84         program. As, given a video URL, the downloader doesn't know how to
  85         extract all the needed information, task that InfoExtractors do, it
  86         has to pass the URL to one of them.
  87
  88         For this, file downloader objects have a method that allows
  89         InfoExtractors to be registered in a given order. When it is passed
  90         a URL, the file downloader handles it to the first InfoExtractor it
  91         finds that reports being able to handle it. The InfoExtractor extracts
  92         all the information about the video or videos the URL refers to, and
  93         asks the FileDownloader to process the video information, possibly
  94         downloading the video.
  95
  96         File downloaders accept a lot of parameters. In order not to saturate
  97         the object constructor with arguments, it receives a dictionary of
  98         options instead. These options are available through the params
  99         attribute for the InfoExtractors to use. The FileDownloader also
 100         registers itself as the downloader in charge for the InfoExtractors
 101         that are added to it, so this is a "mutual registration".
 102
 103         Available options:
 104
 105         username:       Username for authentication purposes.
 106         password:       Password for authentication purposes.
 107         usenetrc:       Use netrc for authentication instead.
 108         quiet:          Do not print messages to stdout.
 109         forceurl:       Force printing final URL.
 110         forcetitle:     Force printing title.
 111         simulate:       Do not download the video files.
 112         format:         Video format code.
 113         outtmpl:        Template for output names.
 114         ignoreerrors:   Do not stop on download errors.
 115         ratelimit:      Download speed limit, in bytes/sec.
 116         nooverwrites:   Prevent overwriting files.
 117         """
 118
 119         params = None
 120         _ies = []
 121         _pps = []
 122         _download_retcode = None
 123
 124         def __init__(self, params):
 125                 """Create a FileDownloader object with the given options."""
 126                 self._ies = []
 127                 self._pps = []
 128                 self._download_retcode = 0
 129                 self.params = params
 130
 131         @staticmethod
 132         def pmkdir(filename):
 133                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 134                 components = filename.split(os.sep)
 135                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 136                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 137                 for dir in aggregate:
 138                         if not os.path.exists(dir):
 139                                 os.mkdir(dir)
 140
 141         @staticmethod
 142         def format_bytes(bytes):
 143                 if bytes is None:
 144                         return 'N/A'
 145                 if bytes == 0:
 146                         exponent = 0
 147                 else:
 148                         exponent = long(math.log(float(bytes), 1024.0))
 149                 suffix = 'bkMGTPEZY'[exponent]
 150                 converted = float(bytes) / float(1024**exponent)
 151                 return '%.2f%s' % (converted, suffix)
 152
 153         @staticmethod
 154         def calc_percent(byte_counter, data_len):
 155                 if data_len is None:
 156                         return '---.-%'
 157                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 158
 159         @staticmethod
 160         def calc_eta(start, now, total, current):
 161                 if total is None:
 162                         return '--:--'
 163                 dif = now - start
 164                 if current == 0 or dif < 0.001: # One millisecond
 165                         return '--:--'
 166                 rate = float(current) / dif
 167                 eta = long((float(total) - float(current)) / rate)
 168                 (eta_mins, eta_secs) = divmod(eta, 60)
 169                 if eta_mins > 99:
 170                         return '--:--'
 171                 return '%02d:%02d' % (eta_mins, eta_secs)
 172
 173         @staticmethod
 174         def calc_speed(start, now, bytes):
 175                 dif = now - start
 176                 if bytes == 0 or dif < 0.001: # One millisecond
 177                         return '%10s' % '---b/s'
 178                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 179
 180         @staticmethod
 181         def best_block_size(elapsed_time, bytes):
 182                 new_min = max(bytes / 2.0, 1.0)
 183                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 184                 if elapsed_time < 0.001:
 185                         return int(new_max)
 186                 rate = bytes / elapsed_time
 187                 if rate > new_max:
 188                         return int(new_max)
 189                 if rate < new_min:
 190                         return int(new_min)
 191                 return int(rate)
 192
 193         @staticmethod
 194         def parse_bytes(bytestr):
 195                 """Parse a string indicating a byte quantity into a long integer."""
 196                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 197                 if matchobj is None:
 198                         return None
 199                 number = float(matchobj.group(1))
 200                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 201                 return long(round(number * multiplier))
 202
 203         def add_info_extractor(self, ie):
 204                 """Add an InfoExtractor object to the end of the list."""
 205                 self._ies.append(ie)
 206                 ie.set_downloader(self)
 207
 208         def add_post_processor(self, pp):
 209                 """Add a PostProcessor object to the end of the chain."""
 210                 self._pps.append(pp)
 211                 pp.set_downloader(self)
 212
 213         def to_stdout(self, message, skip_eol=False):
 214                 """Print message to stdout if not in quiet mode."""
 215                 if not self.params.get('quiet', False):
 216                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
 217                         sys.stdout.flush()
 218
 219         def to_stderr(self, message):
 220                 """Print message to stderr."""
 221                 print >>sys.stderr, message
 222
 223         def fixed_template(self):
 224                 """Checks if the output template is fixed."""
 225                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 226
 227         def trouble(self, message=None):
 228                 """Determine action to take when a download problem appears.
 229
 230                 Depending on if the downloader has been configured to ignore
 231                 download errors or not, this method may throw an exception or
 232                 not when errors are found, after printing the message.
 233                 """
 234                 if message is not None:
 235                         self.to_stderr(message)
 236                 if not self.params.get('ignoreerrors', False):
 237                         raise DownloadError(message)
 238                 self._download_retcode = 1
 239
 240         def slow_down(self, start_time, byte_counter):
 241                 """Sleep if the download speed is over the rate limit."""
 242                 rate_limit = self.params.get('ratelimit', None)
 243                 if rate_limit is None or byte_counter == 0:
 244                         return
 245                 now = time.time()
 246                 elapsed = now - start_time
 247                 if elapsed <= 0.0:
 248                         return
 249                 speed = float(byte_counter) / elapsed
 250                 if speed > rate_limit:
 251                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 252
 253         def report_destination(self, filename):
 254                 """Report destination filename."""
 255                 self.to_stdout(u'[download] Destination: %s' % filename)
 256
 257         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 258                 """Report download progress."""
 259                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 260                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 261
 262         def report_finish(self):
 263                 """Report download finished."""
 264                 self.to_stdout(u'')
 265
 266         def process_info(self, info_dict):
 267                 """Process a single dictionary returned by an InfoExtractor."""
 268                 # Forced printings
 269                 if self.params.get('forcetitle', False):
 270                         print info_dict['title'].encode(locale.getpreferredencoding())
 271                 if self.params.get('forceurl', False):
 272                         print info_dict['url'].encode(locale.getpreferredencoding())
 273
 274                 # Do nothing else if in simulate mode
 275                 if self.params.get('simulate', False):
 276                         return
 277
 278                 try:
 279                         template_dict = dict(info_dict)
 280                         template_dict['epoch'] = unicode(long(time.time()))
 281                         filename = self.params['outtmpl'] % template_dict
 282                         self.report_destination(filename)
 283                 except (ValueError, KeyError), err:
 284                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 285                 if self.params['nooverwrites'] and os.path.exists(filename):
 286                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
 287                         return
 288
 289                 try:
 290                         self.pmkdir(filename)
 291                 except (OSError, IOError), err:
 292                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 293                         return
 294
 295                 try:
 296                         outstream = open(filename, 'wb')
 297                 except (OSError, IOError), err:
 298                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 299                         return
 300
 301                 try:
 302                         self._do_download(outstream, info_dict['url'])
 303                         outstream.close()
 304                 except (OSError, IOError), err:
 305                         outstream.close()
 306                         os.remove(filename)
 307                         raise UnavailableFormatError
 308                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 309                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 310                         return
 311                 except (ContentTooShortError, ), err:
 312                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 313                         return
 314
 315                 try:
 316                         self.post_process(filename, info_dict)
 317                 except (PostProcessingError), err:
 318                         self.trouble('ERROR: postprocessing: %s' % str(err))
 319                         return
 320
 321         def download(self, url_list):
 322                 """Download a given list of URLs."""
 323                 if len(url_list) > 1 and self.fixed_template():
 324                         raise SameFileError(self.params['outtmpl'])
 325
 326                 for url in url_list:
 327                         suitable_found = False
 328                         for ie in self._ies:
 329                                 # Go to next InfoExtractor if not suitable
 330                                 if not ie.suitable(url):
 331                                         continue
 332
 333                                 # Suitable InfoExtractor found
 334                                 suitable_found = True
 335
 336                                 # Extract information from URL and process it
 337                                 ie.extract(url)
 338
 339                                 # Suitable InfoExtractor had been found; go to next URL
 340                                 break
 341
 342                         if not suitable_found:
 343                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 344
 345                 return self._download_retcode
 346
 347         def post_process(self, filename, ie_info):
 348                 """Run the postprocessing chain on the given file."""
 349                 info = dict(ie_info)
 350                 info['filepath'] = filename
 351                 for pp in self._pps:
 352                         info = pp.run(info)
 353                         if info is None:
 354                                 break
 355
 356         def _do_download(self, stream, url):
 357                 request = urllib2.Request(url, None, std_headers)
 358                 data = urllib2.urlopen(request)
 359                 data_len = data.info().get('Content-length', None)
 360                 data_len_str = self.format_bytes(data_len)
 361                 byte_counter = 0
 362                 block_size = 1024
 363                 start = time.time()
 364                 while True:
 365                         # Progress message
 366                         percent_str = self.calc_percent(byte_counter, data_len)
 367                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 368                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 369                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 370
 371                         # Download and write
 372                         before = time.time()
 373                         data_block = data.read(block_size)
 374                         after = time.time()
 375                         data_block_len = len(data_block)
 376                         if data_block_len == 0:
 377                                 break
 378                         byte_counter += data_block_len
 379                         stream.write(data_block)
 380                         block_size = self.best_block_size(after - before, data_block_len)
 381
 382                         # Apply rate limit
 383                         self.slow_down(start, byte_counter)
 384
 385                 self.report_finish()
 386                 if data_len is not None and str(byte_counter) != data_len:
 387                         raise ContentTooShortError(byte_counter, long(data_len))
 388
 389 class InfoExtractor(object):
 390         """Information Extractor class.
 391
 392         Information extractors are the classes that, given a URL, extract
 393         information from the video (or videos) the URL refers to. This
 394         information includes the real video URL, the video title and simplified
 395         title, author and others. The information is stored in a dictionary
 396         which is then passed to the FileDownloader. The FileDownloader
 397         processes this information possibly downloading the video to the file
 398         system, among other possible outcomes. The dictionaries must include
 399         the following fields:
 400
 401         id:             Video identifier.
 402         url:            Final video URL.
 403         uploader:       Nickname of the video uploader.
 404         title:          Literal title.
 405         stitle:         Simplified title.
 406         ext:            Video filename extension.
 407
 408         Subclasses of this one should re-define the _real_initialize() and
 409         _real_extract() methods, as well as the suitable() static method.
 410         Probably, they should also be instantiated and added to the main
 411         downloader.
 412         """
 413
 414         _ready = False
 415         _downloader = None
 416
 417         def __init__(self, downloader=None):
 418                 """Constructor. Receives an optional downloader."""
 419                 self._ready = False
 420                 self.set_downloader(downloader)
 421
 422         @staticmethod
 423         def suitable(url):
 424                 """Receives a URL and returns True if suitable for this IE."""
 425                 return False
 426
 427         def initialize(self):
 428                 """Initializes an instance (authentication, etc)."""
 429                 if not self._ready:
 430                         self._real_initialize()
 431                         self._ready = True
 432
 433         def extract(self, url):
 434                 """Extracts URL information and returns it in list of dicts."""
 435                 self.initialize()
 436                 return self._real_extract(url)
 437
 438         def set_downloader(self, downloader):
 439                 """Sets the downloader for this IE."""
 440                 self._downloader = downloader
 441
 442         def _real_initialize(self):
 443                 """Real initialization process. Redefine in subclasses."""
 444                 pass
 445
 446         def _real_extract(self, url):
 447                 """Real extraction process. Redefine in subclasses."""
 448                 pass
 449
 450 class YoutubeIE(InfoExtractor):
 451         """Information extractor for youtube.com."""
 452
        # Pattern of the URLs this extractor accepts; group 2 captures the
        # video identifier (a bare identifier without any URL also matches).
        _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Page requested during initialization to set the site language.
        _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        # Address the login form is posted to.
        _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
        # Address the age confirmation form is posted to.
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in the user's .netrc file.
        _NETRC_MACHINE = 'youtube'
        _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
        # Filename extension for each format code; _real_extract falls back
        # to 'flv' for codes that are not listed here.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
        }
 465
 466         @staticmethod
 467         def suitable(url):
 468                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 469
 470         @staticmethod
 471         def htmlentity_transform(matchobj):
 472                 """Transforms an HTML entity to a Unicode character."""
 473                 entity = matchobj.group(1)
 474
 475                 # Known non-numeric HTML entity
 476                 if entity in htmlentitydefs.name2codepoint:
 477                         return unichr(htmlentitydefs.name2codepoint[entity])
 478
 479                 # Unicode character
 480                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
 481                 if mobj is not None:
 482                         numstr = mobj.group(1)
 483                         if numstr.startswith(u'x'):
 484                                 base = 16
 485                                 numstr = u'0%s' % numstr
 486                         else:
 487                                 base = 10
 488                         return unichr(long(numstr, base))
 489
 490                 # Unknown entity in name, return its literal representation
 491                 return (u'&%s;' % entity)
 492
 493         def report_lang(self):
 494                 """Report attempt to set language."""
 495                 self._downloader.to_stdout(u'[youtube] Setting language')
 496
 497         def report_login(self):
 498                 """Report attempt to log in."""
 499                 self._downloader.to_stdout(u'[youtube] Logging in')
 500
 501         def report_age_confirmation(self):
 502                 """Report attempt to confirm age."""
 503                 self._downloader.to_stdout(u'[youtube] Confirming age')
 504
 505         def report_webpage_download(self, video_id):
 506                 """Report attempt to download webpage."""
 507                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 508
 509         def report_information_extraction(self, video_id):
 510                 """Report attempt to extract video information."""
 511                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 512
 513         def report_video_url(self, video_id, video_real_url):
 514                 """Report extracted video URL."""
 515                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 516
 517         def report_unavailable_format(self, video_id, format):
 518                 """Report extracted video URL."""
 519                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 520
 521         def _real_initialize(self):
 522                 if self._downloader is None:
 523                         return
 524
 525                 username = None
 526                 password = None
 527                 downloader_params = self._downloader.params
 528
 529                 # Attempt to use provided username and password or .netrc data
 530                 if downloader_params.get('username', None) is not None:
 531                         username = downloader_params['username']
 532                         password = downloader_params['password']
 533                 elif downloader_params.get('usenetrc', False):
 534                         try:
 535                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 536                                 if info is not None:
 537                                         username = info[0]
 538                                         password = info[2]
 539                                 else:
 540                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 541                         except (IOError, netrc.NetrcParseError), err:
 542                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 543                                 return
 544
 545                 # Set language
 546                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 547                 try:
 548                         self.report_lang()
 549                         urllib2.urlopen(request).read()
 550                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 551                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 552                         return
 553
 554                 # No authentication to be performed
 555                 if username is None:
 556                         return
 557
 558                 # Log in
 559                 login_form = {
 560                                 'current_form': 'loginForm',
 561                                 'next':         '/',
 562                                 'action_login': 'Log In',
 563                                 'username':     username,
 564                                 'password':     password,
 565                                 }
 566                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 567                 try:
 568                         self.report_login()
 569                         login_results = urllib2.urlopen(request).read()
 570                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 571                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 572                                 return
 573                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 574                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 575                         return
 576
 577                 # Confirm age
 578                 age_form = {
 579                                 'next_url':             '/',
 580                                 'action_confirm':       'Confirm',
 581                                 }
 582                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 583                 try:
 584                         self.report_age_confirmation()
 585                         age_results = urllib2.urlopen(request).read()
 586                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 587                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 588                         return
 589
 590         def _real_extract(self, url):
 591                 # Extract video id from URL
 592                 mobj = re.match(self._VALID_URL, url)
 593                 if mobj is None:
 594                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 595                         return
 596                 video_id = mobj.group(2)
 597
 598                 # Downloader parameters
 599                 best_quality = False
 600                 format_param = None
 601                 quality_index = 0
 602                 if self._downloader is not None:
 603                         params = self._downloader.params
 604                         format_param = params.get('format', None)
 605                         if format_param == '0':
 606                                 format_param = self._available_formats[quality_index]
 607                                 best_quality = True
 608
 609                 while True:
 610                         try:
 611                                 # Extension
 612                                 video_extension = self._video_extensions.get(format_param, 'flv')
 613
 614                                 # Normalize URL, including format
 615                                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
 616                                 if format_param is not None:
 617                                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 618                                 request = urllib2.Request(normalized_url, None, std_headers)
 619                                 try:
 620                                         self.report_webpage_download(video_id)
 621                                         video_webpage = urllib2.urlopen(request).read()
 622                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 623                                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 624                                         return
 625                                 self.report_information_extraction(video_id)
 626
 627                                 # "t" param
 628                                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 629                                 if mobj is None:
 630                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
 631                                         return
 632                                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
 633                                 if format_param is not None:
 634                                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 635                                 self.report_video_url(video_id, video_real_url)
 636
 637                                 # uploader
 638                                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
 639                                 if mobj is None:
 640                                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 641                                         return
 642                                 video_uploader = mobj.group(1)
 643
 644                                 # title
 645                                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 646                                 if mobj is None:
 647                                         self._downloader.trouble(u'ERROR: unable to extract video title')
 648                                         return
 649                                 video_title = mobj.group(1).decode('utf-8')
 650                                 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
 651                                 video_title = video_title.replace(os.sep, u'%')
 652
 653                                 # simplified title
 654                                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 655                                 simple_title = simple_title.strip(ur'_')
 656
 657                                 # Process video information
 658                                 self._downloader.process_info({
 659                                         'id':           video_id.decode('utf-8'),
 660                                         'url':          video_real_url.decode('utf-8'),
 661                                         'uploader':     video_uploader.decode('utf-8'),
 662                                         'title':        video_title,
 663                                         'stitle':       simple_title,
 664                                         'ext':          video_extension.decode('utf-8'),
 665                                 })
 666
 667                                 return
 668
 669                         except UnavailableFormatError, err:
 670                                 if best_quality:
 671                                         if quality_index == len(self._available_formats) - 1:
 672                                                 # I don't ever expect this to happen
 673                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 674                                                 return
 675                                         else:
 676                                                 self.report_unavailable_format(video_id, format_param)
 677                                                 quality_index += 1
 678                                                 format_param = self._available_formats[quality_index]
 679                                                 continue
 680                                 else:
 681                                         self._downloader.trouble('ERROR: format not available for video')
 682                                         return
 683
 684
 685 class MetacafeIE(InfoExtractor):
 686         """Information Extractor for metacafe.com."""
 687
 688         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 689         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 690         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 691         _youtube_ie = None
 692
 693         def __init__(self, youtube_ie, downloader=None):
 694                 InfoExtractor.__init__(self, downloader)
 695                 self._youtube_ie = youtube_ie
 696
 697         @staticmethod
 698         def suitable(url):
 699                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 700
 701         def report_disclaimer(self):
 702                 """Report disclaimer retrieval."""
 703                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 704
 705         def report_age_confirmation(self):
 706                 """Report attempt to confirm age."""
 707                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 708
 709         def report_download_webpage(self, video_id):
 710                 """Report webpage download."""
 711                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 712
 713         def report_extraction(self, video_id):
 714                 """Report information extraction."""
 715                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 716
 717         def _real_initialize(self):
 718                 # Retrieve disclaimer
 719                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 720                 try:
 721                         self.report_disclaimer()
 722                         disclaimer = urllib2.urlopen(request).read()
 723                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 724                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 725                         return
 726
 727                 # Confirm age
 728                 disclaimer_form = {
 729                         'filters': '0',
 730                         'submit': "Continue - I'm over 18",
 731                         }
 732                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 733                 try:
 734                         self.report_age_confirmation()
 735                         disclaimer = urllib2.urlopen(request).read()
 736                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 737                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 738                         return
 739
 740         def _real_extract(self, url):
 741                 # Extract id and simplified title from URL
 742                 mobj = re.match(self._VALID_URL, url)
 743                 if mobj is None:
 744                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 745                         return
 746
 747                 video_id = mobj.group(1)
 748
 749                 # Check if video comes from YouTube
 750                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 751                 if mobj2 is not None:
 752                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 753                         return
 754
 755                 simple_title = mobj.group(2).decode('utf-8')
 756                 video_extension = 'flv'
 757
 758                 # Retrieve video webpage to extract further information
 759                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 760                 try:
 761                         self.report_download_webpage(video_id)
 762                         webpage = urllib2.urlopen(request).read()
 763                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 764                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 765                         return
 766
 767                 # Extract URL, uploader and title from webpage
 768                 self.report_extraction(video_id)
 769                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
 770                 if mobj is None:
 771                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 772                         return
 773                 mediaURL = urllib.unquote(mobj.group(1))
 774
 775                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 776                 if mobj is None:
 777                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
 778                         return
 779                 gdaKey = mobj.group(1)
 780
 781                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 782
 783                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 784                 if mobj is None:
 785                         self._downloader.trouble(u'ERROR: unable to extract title')
 786                         return
 787                 video_title = mobj.group(1).decode('utf-8')
 788
 789                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
 790                 if mobj is None:
 791                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 792                         return
 793                 video_uploader = mobj.group(1)
 794
 795                 try:
 796                         # Process video information
 797                         self._downloader.process_info({
 798                                 'id':           video_id.decode('utf-8'),
 799                                 'url':          video_url.decode('utf-8'),
 800                                 'uploader':     video_uploader.decode('utf-8'),
 801                                 'title':        video_title,
 802                                 'stitle':       simple_title,
 803                                 'ext':          video_extension.decode('utf-8'),
 804                         })
 805                 except UnavailableFormatError:
 806                         self._downloader.trouble(u'ERROR: format not available for video')
 807
 808
 809 class YoutubeSearchIE(InfoExtractor):
 810         """Information Extractor for YouTube search queries."""
 811         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 812         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 813         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 814         _MORE_PAGES_INDICATOR = r'>Next</a>'
 815         _youtube_ie = None
 816         _max_youtube_results = 1000
 817
 818         def __init__(self, youtube_ie, downloader=None):
 819                 InfoExtractor.__init__(self, downloader)
 820                 self._youtube_ie = youtube_ie
 821
 822         @staticmethod
 823         def suitable(url):
 824                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 825
 826         def report_download_page(self, query, pagenum):
 827                 """Report attempt to download playlist page with given number."""
 828                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 829
 830         def _real_initialize(self):
 831                 self._youtube_ie.initialize()
 832
 833         def _real_extract(self, query):
 834                 mobj = re.match(self._VALID_QUERY, query)
 835                 if mobj is None:
 836                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 837                         return
 838
 839                 prefix, query = query.split(':')
 840                 prefix = prefix[8:]
 841                 if prefix == '':
 842                         self._download_n_results(query, 1)
 843                         return
 844                 elif prefix == 'all':
 845                         self._download_n_results(query, self._max_youtube_results)
 846                         return
 847                 else:
 848                         try:
 849                                 n = int(prefix)
 850                                 if n <= 0:
 851                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 852                                         return
 853                                 elif n > self._max_youtube_results:
 854                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
 855                                         n = self._max_youtube_results
 856                                 self._download_n_results(query, n)
 857                                 return
 858                         except ValueError: # parsing prefix as int fails
 859                                 self._download_n_results(query, 1)
 860                                 return
 861
 862         def _download_n_results(self, query, n):
 863                 """Downloads a specified number of results for a query"""
 864
 865                 video_ids = []
 866                 already_seen = set()
 867                 pagenum = 1
 868
 869                 while True:
 870                         self.report_download_page(query, pagenum)
 871                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 872                         request = urllib2.Request(result_url, None, std_headers)
 873                         try:
 874                                 page = urllib2.urlopen(request).read()
 875                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 876                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 877                                 return
 878
 879                         # Extract video identifiers
 880                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 881                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 882                                 if video_id not in already_seen:
 883                                         video_ids.append(video_id)
 884                                         already_seen.add(video_id)
 885                                         if len(video_ids) == n:
 886                                                 # Specified n videos reached
 887                                                 for id in video_ids:
 888                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 889                                                 return
 890
 891                         if self._MORE_PAGES_INDICATOR not in page:
 892                                 for id in video_ids:
 893                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 894                                 return
 895
 896                         pagenum = pagenum + 1
 897
 898 class YoutubePlaylistIE(InfoExtractor):
 899         """Information Extractor for YouTube playlists."""
 900
 901         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 902         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
 903         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 904         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 905         _youtube_ie = None
 906
 907         def __init__(self, youtube_ie, downloader=None):
 908                 InfoExtractor.__init__(self, downloader)
 909                 self._youtube_ie = youtube_ie
 910
 911         @staticmethod
 912         def suitable(url):
 913                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 914
 915         def report_download_page(self, playlist_id, pagenum):
 916                 """Report attempt to download playlist page with given number."""
 917                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 918
 919         def _real_initialize(self):
 920                 self._youtube_ie.initialize()
 921
 922         def _real_extract(self, url):
 923                 # Extract playlist id
 924                 mobj = re.match(self._VALID_URL, url)
 925                 if mobj is None:
 926                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
 927                         return
 928
 929                 # Download playlist pages
 930                 playlist_id = mobj.group(1)
 931                 video_ids = []
 932                 pagenum = 1
 933
 934                 while True:
 935                         self.report_download_page(playlist_id, pagenum)
 936                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 937                         try:
 938                                 page = urllib2.urlopen(request).read()
 939                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 940                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 941                                 return
 942
 943                         # Extract video identifiers
 944                         ids_in_page = []
 945                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 946                                 if mobj.group(1) not in ids_in_page:
 947                                         ids_in_page.append(mobj.group(1))
 948                         video_ids.extend(ids_in_page)
 949
 950                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 951                                 break
 952                         pagenum = pagenum + 1
 953
 954                 for id in video_ids:
 955                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 956                 return
 957
 958 class PostProcessor(object):
 959         """Post Processor class.
 960
 961         PostProcessor objects can be added to downloaders with their
 962         add_post_processor() method. When the downloader has finished a
 963         successful download, it will take its internal chain of PostProcessors
 964         and start calling the run() method on each one of them, first with
 965         an initial argument and then with the returned value of the previous
 966         PostProcessor.
 967
 968         The chain will be stopped if one of them ever returns None or the end
 969         of the chain is reached.
 970
 971         PostProcessor objects follow a "mutual registration" process similar
 972         to InfoExtractor objects.
 973         """
 974
 975         _downloader = None
 976
 977         def __init__(self, downloader=None):
 978                 self._downloader = downloader
 979
 980         def set_downloader(self, downloader):
 981                 """Sets the downloader for this PP."""
 982                 self._downloader = downloader
 983
 984         def run(self, information):
 985                 """Run the PostProcessor.
 986
 987                 The "information" argument is a dictionary like the ones
 988                 composed by InfoExtractors. The only difference is that this
 989                 one has an extra field called "filepath" that points to the
 990                 downloaded file.
 991
 992                 When this method returns None, the postprocessing chain is
 993                 stopped. However, this method may return an information
 994                 dictionary that will be passed to the next postprocessing
 995                 object in the chain. It can be the one it received after
 996                 changing some fields.
 997
 998                 In addition, this method may raise a PostProcessingError
 999                 exception that will be taken into account by the downloader
1000                 it was called from.
1001                 """
1002                 return information # by default, do nothing
1003
1004 ### MAIN PROGRAM ###
if __name__ == '__main__':
        # Command line entry point: parse the options, build the information
        # extractors and the file downloader, then download every given URL.
        # The exit status is the value returned by FileDownloader.download().
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # General configuration
                # A single opener carries both handlers; installing two openers
                # in a row would just replace the first one with the second.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='INTERNAL',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                # Every format flag appends to the same list. With store_const
                # the aliases would store a plain string, so that "-m" ('17')
                # looked like two formats to the length check below and
                # "-f X -b" silently dropped X.
                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='append', dest='format', metavar='FMT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='append_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='append_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='append_const', dest='format', help='alias for -f 22', const='22')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = [line.strip() for line in batchfd]
                                finally:
                                        # Release the handle even if reading fails
                                        batchfd.close()
                                # Blank lines carry no URL
                                batchurls = [x for x in batchurls if len(x) > 0]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if len(all_urls) < 1:
                        parser.error(u'you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit
                if opts.format is not None and len(opts.format) > 1:
                        parser.error(u'pass at most one of the video format option flags (-f, -b, -m, -d)')
                if opts.format is None:
                        real_format = None
                else:
                        real_format = opts.format[0]

                # Output template: an explicit (non-empty) template wins, then
                # the title based names, then the plain video id
                if opts.outtmpl:
                        outtmpl = opts.outtmpl.decode(locale.getpreferredencoding())
                elif opts.usetitle:
                        outtmpl = u'%(stitle)s-%(id)s.%(ext)s'
                elif opts.useliteral:
                        outtmpl = u'%(title)s-%(id)s.%(ext)s'
                else:
                        outtmpl = u'%(id)s.%(ext)s'

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)

                # File downloader
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': real_format,
                        'outtmpl': outtmpl,
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')