youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # License: Public domain code
   6 import htmlentitydefs
   7 import httplib
   8 import locale
   9 import math
  10 import netrc
  11 import os
  12 import os.path
  13 import re
  14 import socket
  15 import string
  16 import sys
  17 import time
  18 import urllib
  19 import urllib2
  20
  21 std_headers = {
  22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
  23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  25         'Accept-Language': 'en-us,en;q=0.5',
  26 }
  27
  28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  29
  30 class DownloadError(Exception):
  31         """Download Error exception.
  32
  33         This exception may be thrown by FileDownloader objects if they are not
  34         configured to continue on errors. They will contain the appropriate
  35         error message.
  36         """
  37         pass
  38
  39 class SameFileError(Exception):
  40         """Same File exception.
  41
  42         This exception will be thrown by FileDownloader objects if they detect
  43         multiple files would have to be downloaded to the same file on disk.
  44         """
  45         pass
  46
  47 class PostProcessingError(Exception):
  48         """Post Processing exception.
  49
  50         This exception may be raised by PostProcessor's .run() method to
  51         indicate an error in the postprocessing task.
  52         """
  53         pass
  54
  55 class UnavailableFormatError(Exception):
  56         """Unavailable Format exception.
  57
  58         This exception will be thrown when a video is requested
  59         in a format that is not available for that video.
  60         """
  61         pass
  62
  63 class ContentTooShortError(Exception):
  64         """Content Too Short exception.
  65
  66         This exception may be raised by FileDownloader objects when a file they
  67         download is too small for what the server announced first, indicating
  68         the connection was probably interrupted.
  69         """
  70         # Both in bytes
  71         downloaded = None
  72         expected = None
  73
  74         def __init__(self, downloaded, expected):
  75                 self.downloaded = downloaded
  76                 self.expected = expected
  77
  78 class FileDownloader(object):
  79         """File Downloader class.
  80
  81         File downloader objects are the ones responsible of downloading the
  82         actual video file and writing it to disk if the user has requested
  83         it, among some other tasks. In most cases there should be one per
  84         program. As, given a video URL, the downloader doesn't know how to
  85         extract all the needed information, task that InfoExtractors do, it
  86         has to pass the URL to one of them.
  87
  88         For this, file downloader objects have a method that allows
  89         InfoExtractors to be registered in a given order. When it is passed
  90         a URL, the file downloader handles it to the first InfoExtractor it
  91         finds that reports being able to handle it. The InfoExtractor extracts
  92         all the information about the video or videos the URL refers to, and
  93         asks the FileDownloader to process the video information, possibly
  94         downloading the video.
  95
  96         File downloaders accept a lot of parameters. In order not to saturate
  97         the object constructor with arguments, it receives a dictionary of
  98         options instead. These options are available through the params
  99         attribute for the InfoExtractors to use. The FileDownloader also
 100         registers itself as the downloader in charge for the InfoExtractors
 101         that are added to it, so this is a "mutual registration".
 102
 103         Available options:
 104
 105         username:       Username for authentication purposes.
 106         password:       Password for authentication purposes.
 107         usenetrc:       Use netrc for authentication instead.
 108         quiet:          Do not print messages to stdout.
 109         forceurl:       Force printing final URL.
 110         forcetitle:     Force printing title.
 111         simulate:       Do not download the video files.
 112         format:         Video format code.
 113         outtmpl:        Template for output names.
 114         ignoreerrors:   Do not stop on download errors.
 115         ratelimit:      Download speed limit, in bytes/sec.
 116         nooverwrites:   Prevent overwriting files.
 117         """
 118
 119         params = None
 120         _ies = []
 121         _pps = []
 122         _download_retcode = None
 123
 124         def __init__(self, params):
 125                 """Create a FileDownloader object with the given options."""
 126                 self._ies = []
 127                 self._pps = []
 128                 self._download_retcode = 0
 129                 self.params = params
 130
 131         @staticmethod
 132         def pmkdir(filename):
 133                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 134                 components = filename.split(os.sep)
 135                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 136                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 137                 for dir in aggregate:
 138                         if not os.path.exists(dir):
 139                                 os.mkdir(dir)
 140
 141         @staticmethod
 142         def format_bytes(bytes):
 143                 if bytes is None:
 144                         return 'N/A'
 145                 if bytes == 0:
 146                         exponent = 0
 147                 else:
 148                         exponent = long(math.log(float(bytes), 1024.0))
 149                 suffix = 'bkMGTPEZY'[exponent]
 150                 converted = float(bytes) / float(1024**exponent)
 151                 return '%.2f%s' % (converted, suffix)
 152
 153         @staticmethod
 154         def calc_percent(byte_counter, data_len):
 155                 if data_len is None:
 156                         return '---.-%'
 157                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 158
 159         @staticmethod
 160         def calc_eta(start, now, total, current):
 161                 if total is None:
 162                         return '--:--'
 163                 dif = now - start
 164                 if current == 0 or dif < 0.001: # One millisecond
 165                         return '--:--'
 166                 rate = float(current) / dif
 167                 eta = long((float(total) - float(current)) / rate)
 168                 (eta_mins, eta_secs) = divmod(eta, 60)
 169                 if eta_mins > 99:
 170                         return '--:--'
 171                 return '%02d:%02d' % (eta_mins, eta_secs)
 172
 173         @staticmethod
 174         def calc_speed(start, now, bytes):
 175                 dif = now - start
 176                 if bytes == 0 or dif < 0.001: # One millisecond
 177                         return '%10s' % '---b/s'
 178                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 179
 180         @staticmethod
 181         def best_block_size(elapsed_time, bytes):
 182                 new_min = max(bytes / 2.0, 1.0)
 183                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 184                 if elapsed_time < 0.001:
 185                         return int(new_max)
 186                 rate = bytes / elapsed_time
 187                 if rate > new_max:
 188                         return int(new_max)
 189                 if rate < new_min:
 190                         return int(new_min)
 191                 return int(rate)
 192
 193         @staticmethod
 194         def parse_bytes(bytestr):
 195                 """Parse a string indicating a byte quantity into a long integer."""
 196                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 197                 if matchobj is None:
 198                         return None
 199                 number = float(matchobj.group(1))
 200                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 201                 return long(round(number * multiplier))
 202
 203         @staticmethod
 204         def verify_url(url):
 205                 """Verify a URL is valid and data could be downloaded."""
 206                 request = urllib2.Request(url, None, std_headers)
 207                 data = urllib2.urlopen(request)
 208                 data.read(1)
 209                 data.close()
 210
 211         def add_info_extractor(self, ie):
 212                 """Add an InfoExtractor object to the end of the list."""
 213                 self._ies.append(ie)
 214                 ie.set_downloader(self)
 215
 216         def add_post_processor(self, pp):
 217                 """Add a PostProcessor object to the end of the chain."""
 218                 self._pps.append(pp)
 219                 pp.set_downloader(self)
 220
 221         def to_stdout(self, message, skip_eol=False):
 222                 """Print message to stdout if not in quiet mode."""
 223                 if not self.params.get('quiet', False):
 224                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
 225                         sys.stdout.flush()
 226
 227         def to_stderr(self, message):
 228                 """Print message to stderr."""
 229                 print >>sys.stderr, message
 230
 231         def fixed_template(self):
 232                 """Checks if the output template is fixed."""
 233                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 234
 235         def trouble(self, message=None):
 236                 """Determine action to take when a download problem appears.
 237
 238                 Depending on if the downloader has been configured to ignore
 239                 download errors or not, this method may throw an exception or
 240                 not when errors are found, after printing the message.
 241                 """
 242                 if message is not None:
 243                         self.to_stderr(message)
 244                 if not self.params.get('ignoreerrors', False):
 245                         raise DownloadError(message)
 246                 self._download_retcode = 1
 247
 248         def slow_down(self, start_time, byte_counter):
 249                 """Sleep if the download speed is over the rate limit."""
 250                 rate_limit = self.params.get('ratelimit', None)
 251                 if rate_limit is None or byte_counter == 0:
 252                         return
 253                 now = time.time()
 254                 elapsed = now - start_time
 255                 if elapsed <= 0.0:
 256                         return
 257                 speed = float(byte_counter) / elapsed
 258                 if speed > rate_limit:
 259                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 260
 261         def report_destination(self, filename):
 262                 """Report destination filename."""
 263                 self.to_stdout(u'[download] Destination: %s' % filename)
 264
 265         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 266                 """Report download progress."""
 267                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 268                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 269
 270         def report_finish(self):
 271                 """Report download finished."""
 272                 self.to_stdout(u'')
 273
 274         def process_info(self, info_dict):
 275                 """Process a single dictionary returned by an InfoExtractor."""
 276                 # Do nothing else if in simulate mode
 277                 if self.params.get('simulate', False):
 278                         try:
 279                                 self.verify_url(info_dict['url'])
 280                         except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
 281                                 raise UnavailableFormatError
 282
 283                         # Forced printings
 284                         if self.params.get('forcetitle', False):
 285                                 print info_dict['title'].encode(locale.getpreferredencoding())
 286                         if self.params.get('forceurl', False):
 287                                 print info_dict['url'].encode(locale.getpreferredencoding())
 288
 289                         return
 290
 291                 try:
 292                         template_dict = dict(info_dict)
 293                         template_dict['epoch'] = unicode(long(time.time()))
 294                         filename = self.params['outtmpl'] % template_dict
 295                         self.report_destination(filename)
 296                 except (ValueError, KeyError), err:
 297                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 298                 if self.params['nooverwrites'] and os.path.exists(filename):
 299                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
 300                         return
 301
 302                 try:
 303                         self.pmkdir(filename)
 304                 except (OSError, IOError), err:
 305                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 306                         return
 307
 308                 try:
 309                         outstream = open(filename, 'ab')
 310                 except (OSError, IOError), err:
 311                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 312                         return
 313
 314                 try:
 315                         self._do_download(outstream, info_dict['url'])
 316                         outstream.close()
 317                 except (OSError, IOError), err:
 318                         outstream.close()
 319                         os.remove(filename)
 320                         raise UnavailableFormatError
 321                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 322                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 323                         return
 324                 except (ContentTooShortError, ), err:
 325                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 326                         return
 327
 328                 try:
 329                         self.post_process(filename, info_dict)
 330                 except (PostProcessingError), err:
 331                         self.trouble('ERROR: postprocessing: %s' % str(err))
 332                         return
 333
 334         def download(self, url_list):
 335                 """Download a given list of URLs."""
 336                 if len(url_list) > 1 and self.fixed_template():
 337                         raise SameFileError(self.params['outtmpl'])
 338
 339                 for url in url_list:
 340                         suitable_found = False
 341                         for ie in self._ies:
 342                                 # Go to next InfoExtractor if not suitable
 343                                 if not ie.suitable(url):
 344                                         continue
 345
 346                                 # Suitable InfoExtractor found
 347                                 suitable_found = True
 348
 349                                 # Extract information from URL and process it
 350                                 ie.extract(url)
 351
 352                                 # Suitable InfoExtractor had been found; go to next URL
 353                                 break
 354
 355                         if not suitable_found:
 356                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 357
 358                 return self._download_retcode
 359
 360         def post_process(self, filename, ie_info):
 361                 """Run the postprocessing chain on the given file."""
 362                 info = dict(ie_info)
 363                 info['filepath'] = filename
 364                 for pp in self._pps:
 365                         info = pp.run(info)
 366                         if info is None:
 367                                 break
 368
 369         def _do_download(self, stream, url):
 370                 request = urllib2.Request(url, None, std_headers)
 371                 # Resume transfer if filesize is non-zero
 372                 resume_len = stream.tell()
 373                 if self.params["continue"] and resume_len != 0:
 374                         print "[download] Resuming download at byte %d" % resume_len
 375                         request.add_header("Range","bytes=%d-" % resume_len)
 376                 else:
 377                         stream.close()
 378                         stream = open(stream.name,'wb')
 379                 try:
 380                         data = urllib2.urlopen(request)
 381                 except urllib2.HTTPError, e:
 382                         if not e.code == 416: #  416 is 'Requested range not satisfiable'
 383                                 raise
 384                         data = urllib2.urlopen(url)
 385                         if int(data.info()['Content-Length']) == resume_len:
 386                                 print '[download] %s has already been downloaded' % stream.name
 387                                 return
 388                         else:
 389                                 print "[download] Unable to resume, restarting download from the beginning"
 390                                 stream.close()
 391                                 stream = open(stream.name,'wb')
 392                 data_len = data.info().get('Content-length', None)
 393                 data_len_str = self.format_bytes(data_len)
 394                 byte_counter = 0
 395                 block_size = 1024
 396                 start = time.time()
 397                 while True:
 398                         # Progress message
 399                         percent_str = self.calc_percent(byte_counter, data_len)
 400                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 401                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 402                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 403
 404                         # Download and write
 405                         before = time.time()
 406                         data_block = data.read(block_size)
 407                         after = time.time()
 408                         data_block_len = len(data_block)
 409                         if data_block_len == 0:
 410                                 break
 411                         byte_counter += data_block_len
 412                         stream.write(data_block)
 413                         block_size = self.best_block_size(after - before, data_block_len)
 414
 415                         # Apply rate limit
 416                         self.slow_down(start, byte_counter)
 417
 418                 self.report_finish()
 419                 if data_len is not None and str(byte_counter) != data_len:
 420                         raise ContentTooShortError(byte_counter, long(data_len))
 421
 422 class InfoExtractor(object):
 423         """Information Extractor class.
 424
 425         Information extractors are the classes that, given a URL, extract
 426         information from the video (or videos) the URL refers to. This
 427         information includes the real video URL, the video title and simplified
 428         title, author and others. The information is stored in a dictionary
 429         which is then passed to the FileDownloader. The FileDownloader
 430         processes this information possibly downloading the video to the file
 431         system, among other possible outcomes. The dictionaries must include
 432         the following fields:
 433
 434         id:             Video identifier.
 435         url:            Final video URL.
 436         uploader:       Nickname of the video uploader.
 437         title:          Literal title.
 438         stitle:         Simplified title.
 439         ext:            Video filename extension.
 440
 441         Subclasses of this one should re-define the _real_initialize() and
 442         _real_extract() methods, as well as the suitable() static method.
 443         Probably, they should also be instantiated and added to the main
 444         downloader.
 445         """
 446
 447         _ready = False
 448         _downloader = None
 449
 450         def __init__(self, downloader=None):
 451                 """Constructor. Receives an optional downloader."""
 452                 self._ready = False
 453                 self.set_downloader(downloader)
 454
 455         @staticmethod
 456         def suitable(url):
 457                 """Receives a URL and returns True if suitable for this IE."""
 458                 return False
 459
 460         def initialize(self):
 461                 """Initializes an instance (authentication, etc)."""
 462                 if not self._ready:
 463                         self._real_initialize()
 464                         self._ready = True
 465
 466         def extract(self, url):
 467                 """Extracts URL information and returns it in list of dicts."""
 468                 self.initialize()
 469                 return self._real_extract(url)
 470
 471         def set_downloader(self, downloader):
 472                 """Sets the downloader for this IE."""
 473                 self._downloader = downloader
 474
 475         def _real_initialize(self):
 476                 """Real initialization process. Redefine in subclasses."""
 477                 pass
 478
 479         def _real_extract(self, url):
 480                 """Real extraction process. Redefine in subclasses."""
 481                 pass
 482
 483 class YoutubeIE(InfoExtractor):
 484         """Information extractor for youtube.com."""
 485
 486         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 487         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 488         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 489         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 490         _NETRC_MACHINE = 'youtube'
 491         _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
 492         _video_extensions = {
 493                 '13': '3gp',
 494                 '17': 'mp4',
 495                 '18': 'mp4',
 496                 '22': 'mp4',
 497         }
 498
 499         @staticmethod
 500         def suitable(url):
 501                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 502
 503         @staticmethod
 504         def htmlentity_transform(matchobj):
 505                 """Transforms an HTML entity to a Unicode character."""
 506                 entity = matchobj.group(1)
 507
 508                 # Known non-numeric HTML entity
 509                 if entity in htmlentitydefs.name2codepoint:
 510                         return unichr(htmlentitydefs.name2codepoint[entity])
 511
 512                 # Unicode character
 513                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
 514                 if mobj is not None:
 515                         numstr = mobj.group(1)
 516                         if numstr.startswith(u'x'):
 517                                 base = 16
 518                                 numstr = u'0%s' % numstr
 519                         else:
 520                                 base = 10
 521                         return unichr(long(numstr, base))
 522
 523                 # Unknown entity in name, return its literal representation
 524                 return (u'&%s;' % entity)
 525
 526         def report_lang(self):
 527                 """Report attempt to set language."""
 528                 self._downloader.to_stdout(u'[youtube] Setting language')
 529
 530         def report_login(self):
 531                 """Report attempt to log in."""
 532                 self._downloader.to_stdout(u'[youtube] Logging in')
 533
 534         def report_age_confirmation(self):
 535                 """Report attempt to confirm age."""
 536                 self._downloader.to_stdout(u'[youtube] Confirming age')
 537
 538         def report_webpage_download(self, video_id):
 539                 """Report attempt to download webpage."""
 540                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 541
 542         def report_information_extraction(self, video_id):
 543                 """Report attempt to extract video information."""
 544                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 545
 546         def report_video_url(self, video_id, video_real_url):
 547                 """Report extracted video URL."""
 548                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 549
 550         def report_unavailable_format(self, video_id, format):
 551                 """Report extracted video URL."""
 552                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 553
 554         def _real_initialize(self):
 555                 if self._downloader is None:
 556                         return
 557
 558                 username = None
 559                 password = None
 560                 downloader_params = self._downloader.params
 561
 562                 # Attempt to use provided username and password or .netrc data
 563                 if downloader_params.get('username', None) is not None:
 564                         username = downloader_params['username']
 565                         password = downloader_params['password']
 566                 elif downloader_params.get('usenetrc', False):
 567                         try:
 568                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 569                                 if info is not None:
 570                                         username = info[0]
 571                                         password = info[2]
 572                                 else:
 573                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 574                         except (IOError, netrc.NetrcParseError), err:
 575                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 576                                 return
 577
 578                 # Set language
 579                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 580                 try:
 581                         self.report_lang()
 582                         urllib2.urlopen(request).read()
 583                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 584                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 585                         return
 586
 587                 # No authentication to be performed
 588                 if username is None:
 589                         return
 590
 591                 # Log in
 592                 login_form = {
 593                                 'current_form': 'loginForm',
 594                                 'next':         '/',
 595                                 'action_login': 'Log In',
 596                                 'username':     username,
 597                                 'password':     password,
 598                                 }
 599                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 600                 try:
 601                         self.report_login()
 602                         login_results = urllib2.urlopen(request).read()
 603                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 604                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 605                                 return
 606                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 607                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 608                         return
 609
 610                 # Confirm age
 611                 age_form = {
 612                                 'next_url':             '/',
 613                                 'action_confirm':       'Confirm',
 614                                 }
 615                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 616                 try:
 617                         self.report_age_confirmation()
 618                         age_results = urllib2.urlopen(request).read()
 619                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 620                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 621                         return
 622
 623         def _real_extract(self, url):
 624                 # Extract video id from URL
 625                 mobj = re.match(self._VALID_URL, url)
 626                 if mobj is None:
 627                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 628                         return
 629                 video_id = mobj.group(2)
 630
 631                 # Downloader parameters
 632                 best_quality = False
 633                 format_param = None
 634                 quality_index = 0
 635                 if self._downloader is not None:
 636                         params = self._downloader.params
 637                         format_param = params.get('format', None)
 638                         if format_param == '0':
 639                                 format_param = self._available_formats[quality_index]
 640                                 best_quality = True
 641
 642                 while True:
 643                         # Extension
 644                         video_extension = self._video_extensions.get(format_param, 'flv')
 645
 646                         # Normalize URL, including format
 647                         normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
 648                         if format_param is not None:
 649                                 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 650                         request = urllib2.Request(normalized_url, None, std_headers)
 651                         try:
 652                                 self.report_webpage_download(video_id)
 653                                 video_webpage = urllib2.urlopen(request).read()
 654                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 655                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 656                                 return
 657                         self.report_information_extraction(video_id)
 658
 659                         # "t" param
 660                         mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 661                         if mobj is None:
 662                                 self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
 663                                 return
 664                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
 665                         if format_param is not None:
 666                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 667                         self.report_video_url(video_id, video_real_url)
 668
 669                         # uploader
 670                         mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
 671                         if mobj is None:
 672                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 673                                 return
 674                         video_uploader = mobj.group(1)
 675
 676                         # title
 677                         mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 678                         if mobj is None:
 679                                 self._downloader.trouble(u'ERROR: unable to extract video title')
 680                                 return
 681                         video_title = mobj.group(1).decode('utf-8')
 682                         video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
 683                         video_title = video_title.replace(os.sep, u'%')
 684
 685                         # simplified title
 686                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 687                         simple_title = simple_title.strip(ur'_')
 688
 689                         try:
 690                                 # Process video information
 691                                 self._downloader.process_info({
 692                                         'id':           video_id.decode('utf-8'),
 693                                         'url':          video_real_url.decode('utf-8'),
 694                                         'uploader':     video_uploader.decode('utf-8'),
 695                                         'title':        video_title,
 696                                         'stitle':       simple_title,
 697                                         'ext':          video_extension.decode('utf-8'),
 698                                 })
 699
 700                                 return
 701
 702                         except UnavailableFormatError, err:
 703                                 if best_quality:
 704                                         if quality_index == len(self._available_formats) - 1:
 705                                                 # I don't ever expect this to happen
 706                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 707                                                 return
 708                                         else:
 709                                                 self.report_unavailable_format(video_id, format_param)
 710                                                 quality_index += 1
 711                                                 format_param = self._available_formats[quality_index]
 712                                                 continue
 713                                 else:
 714                                         self._downloader.trouble('ERROR: format not available for video')
 715                                         return
 716
 717
 718 class MetacafeIE(InfoExtractor):
 719         """Information Extractor for metacafe.com."""
 720
 721         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 722         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 723         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 724         _youtube_ie = None
 725
 726         def __init__(self, youtube_ie, downloader=None):
 727                 InfoExtractor.__init__(self, downloader)
 728                 self._youtube_ie = youtube_ie
 729
 730         @staticmethod
 731         def suitable(url):
 732                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 733
 734         def report_disclaimer(self):
 735                 """Report disclaimer retrieval."""
 736                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 737
 738         def report_age_confirmation(self):
 739                 """Report attempt to confirm age."""
 740                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 741
 742         def report_download_webpage(self, video_id):
 743                 """Report webpage download."""
 744                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 745
 746         def report_extraction(self, video_id):
 747                 """Report information extraction."""
 748                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 749
 750         def _real_initialize(self):
 751                 # Retrieve disclaimer
 752                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 753                 try:
 754                         self.report_disclaimer()
 755                         disclaimer = urllib2.urlopen(request).read()
 756                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 757                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 758                         return
 759
 760                 # Confirm age
 761                 disclaimer_form = {
 762                         'filters': '0',
 763                         'submit': "Continue - I'm over 18",
 764                         }
 765                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 766                 try:
 767                         self.report_age_confirmation()
 768                         disclaimer = urllib2.urlopen(request).read()
 769                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 770                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 771                         return
 772
 773         def _real_extract(self, url):
 774                 # Extract id and simplified title from URL
 775                 mobj = re.match(self._VALID_URL, url)
 776                 if mobj is None:
 777                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 778                         return
 779
 780                 video_id = mobj.group(1)
 781
 782                 # Check if video comes from YouTube
 783                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 784                 if mobj2 is not None:
 785                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 786                         return
 787
 788                 simple_title = mobj.group(2).decode('utf-8')
 789                 video_extension = 'flv'
 790
 791                 # Retrieve video webpage to extract further information
 792                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 793                 try:
 794                         self.report_download_webpage(video_id)
 795                         webpage = urllib2.urlopen(request).read()
 796                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 797                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 798                         return
 799
 800                 # Extract URL, uploader and title from webpage
 801                 self.report_extraction(video_id)
 802                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
 803                 if mobj is None:
 804                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 805                         return
 806                 mediaURL = urllib.unquote(mobj.group(1))
 807
 808                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 809                 if mobj is None:
 810                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
 811                         return
 812                 gdaKey = mobj.group(1)
 813
 814                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 815
 816                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 817                 if mobj is None:
 818                         self._downloader.trouble(u'ERROR: unable to extract title')
 819                         return
 820                 video_title = mobj.group(1).decode('utf-8')
 821
 822                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
 823                 if mobj is None:
 824                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 825                         return
 826                 video_uploader = mobj.group(1)
 827
 828                 try:
 829                         # Process video information
 830                         self._downloader.process_info({
 831                                 'id':           video_id.decode('utf-8'),
 832                                 'url':          video_url.decode('utf-8'),
 833                                 'uploader':     video_uploader.decode('utf-8'),
 834                                 'title':        video_title,
 835                                 'stitle':       simple_title,
 836                                 'ext':          video_extension.decode('utf-8'),
 837                         })
 838                 except UnavailableFormatError:
 839                         self._downloader.trouble(u'ERROR: format not available for video')
 840
 841
 842 class YoutubeSearchIE(InfoExtractor):
 843         """Information Extractor for YouTube search queries."""
 844         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 845         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 846         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 847         _MORE_PAGES_INDICATOR = r'>Next</a>'
 848         _youtube_ie = None
 849         _max_youtube_results = 1000
 850
 851         def __init__(self, youtube_ie, downloader=None):
 852                 InfoExtractor.__init__(self, downloader)
 853                 self._youtube_ie = youtube_ie
 854
 855         @staticmethod
 856         def suitable(url):
 857                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 858
 859         def report_download_page(self, query, pagenum):
 860                 """Report attempt to download playlist page with given number."""
 861                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 862
 863         def _real_initialize(self):
 864                 self._youtube_ie.initialize()
 865
 866         def _real_extract(self, query):
 867                 mobj = re.match(self._VALID_QUERY, query)
 868                 if mobj is None:
 869                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 870                         return
 871
 872                 prefix, query = query.split(':')
 873                 prefix = prefix[8:]
 874                 if prefix == '':
 875                         self._download_n_results(query, 1)
 876                         return
 877                 elif prefix == 'all':
 878                         self._download_n_results(query, self._max_youtube_results)
 879                         return
 880                 else:
 881                         try:
 882                                 n = int(prefix)
 883                                 if n <= 0:
 884                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 885                                         return
 886                                 elif n > self._max_youtube_results:
 887                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
 888                                         n = self._max_youtube_results
 889                                 self._download_n_results(query, n)
 890                                 return
 891                         except ValueError: # parsing prefix as int fails
 892                                 self._download_n_results(query, 1)
 893                                 return
 894
 895         def _download_n_results(self, query, n):
 896                 """Downloads a specified number of results for a query"""
 897
 898                 video_ids = []
 899                 already_seen = set()
 900                 pagenum = 1
 901
 902                 while True:
 903                         self.report_download_page(query, pagenum)
 904                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 905                         request = urllib2.Request(result_url, None, std_headers)
 906                         try:
 907                                 page = urllib2.urlopen(request).read()
 908                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 909                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 910                                 return
 911
 912                         # Extract video identifiers
 913                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 914                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 915                                 if video_id not in already_seen:
 916                                         video_ids.append(video_id)
 917                                         already_seen.add(video_id)
 918                                         if len(video_ids) == n:
 919                                                 # Specified n videos reached
 920                                                 for id in video_ids:
 921                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 922                                                 return
 923
 924                         if self._MORE_PAGES_INDICATOR not in page:
 925                                 for id in video_ids:
 926                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 927                                 return
 928
 929                         pagenum = pagenum + 1
 930
 931 class YoutubePlaylistIE(InfoExtractor):
 932         """Information Extractor for YouTube playlists."""
 933
 934         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 935         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
 936         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 937         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 938         _youtube_ie = None
 939
 940         def __init__(self, youtube_ie, downloader=None):
 941                 InfoExtractor.__init__(self, downloader)
 942                 self._youtube_ie = youtube_ie
 943
 944         @staticmethod
 945         def suitable(url):
 946                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 947
 948         def report_download_page(self, playlist_id, pagenum):
 949                 """Report attempt to download playlist page with given number."""
 950                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 951
 952         def _real_initialize(self):
 953                 self._youtube_ie.initialize()
 954
 955         def _real_extract(self, url):
 956                 # Extract playlist id
 957                 mobj = re.match(self._VALID_URL, url)
 958                 if mobj is None:
 959                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
 960                         return
 961
 962                 # Download playlist pages
 963                 playlist_id = mobj.group(1)
 964                 video_ids = []
 965                 pagenum = 1
 966
 967                 while True:
 968                         self.report_download_page(playlist_id, pagenum)
 969                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 970                         try:
 971                                 page = urllib2.urlopen(request).read()
 972                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 973                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 974                                 return
 975
 976                         # Extract video identifiers
 977                         ids_in_page = []
 978                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 979                                 if mobj.group(1) not in ids_in_page:
 980                                         ids_in_page.append(mobj.group(1))
 981                         video_ids.extend(ids_in_page)
 982
 983                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 984                                 break
 985                         pagenum = pagenum + 1
 986
 987                 for id in video_ids:
 988                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 989                 return
 990
 991 class PostProcessor(object):
 992         """Post Processor class.
 993
 994         PostProcessor objects can be added to downloaders with their
 995         add_post_processor() method. When the downloader has finished a
 996         successful download, it will take its internal chain of PostProcessors
 997         and start calling the run() method on each one of them, first with
 998         an initial argument and then with the returned value of the previous
 999         PostProcessor.
1000
1001         The chain will be stopped if one of them ever returns None or the end
1002         of the chain is reached.
1003
1004         PostProcessor objects follow a "mutual registration" process similar
1005         to InfoExtractor objects.
1006         """
1007
1008         _downloader = None
1009
1010         def __init__(self, downloader=None):
1011                 self._downloader = downloader
1012
1013         def set_downloader(self, downloader):
1014                 """Sets the downloader for this PP."""
1015                 self._downloader = downloader
1016
1017         def run(self, information):
1018                 """Run the PostProcessor.
1019
1020                 The "information" argument is a dictionary like the ones
1021                 composed by InfoExtractors. The only difference is that this
1022                 one has an extra field called "filepath" that points to the
1023                 downloaded file.
1024
1025                 When this method returns None, the postprocessing chain is
1026                 stopped. However, this method may return an information
1027                 dictionary that will be passed to the next postprocessing
1028                 object in the chain. It can be the one it received after
1029                 changing some fields.
1030
1031                 In addition, this method may raise a PostProcessingError
1032                 exception that will be taken into account by the downloader
1033                 it was called from.
1034                 """
1035                 return information # by default, do nothing
1036
1037 ### MAIN PROGRAM ###
if __name__ == '__main__':
        # Command line entry point: parse the options, build the information
        # extractors and the file downloader, and download every given URL.
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # General configuration
                # Install a single opener carrying both handlers; a second
                # install_opener() call would replace the first opener.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='INTERNAL',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                # All the format options write to the same 'format' destination
                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='store', dest='format', metavar='FMT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                filesystem.add_option('-c', '--continue',
                                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = batchfd.readlines()
                                finally:
                                        # Always release the file handle
                                        batchfd.close()
                                # Strip surrounding whitespace and skip empty lines
                                batchurls = [x.strip() for x in batchurls]
                                batchurls = [x for x in batchurls if x]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                # URLs from the batch file come before the command line ones
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if len(all_urls) < 1:
                        parser.error(u'you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        # Ask for the password interactively
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)

                # File downloader
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        # First template that applies: explicit template, simplified
                        # title, literal title, and finally the plain video id
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        'continue': opts.continue_dl,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')