youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class DownloadError(Exception):
  29         """Download Error exception.
  30
  31         This exception may be thrown by FileDownloader objects if they are not
  32         configured to continue on errors. They will contain the appropriate
  33         error message.
  34         """
  35         pass
  36
  37 class SameFileError(Exception):
  38         """Same File exception.
  39
  40         This exception will be thrown by FileDownloader objects if they detect
  41         multiple files would have to be downloaded to the same file on disk.
  42         """
  43         pass
  44
  45 class FileDownloader(object):
  46         """File Downloader class.
  47
  48         File downloader objects are the ones responsible of downloading the
  49         actual video file and writing it to disk if the user has requested
  50         it, among some other tasks. In most cases there should be one per
  51         program. As, given a video URL, the downloader doesn't know how to
  52         extract all the needed information, task that InfoExtractors do, it
  53         has to pass the URL to one of them.
  54
  55         For this, file downloader objects have a method that allows
  56         InfoExtractors to be registered in a given order. When it is passed
  57         a URL, the file downloader handles it to the first InfoExtractor it
  58         finds that reports being able to handle it. The InfoExtractor returns
  59         all the information to the FileDownloader and the latter downloads the
  60         file or does whatever it's instructed to do.
  61
  62         File downloaders accept a lot of parameters. In order not to saturate
  63         the object constructor with arguments, it receives a dictionary of
  64         options instead. These options are available through the get_params()
  65         method for the InfoExtractors to use. The FileDownloader also registers
  66         itself as the downloader in charge for the InfoExtractors that are
  67         added to it, so this is a "mutual registration".
  68
  69         Available options:
  70
  71         username:       Username for authentication purposes.
  72         password:       Password for authentication purposes.
  73         usenetrc:       Use netrc for authentication instead.
  74         quiet:          Do not print messages to stdout.
  75         forceurl:       Force printing final URL.
  76         forcetitle:     Force printing title.
  77         simulate:       Do not download the video files.
  78         format:         Video format code.
  79         outtmpl:        Template for output names.
  80         ignoreerrors:   Do not stop on download errors.
  81         """
  82
  83         _params = None
  84         _ies = []
  85
  86         def __init__(self, params):
  87                 """Create a FileDownloader object with the given options."""
  88                 self._ies = []
  89                 self.set_params(params)
  90
  91         @staticmethod
  92         def pmkdir(filename):
  93                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  94                 components = filename.split(os.sep)
  95                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  96                 for dir in aggregate:
  97                         if not os.path.exists(dir):
  98                                 os.mkdir(dir)
  99
 100         @staticmethod
 101         def format_bytes(bytes):
 102                 if bytes is None:
 103                         return 'N/A'
 104                 if bytes == 0:
 105                         exponent = 0
 106                 else:
 107                         exponent = long(math.log(float(bytes), 1024.0))
 108                 suffix = 'bkMGTPEZY'[exponent]
 109                 converted = float(bytes) / float(1024**exponent)
 110                 return '%.2f%s' % (converted, suffix)
 111
 112         @staticmethod
 113         def calc_percent(byte_counter, data_len):
 114                 if data_len is None:
 115                         return '---.-%'
 116                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 117
 118         @staticmethod
 119         def calc_eta(start, now, total, current):
 120                 if total is None:
 121                         return '--:--'
 122                 dif = now - start
 123                 if current == 0 or dif < 0.001: # One millisecond
 124                         return '--:--'
 125                 rate = float(current) / dif
 126                 eta = long((float(total) - float(current)) / rate)
 127                 (eta_mins, eta_secs) = divmod(eta, 60)
 128                 if eta_mins > 99:
 129                         return '--:--'
 130                 return '%02d:%02d' % (eta_mins, eta_secs)
 131
 132         @staticmethod
 133         def calc_speed(start, now, bytes):
 134                 dif = now - start
 135                 if bytes == 0 or dif < 0.001: # One millisecond
 136                         return '%10s' % '---b/s'
 137                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 138
 139         @staticmethod
 140         def best_block_size(elapsed_time, bytes):
 141                 new_min = max(bytes / 2.0, 1.0)
 142                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 143                 if elapsed_time < 0.001:
 144                         return int(new_max)
 145                 rate = bytes / elapsed_time
 146                 if rate > new_max:
 147                         return int(new_max)
 148                 if rate < new_min:
 149                         return int(new_min)
 150                 return int(rate)
 151
 152         def set_params(self, params):
 153                 """Sets parameters."""
 154                 if type(params) != dict:
 155                         raise ValueError('params: dictionary expected')
 156                 self._params = params
 157
 158         def get_params(self):
 159                 """Get parameters."""
 160                 return self._params
 161
 162         def add_info_extractor(self, ie):
 163                 """Add an InfoExtractor object to the end of the list."""
 164                 self._ies.append(ie)
 165                 ie.set_downloader(self)
 166
 167         def to_stdout(self, message, skip_eol=False):
 168                 """Print message to stdout if not in quiet mode."""
 169                 if not self._params.get('quiet', False):
 170                         sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
 171                         sys.stdout.flush()
 172
 173         def to_stderr(self, message):
 174                 """Print message to stderr."""
 175                 sys.stderr.write('%s\n' % message)
 176
 177         def fixed_template(self):
 178                 """Checks if the output template is fixed."""
 179                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
 180
 181         def trouble(self, message=None):
 182                 """Determine action to take when a download problem appears.
 183
 184                 Depending on if the downloader has been configured to ignore
 185                 download errors or not, this method may throw an exception or
 186                 not when errors are found, after printing the message. If it
 187                 doesn't raise, it returns an error code suitable to be returned
 188                 later as a program exit code to indicate error.
 189                 """
 190                 if message is not None:
 191                         self.to_stderr(message)
 192                 if not self._params.get('ignoreerrors', False):
 193                         raise DownloadError(message)
 194                 return 1
 195
 196         def download(self, url_list):
 197                 """Download a given list of URLs."""
 198                 retcode = 0
 199                 if len(url_list) > 1 and self.fixed_template():
 200                         raise SameFileError(self._params['outtmpl'])
 201
 202                 for url in url_list:
 203                         suitable_found = False
 204                         for ie in self._ies:
 205                                 if not ie.suitable(url):
 206                                         continue
 207                                 # Suitable InfoExtractor found
 208                                 suitable_found = True
 209                                 all_results = ie.extract(url)
 210                                 results = [x for x in all_results if x is not None]
 211                                 if len(results) != len(all_results):
 212                                         retcode = self.trouble()
 213
 214                                 if len(results) > 1 and self.fixed_template():
 215                                         raise SameFileError(self._params['outtmpl'])
 216
 217                                 for result in results:
 218
 219                                         # Forced printings
 220                                         if self._params.get('forcetitle', False):
 221                                                 print result['title']
 222                                         if self._params.get('forceurl', False):
 223                                                 print result['url']
 224
 225                                         # Do nothing else if in simulate mode
 226                                         if self._params.get('simulate', False):
 227                                                 continue
 228
 229                                         try:
 230                                                 filename = self._params['outtmpl'] % result
 231                                                 self.to_stdout('[download] Destination: %s' % filename)
 232                                         except (ValueError, KeyError), err:
 233                                                 retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
 234                                                 continue
 235                                         try:
 236                                                 self.pmkdir(filename)
 237                                         except (OSError, IOError), err:
 238                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
 239                                                 continue
 240                                         try:
 241                                                 outstream = open(filename, 'wb')
 242                                         except (OSError, IOError), err:
 243                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
 244                                                 continue
 245                                         try:
 246                                                 self._do_download(outstream, result['url'])
 247                                                 outstream.close()
 248                                         except (OSError, IOError), err:
 249                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
 250                                                 continue
 251                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 252                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
 253                                                 continue
 254                                 break
 255                         if not suitable_found:
 256                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 257
 258                 return retcode
 259
 260         def _do_download(self, stream, url):
 261                 request = urllib2.Request(url, None, std_headers)
 262                 data = urllib2.urlopen(request)
 263                 data_len = data.info().get('Content-length', None)
 264                 data_len_str = self.format_bytes(data_len)
 265                 byte_counter = 0
 266                 block_size = 1024
 267                 start = time.time()
 268                 while True:
 269                         percent_str = self.calc_percent(byte_counter, data_len)
 270                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 271                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 272                         self.to_stdout('\r[download] %s of %s at %s ETA %s' %
 273                                         (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 274
 275                         before = time.time()
 276                         data_block = data.read(block_size)
 277                         after = time.time()
 278                         data_block_len = len(data_block)
 279                         if data_block_len == 0:
 280                                 break
 281                         byte_counter += data_block_len
 282                         stream.write(data_block)
 283                         block_size = self.best_block_size(after - before, data_block_len)
 284
 285                 self.to_stdout('')
 286                 if data_len is not None and str(byte_counter) != data_len:
 287                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 288
 289 class InfoExtractor(object):
 290         """Information Extractor class.
 291
 292         Information extractors are the classes that, given a URL, extract
 293         information from the video (or videos) the URL refers to. This
 294         information includes the real video URL, the video title and simplified
 295         title, author and others. It is returned in a list of dictionaries when
 296         calling its extract() method. It is a list because a URL can refer to
 297         more than one video (think of playlists). The dictionaries must include
 298         the following fields:
 299
 300         id:             Video identifier.
 301         url:            Final video URL.
 302         uploader:       Nickname of the video uploader.
 303         title:          Literal title.
 304         stitle:         Simplified title.
 305         ext:            Video filename extension.
 306
 307         Subclasses of this one should re-define the _real_initialize() and
 308         _real_extract() methods, as well as the suitable() static method.
 309         Probably, they should also be instantiated and added to the main
 310         downloader.
 311         """
 312
 313         _ready = False
 314         _downloader = None
 315
 316         def __init__(self, downloader=None):
 317                 """Constructor. Receives an optional downloader."""
 318                 self._ready = False
 319                 self.set_downloader(downloader)
 320
 321         @staticmethod
 322         def suitable(url):
 323                 """Receives a URL and returns True if suitable for this IE."""
 324                 return True
 325
 326         def initialize(self):
 327                 """Initializes an instance (authentication, etc)."""
 328                 if not self._ready:
 329                         self._real_initialize()
 330                         self._ready = True
 331
 332         def extract(self, url):
 333                 """Extracts URL information and returns it in list of dicts."""
 334                 self.initialize()
 335                 return self._real_extract(url)
 336
 337         def set_downloader(self, downloader):
 338                 """Sets the downloader for this IE."""
 339                 self._downloader = downloader
 340
 341         def to_stdout(self, message):
 342                 """Print message to stdout if downloader is not in quiet mode."""
 343                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 344                         print message
 345
 346         def to_stderr(self, message):
 347                 """Print message to stderr."""
 348                 sys.stderr.write('%s\n' % message)
 349
 350         def _real_initialize(self):
 351                 """Real initialization process. Redefine in subclasses."""
 352                 pass
 353
 354         def _real_extract(self, url):
 355                 """Real extraction process. Redefine in subclasses."""
 356                 pass
 357
 358 class YoutubeIE(InfoExtractor):
 359         """Information extractor for youtube.com."""
 360
 361         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 362         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 363         _NETRC_MACHINE = 'youtube'
 364
 365         def _real_initialize(self):
 366                 if self._downloader is None:
 367                         return
 368
 369                 username = None
 370                 password = None
 371                 downloader_params = self._downloader.get_params()
 372
 373                 # Attempt to use provided username and password or .netrc data
 374                 if downloader_params.get('username', None) is not None:
 375                         username = downloader_params['username']
 376                         password = downloader_params['password']
 377                 elif downloader_params.get('usenetrc', False):
 378                         try:
 379                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 380                                 if info is not None:
 381                                         username = info[0]
 382                                         password = info[2]
 383                                 else:
 384                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 385                         except (IOError, netrc.NetrcParseError), err:
 386                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 387                                 return
 388
 389                 # No authentication to be performed
 390                 if username is None:
 391                         return
 392
 393                 # Log in
 394                 login_form = {
 395                                 'current_form': 'loginForm',
 396                                 'next':         '/',
 397                                 'action_login': 'Log In',
 398                                 'username':     username,
 399                                 'password':     password,
 400                                 }
 401                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 402                 try:
 403                         self.to_stdout('[youtube] Logging in')
 404                         login_results = urllib2.urlopen(request).read()
 405                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 406                                 self.to_stderr('WARNING: unable to log in: bad username or password')
 407                                 return
 408                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 409                         self.to_stderr('WARNING: unable to log in: %s' % str(err))
 410                         return
 411
 412                 # Confirm age
 413                 age_form = {
 414                                 'next_url':             '/',
 415                                 'action_confirm':       'Confirm',
 416                                 }
 417                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 418                 try:
 419                         self.to_stdout('[youtube] Confirming age')
 420                         age_results = urllib2.urlopen(request).read()
 421                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 422                         self.to_stderr('ERROR: unable to confirm age: %s' % str(err))
 423                         return
 424
 425         def _real_extract(self, url):
 426                 # Extract video id from URL
 427                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 428                 if mobj is None:
 429                         self.to_stderr('ERROR: invalid URL: %s' % url)
 430                         return [None]
 431                 video_id = mobj.group(2)
 432
 433                 # Downloader parameters
 434                 format_param = None
 435                 if self._downloader is not None:
 436                         params = self._downloader.get_params()
 437                         format_param = params.get('format', None)
 438
 439                 # Extension
 440                 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
 441
 442                 # Normalize URL, including format
 443                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 444                 if format_param is not None:
 445                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 446                 request = urllib2.Request(normalized_url, None, std_headers)
 447                 try:
 448                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 449                         video_webpage = urllib2.urlopen(request).read()
 450                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 451                         self.to_stderr('ERROR: unable to download video webpage: %s' % str(err))
 452                         return [None]
 453                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 454
 455                 # "t" param
 456                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 457                 if mobj is None:
 458                         self.to_stderr('ERROR: unable to extract "t" parameter')
 459                         return [None]
 460                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 461                 if format_param is not None:
 462                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 463                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
 464
 465                 # uploader
 466                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 467                 if mobj is None:
 468                         self.to_stderr('ERROR: unable to extract uploader nickname')
 469                         return [None]
 470                 video_uploader = mobj.group(1)
 471
 472                 # title
 473                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 474                 if mobj is None:
 475                         self.to_stderr('ERROR: unable to extract video title')
 476                         return [None]
 477                 video_title = mobj.group(1).decode('utf-8')
 478                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 479                 video_title = video_title.replace(os.sep, u'%')
 480
 481                 # simplified title
 482                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 483                 simple_title = simple_title.strip(ur'_')
 484
 485                 # Return information
 486                 return [{
 487                         'id':           video_id,
 488                         'url':          video_real_url,
 489                         'uploader':     video_uploader,
 490                         'title':        video_title,
 491                         'stitle':       simple_title,
 492                         'ext':          video_extension,
 493                         }]
 494
 495 if __name__ == '__main__':
 496         try:
 497                 # Modules needed only when running the main program
 498                 import getpass
 499                 import optparse
 500
 501                 # General configuration
 502                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 503                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 504                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 505
 506                 # Parse command line
 507                 parser = optparse.OptionParser(
 508                                 usage='Usage: %prog [options] url...',
 509                                 version='INTERNAL',
 510                                 conflict_handler='resolve',
 511                                 )
 512                 parser.add_option('-h', '--help',
 513                                 action='help', help='print this help text and exit')
 514                 parser.add_option('-v', '--version',
 515                                 action='version', help='print program version and exit')
 516                 parser.add_option('-u', '--username',
 517                                 dest='username', metavar='UN', help='account username')
 518                 parser.add_option('-p', '--password',
 519                                 dest='password', metavar='PW', help='account password')
 520                 parser.add_option('-o', '--output',
 521                                 dest='outtmpl', metavar='TPL', help='output filename template')
 522                 parser.add_option('-q', '--quiet',
 523                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
 524                 parser.add_option('-s', '--simulate',
 525                                 action='store_true', dest='simulate', help='do not download video', default=False)
 526                 parser.add_option('-t', '--title',
 527                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
 528                 parser.add_option('-l', '--literal',
 529                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 530                 parser.add_option('-n', '--netrc',
 531                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 532                 parser.add_option('-g', '--get-url',
 533                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 534                 parser.add_option('-e', '--get-title',
 535                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 536                 parser.add_option('-f', '--format',
 537                                 dest='format', metavar='FMT', help='video format code')
 538                 parser.add_option('-b', '--best-quality',
 539                                 action='store_const', dest='video_format', help='alias for -f 18', const='18')
 540                 parser.add_option('-m', '--mobile-version',
 541                                 action='store_const', dest='video_format', help='alias for -f 17', const='17')
 542                 parser.add_option('-i', '--ignore-errors',
 543                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
 544                 (opts, args) = parser.parse_args()
 545
 546                 # Conflicting, missing and erroneous options
 547                 if len(args) < 1:
 548                         sys.exit('ERROR: you must provide at least one URL')
 549                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
 550                         sys.exit('ERROR: using .netrc conflicts with giving username/password')
 551                 if opts.password is not None and opts.username is None:
 552                         sys.exit('ERROR: account username missing')
 553                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
 554                         sys.exit('ERROR: using output template conflicts with using title or literal title')
 555                 if opts.usetitle and opts.useliteral:
 556                         sys.exit('ERROR: using title conflicts with using literal title')
 557                 if opts.username is not None and opts.password is None:
 558                         opts.password = getpass.getpass('Type account password and press return:')
 559
 560                 # Information extractors
 561                 youtube_ie = YoutubeIE()
 562
 563                 # File downloader
 564                 fd = FileDownloader({
 565                         'usenetrc': opts.usenetrc,
 566                         'username': opts.username,
 567                         'password': opts.password,
 568                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
 569                         'forceurl': opts.geturl,
 570                         'forcetitle': opts.gettitle,
 571                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
 572                         'format': opts.format,
 573                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
 574                                 or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
 575                                 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
 576                                 or '%(id)s.%(ext)s'),
 577                         'ignoreerrors': opts.ignoreerrors,
 578                         })
 579                 fd.add_info_extractor(youtube_ie)
 580                 retcode = fd.download(args)
 581                 sys.exit(retcode)
 582
 583         except DownloadError:
 584                 sys.exit(1)
 585         except SameFileError:
 586                 sys.exit('ERROR: fixed output name but more than one file to download')
 587         except KeyboardInterrupt:
 588                 sys.exit('\nERROR: Interrupted by user')