youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports being able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         forceurl:       Force printing final URL.
  59         forcetitle:     Force printing title.
  60         simulate:       Do not download the video files.
  61         format:         Video format code.
  62         outtmpl:        Template for output names.
  63         ignoreerrors:   Do not stop on download errors.
  64         """
  65
  66         _params = None
  67         _ies = []
  68
  69         def __init__(self, params):
  70                 self._ies = []
  71                 self.set_params(params)
  72
  73         @staticmethod
  74         def pmkdir(filename):
  75                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  76                 components = filename.split(os.sep)
  77                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  78                 for dir in aggregate:
  79                         if not os.path.exists(dir):
  80                                 os.mkdir(dir)
  81
  82         @staticmethod
  83         def format_bytes(bytes):
  84                 if bytes is None:
  85                         return 'N/A'
  86                 if bytes == 0:
  87                         exponent = 0
  88                 else:
  89                         exponent = long(math.log(float(bytes), 1024.0))
  90                 suffix = 'bkMGTPEZY'[exponent]
  91                 converted = float(bytes) / float(1024**exponent)
  92                 return '%.2f%s' % (converted, suffix)
  93
  94         @staticmethod
  95         def calc_percent(byte_counter, data_len):
  96                 if data_len is None:
  97                         return '---.-%'
  98                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  99
 100         @staticmethod
 101         def calc_eta(start, now, total, current):
 102                 if total is None:
 103                         return '--:--'
 104                 dif = now - start
 105                 if current == 0 or dif < 0.001: # One millisecond
 106                         return '--:--'
 107                 rate = float(current) / dif
 108                 eta = long((float(total) - float(current)) / rate)
 109                 (eta_mins, eta_secs) = divmod(eta, 60)
 110                 if eta_mins > 99:
 111                         return '--:--'
 112                 return '%02d:%02d' % (eta_mins, eta_secs)
 113
 114         @staticmethod
 115         def calc_speed(start, now, bytes):
 116                 dif = now - start
 117                 if bytes == 0 or dif < 0.001: # One millisecond
 118                         return '%10s' % '---b/s'
 119                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 120
 121         @staticmethod
 122         def best_block_size(elapsed_time, bytes):
 123                 new_min = max(bytes / 2.0, 1.0)
 124                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 125                 if elapsed_time < 0.001:
 126                         return int(new_max)
 127                 rate = bytes / elapsed_time
 128                 if rate > new_max:
 129                         return int(new_max)
 130                 if rate < new_min:
 131                         return int(new_min)
 132                 return int(rate)
 133
 134         def set_params(self, params):
 135                 """Sets parameters."""
 136                 if type(params) != dict:
 137                         raise ValueError('params: dictionary expected')
 138                 self._params = params
 139
 140         def get_params(self):
 141                 """Get parameters."""
 142                 return self._params
 143
 144         def add_info_extractor(self, ie):
 145                 """Add an InfoExtractor object to the end of the list."""
 146                 self._ies.append(ie)
 147                 ie.set_downloader(self)
 148
 149         def to_stdout(self, message, skip_eol=False):
 150                 """Print message to stdout if not in quiet mode."""
 151                 if not self._params.get('quiet', False):
 152                         sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
 153                         sys.stdout.flush()
 154
 155         def to_stderr(self, message):
 156                 """Print message to stderr."""
 157                 sys.stderr.write('%s\n' % message)
 158
 159         def fixed_template(self):
 160                 """Checks if the output template is fixed."""
 161                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
 162
 163         def trouble(self, message=None):
 164                 """Determine action to take when a download problem appears.
 165
 166                 Depending on if the downloader has been configured to ignore
 167                 download errors or not, this method may exit the program or
 168                 not when errors are found, after printing the message. If it
 169                 doesn't exit, it returns an error code suitable to be returned
 170                 later as a program exit code to indicate error.
 171                 """
 172                 if message is not None:
 173                         self.to_stderr(message)
 174                 if not self._params.get('ignoreerrors', False):
 175                         sys.exit(1)
 176                 return 1
 177
 178         def download(self, url_list):
 179                 """Download a given list of URLs."""
 180                 retcode = 0
 181                 if len(url_list) > 1 and self.fixed_template():
 182                         sys.exit('ERROR: fixed output name but more than one file to download')
 183
 184                 for url in url_list:
 185                         suitable_found = False
 186                         for ie in self._ies:
 187                                 if not ie.suitable(url):
 188                                         continue
 189                                 # Suitable InfoExtractor found
 190                                 suitable_found = True
 191                                 all_results = ie.extract(url)
 192                                 results = [x for x in all_results if x is not None]
 193                                 if len(results) != len(all_results):
 194                                         retcode = self.trouble()
 195
 196                                 if len(results) > 1 and self.fixed_template():
 197                                         sys.exit('ERROR: fixed output name but more than one file to download')
 198
 199                                 for result in results:
 200
 201                                         # Forced printings
 202                                         if self._params.get('forcetitle', False):
 203                                                 print result['title']
 204                                         if self._params.get('forceurl', False):
 205                                                 print result['url']
 206
 207                                         # Do nothing else if in simulate mode
 208                                         if self._params.get('simulate', False):
 209                                                 continue
 210
 211                                         try:
 212                                                 filename = self._params['outtmpl'] % result
 213                                         except (ValueError, KeyError), err:
 214                                                 retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
 215                                                 continue
 216                                         try:
 217                                                 self.pmkdir(filename)
 218                                         except (OSError, IOError), err:
 219                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
 220                                                 continue
 221                                         try:
 222                                                 outstream = open(filename, 'wb')
 223                                         except (OSError, IOError), err:
 224                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
 225                                                 continue
 226                                         try:
 227                                                 self._do_download(outstream, result['url'])
 228                                                 outstream.close()
 229                                         except (OSError, IOError), err:
 230                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
 231                                                 continue
 232                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 233                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
 234                                                 continue
 235                                 break
 236                         if not suitable_found:
 237                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 238
 239                 return retcode
 240
 241         def _do_download(self, stream, url):
 242                 request = urllib2.Request(url, None, std_headers)
 243                 data = urllib2.urlopen(request)
 244                 data_len = data.info().get('Content-length', None)
 245                 data_len_str = self.format_bytes(data_len)
 246                 byte_counter = 0
 247                 block_size = 1024
 248                 start = time.time()
 249                 while True:
 250                         percent_str = self.calc_percent(byte_counter, data_len)
 251                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 252                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 253                         self.to_stdout('\r[download] %s of %s at %s ETA %s' %
 254                                         (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 255
 256                         before = time.time()
 257                         data_block = data.read(block_size)
 258                         after = time.time()
 259                         data_block_len = len(data_block)
 260                         if data_block_len == 0:
 261                                 break
 262                         byte_counter += data_block_len
 263                         stream.write(data_block)
 264                         block_size = self.best_block_size(after - before, data_block_len)
 265
 266                 self.to_stdout('')
 267                 if data_len is not None and str(byte_counter) != data_len:
 268                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 269
 270 class InfoExtractor(object):
 271         """Information Extractor class.
 272
 273         Information extractors are the classes that, given a URL, extract
 274         information from the video (or videos) the URL refers to. This
 275         information includes the real video URL, the video title and simplified
 276         title, author and others. It is returned in a list of dictionaries when
 277         calling its extract() method. It is a list because a URL can refer to
 278         more than one video (think of playlists). The dictionaries must include
 279         the following fields:
 280
 281         id:             Video identifier.
 282         url:            Final video URL.
 283         uploader:       Nickname of the video uploader.
 284         title:          Literal title.
 285         stitle:         Simplified title.
 286         ext:            Video filename extension.
 287
 288         Subclasses of this one should re-define the _real_initialize() and
 289         _real_extract() methods, as well as the suitable() static method.
 290         Probably, they should also be instantiated and added to the main
 291         downloader.
 292         """
 293
 294         _ready = False
 295         _downloader = None
 296
 297         def __init__(self, downloader=None):
 298                 """Constructor. Receives an optional downloader."""
 299                 self._ready = False
 300                 self.set_downloader(downloader)
 301
 302         @staticmethod
 303         def suitable(url):
 304                 """Receives a URL and returns True if suitable for this IE."""
 305                 return True
 306
 307         def initialize(self):
 308                 """Initializes an instance (login, etc)."""
 309                 if not self._ready:
 310                         self._real_initialize()
 311                         self._ready = True
 312
 313         def extract(self, url):
 314                 """Extracts URL information and returns it in list of dicts."""
 315                 self.initialize()
 316                 return self._real_extract(url)
 317
 318         def set_downloader(self, downloader):
 319                 """Sets the downloader for this IE."""
 320                 self._downloader = downloader
 321
 322         def to_stdout(self, message):
 323                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 324                         print message
 325
 326         def to_stderr(self, message):
 327                 sys.stderr.write('%s\n' % message)
 328
 329         def _real_initialize(self):
 330                 """Real initialization process. Redefine in subclasses."""
 331                 pass
 332
 333         def _real_extract(self, url):
 334                 """Real extraction process. Redefine in subclasses."""
 335                 pass
 336
 337 class YoutubeIE(InfoExtractor):
 338         """Information extractor for youtube.com."""
 339
 340         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 341         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 342         _NETRC_MACHINE = 'youtube'
 343
 344         def _real_initialize(self):
 345                 if self._downloader is None:
 346                         return
 347
 348                 username = None
 349                 password = None
 350                 downloader_params = self._downloader.get_params()
 351
 352                 # Attempt to use provided username and password or .netrc data
 353                 if downloader_params.get('username', None) is not None:
 354                         username = downloader_params['username']
 355                         password = downloader_params['password']
 356                 elif downloader_params.get('usenetrc', False):
 357                         try:
 358                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 359                                 if info is not None:
 360                                         username = info[0]
 361                                         password = info[2]
 362                                 else:
 363                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 364                         except (IOError, netrc.NetrcParseError), err:
 365                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 366                                 return
 367
 368                 if username is None:
 369                         return
 370
 371                 # Log in
 372                 login_form = {
 373                                 'current_form': 'loginForm',
 374                                 'next':         '/',
 375                                 'action_login': 'Log In',
 376                                 'username':     username,
 377                                 'password':     password,
 378                                 }
 379                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 380                 try:
 381                         self.to_stdout('[youtube] Logging in')
 382                         login_results = urllib2.urlopen(request).read()
 383                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 384                                 self.to_stderr('WARNING: unable to log in: bad username or password')
 385                                 return
 386                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 387                         self.to_stderr('WARNING: unable to log in: %s' % str(err))
 388                         return
 389
 390                 # Confirm age
 391                 age_form = {
 392                                 'next_url':             '/',
 393                                 'action_confirm':       'Confirm',
 394                                 }
 395                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 396                 try:
 397                         self.to_stdout('[youtube] Confirming age')
 398                         age_results = urllib2.urlopen(request).read()
 399                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 400                         sys.exit('ERROR: unable to confirm age: %s' % str(err))
 401
 402         def _real_extract(self, url):
 403                 # Extract video id from URL
 404                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 405                 if mobj is None:
 406                         self.to_stderr('ERROR: invalid URL: %s' % url)
 407                         return [None]
 408                 video_id = mobj.group(2)
 409
 410                 # Downloader parameters
 411                 format_param = None
 412                 if self._downloader is not None:
 413                         params = self._downloader.get_params()
 414                         format_param = params.get('format', None)
 415
 416                 # Extension
 417                 video_extension = {'18': 'mp4'}.get(format_param, 'flv')
 418
 419                 # Normalize URL, including format
 420                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 421                 if format_param is not None:
 422                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 423                 request = urllib2.Request(normalized_url, None, std_headers)
 424                 try:
 425                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 426                         video_webpage = urllib2.urlopen(request).read()
 427                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 428                         sys.exit('ERROR: unable to download video: %s' % str(err))
 429                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 430
 431                 # "t" param
 432                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 433                 if mobj is None:
 434                         self.to_stderr('ERROR: unable to extract "t" parameter')
 435                         return [None]
 436                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 437                 if format_param is not None:
 438                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 439                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
 440
 441                 # uploader
 442                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 443                 if mobj is None:
 444                         self.to_stderr('ERROR: unable to extract uploader nickname')
 445                         return [None]
 446                 video_uploader = mobj.group(1)
 447
 448                 # title
 449                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 450                 if mobj is None:
 451                         self.to_stderr('ERROR: unable to extract video title')
 452                         return [None]
 453                 video_title = mobj.group(1).decode('utf-8')
 454                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 455                 video_title = video_title.replace(os.sep, u'%')
 456
 457                 # simplified title
 458                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 459                 simple_title = simple_title.strip(ur'_')
 460
 461                 # Return information
 462                 return [{
 463                         'id':           video_id,
 464                         'url':          video_real_url,
 465                         'uploader':     video_uploader,
 466                         'title':        video_title,
 467                         'stitle':       simple_title,
 468                         'ext':          video_extension,
 469                         }]
 470
 471 if __name__ == '__main__':
 472         try:
 473                 # Modules needed only when running the main program
 474                 import getpass
 475                 import optparse
 476
 477                 # General configuration
 478                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 479                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 480                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 481
 482                 # Parse command line
 483                 parser = optparse.OptionParser(
 484                                 usage='Usage: %prog [options] url...',
 485                                 version='INTERNAL',
 486                                 conflict_handler='resolve',
 487                                 )
 488                 parser.add_option('-h', '--help',
 489                                 action='help', help='print this help text and exit')
 490                 parser.add_option('-v', '--version',
 491                                 action='version', help='print program version and exit')
 492                 parser.add_option('-u', '--username',
 493                                 dest='username', metavar='UN', help='account username')
 494                 parser.add_option('-p', '--password',
 495                                 dest='password', metavar='PW', help='account password')
 496                 parser.add_option('-o', '--output',
 497                                 dest='outtmpl', metavar='TPL', help='output filename template')
 498                 parser.add_option('-q', '--quiet',
 499                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
 500                 parser.add_option('-s', '--simulate',
 501                                 action='store_true', dest='simulate', help='do not download video', default=False)
 502                 parser.add_option('-t', '--title',
 503                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
 504                 parser.add_option('-l', '--literal',
 505                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 506                 parser.add_option('-n', '--netrc',
 507                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 508                 parser.add_option('-g', '--get-url',
 509                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 510                 parser.add_option('-e', '--get-title',
 511                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 512                 parser.add_option('-f', '--format',
 513                                 dest='format', metavar='FMT', help='video format code')
 514                 parser.add_option('-b', '--best-quality',
 515                                 action='store_const', dest='video_format', help='alias for -f 18', const='18')
 516                 parser.add_option('-i', '--ignore-errors',
 517                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
 518                 (opts, args) = parser.parse_args()
 519
 520                 # Conflicting, missing and erroneous options
 521                 if len(args) < 1:
 522                         sys.exit('ERROR: you must provide at least one URL')
 523                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
 524                         sys.exit('ERROR: using .netrc conflicts with giving username/password')
 525                 if opts.password is not None and opts.username is None:
 526                         sys.exit('ERROR: account username missing')
 527                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
 528                         sys.exit('ERROR: using output template conflicts with using title or literal title')
 529                 if opts.usetitle and opts.useliteral:
 530                         sys.exit('ERROR: using title conflicts with using literal title')
 531                 if opts.username is not None and opts.password is None:
 532                         opts.password = getpass.getpass('Type account password and press return:')
 533
 534                 # Information extractors
 535                 youtube_ie = YoutubeIE()
 536
 537                 # File downloader
 538                 fd = FileDownloader({
 539                         'usenetrc': opts.usenetrc,
 540                         'username': opts.username,
 541                         'password': opts.password,
 542                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
 543                         'forceurl': opts.geturl,
 544                         'forcetitle': opts.gettitle,
 545                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
 546                         'format': opts.format,
 547                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
 548                                 or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
 549                                 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
 550                                 or '%(id)s.%(ext)s'),
 551                         'ignoreerrors': opts.ignoreerrors,
 552                         })
 553                 fd.add_info_extractor(youtube_ie)
 554                 retcode = fd.download(args)
 555                 sys.exit(retcode)
 556
 557         except KeyboardInterrupt:
 558                 sys.exit('\nERROR: Interrupted by user')