youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports being able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         forceurl:       Force printing final URL.
  59         forcetitle:     Force printing title.
  60         simulate:       Do not download the video files.
  61         format:         Video format code.
  62         outtmpl:        Template for output names.
  63         """
  64
  65         _params = None
  66         _ies = []
  67
  68         def __init__(self, params):
  69                 self._ies = []
  70                 self.set_params(params)
  71
  72         @staticmethod
  73         def pmkdir(filename):
  74                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  75                 components = filename.split(os.sep)
  76                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  77                 for dir in aggregate:
  78                         if not os.path.exists(dir):
  79                                 os.mkdir(dir)
  80
  81         @staticmethod
  82         def format_bytes(bytes):
  83                 if bytes is None:
  84                         return 'N/A'
  85                 if bytes == 0:
  86                         exponent = 0
  87                 else:
  88                         exponent = long(math.log(float(bytes), 1024.0))
  89                 suffix = 'bkMGTPEZY'[exponent]
  90                 converted = float(bytes) / float(1024**exponent)
  91                 return '%.2f%s' % (converted, suffix)
  92
  93         @staticmethod
  94         def calc_percent(byte_counter, data_len):
  95                 if data_len is None:
  96                         return '---.-%'
  97                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  98
  99         @staticmethod
 100         def calc_eta(start, now, total, current):
 101                 if total is None:
 102                         return '--:--'
 103                 dif = now - start
 104                 if current == 0 or dif < 0.001: # One millisecond
 105                         return '--:--'
 106                 rate = float(current) / dif
 107                 eta = long((float(total) - float(current)) / rate)
 108                 (eta_mins, eta_secs) = divmod(eta, 60)
 109                 if eta_mins > 99:
 110                         return '--:--'
 111                 return '%02d:%02d' % (eta_mins, eta_secs)
 112
 113         @staticmethod
 114         def calc_speed(start, now, bytes):
 115                 dif = now - start
 116                 if bytes == 0 or dif < 0.001: # One millisecond
 117                         return '%10s' % '---b/s'
 118                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 119
 120         @staticmethod
 121         def best_block_size(elapsed_time, bytes):
 122                 new_min = max(bytes / 2.0, 1.0)
 123                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 124                 if elapsed_time < 0.001:
 125                         return int(new_max)
 126                 rate = bytes / elapsed_time
 127                 if rate > new_max:
 128                         return int(new_max)
 129                 if rate < new_min:
 130                         return int(new_min)
 131                 return int(rate)
 132
 133         def set_params(self, params):
 134                 """Sets parameters."""
 135                 if type(params) != dict:
 136                         raise ValueError('params: dictionary expected')
 137                 self._params = params
 138
 139         def get_params(self):
 140                 """Get parameters."""
 141                 return self._params
 142
 143         def add_info_extractor(self, ie):
 144                 """Add an InfoExtractor object to the end of the list."""
 145                 self._ies.append(ie)
 146                 ie.set_downloader(self)
 147
 148         def to_stdout(self, message, skip_eol=False):
 149                 """Print message to stdout if not in quiet mode."""
 150                 if not self._params.get('quiet', False):
 151                         sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
 152                         sys.stdout.flush()
 153
 154         def to_stderr(self, message):
 155                 """Print message to stderr."""
 156                 sys.stderr.write('%s\n' % message)
 157
 158         def fixed_template(self):
 159                 """Checks if the output template is fixed."""
 160                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
 161
 162         def download(self, url_list):
 163                 """Download a given list of URLs."""
 164                 retcode = 0
 165                 if len(url_list) > 1 and self.fixed_template():
 166                         sys.exit('ERROR: fixed output name but more than one file to download')
 167
 168                 for url in url_list:
 169                         suitable_found = False
 170                         for ie in self._ies:
 171                                 if not ie.suitable(url):
 172                                         continue
 173                                 # Suitable InfoExtractor found
 174                                 suitable_found = True
 175                                 all_results = ie.extract(url)
 176                                 results = [x for x in all_results if x is not None]
 177                                 if len(results) != len(all_results):
 178                                         retcode = 1
 179
 180                                 if len(results) > 1 and self.fixed_template():
 181                                         sys.exit('ERROR: fixed output name but more than one file to download')
 182
 183                                 for result in results:
 184
 185                                         # Forced printings
 186                                         if self._params.get('forcetitle', False):
 187                                                 print result['title']
 188                                         if self._params.get('forceurl', False):
 189                                                 print result['url']
 190
 191                                         # Do nothing else if in simulate mode
 192                                         if self._params.get('simulate', False):
 193                                                 continue
 194
 195                                         try:
 196                                                 filename = self._params['outtmpl'] % result
 197                                         except (ValueError, KeyError), err:
 198                                                 self.to_stderr('ERROR: invalid output template: %s' % str(err))
 199                                                 retcode = 1
 200                                                 continue
 201                                         try:
 202                                                 self.pmkdir(filename)
 203                                         except (OSError, IOError), err:
 204                                                 self.to_stderr('ERROR: unable to create directories: %s' % str(err))
 205                                                 retcode = 1
 206                                                 continue
 207                                         try:
 208                                                 outstream = open(filename, 'wb')
 209                                         except (OSError, IOError), err:
 210                                                 self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
 211                                                 retcode = 1
 212                                                 continue
 213                                         try:
 214                                                 self._do_download(outstream, result['url'])
 215                                                 outstream.close()
 216                                         except (OSError, IOError), err:
 217                                                 self.to_stderr('ERROR: unable to write video data: %s' % str(err))
 218                                                 retcode = 1
 219                                                 continue
 220                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 221                                                 self.to_stderr('ERROR: unable to download video data: %s' % str(err))
 222                                                 retcode = 1
 223                                                 continue
 224                                 break
 225                         if not suitable_found:
 226                                 self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
 227                                 retcode = 1
 228
 229                 return retcode
 230
 231         def _do_download(self, stream, url):
 232                 request = urllib2.Request(url, None, std_headers)
 233                 data = urllib2.urlopen(request)
 234                 data_len = data.info().get('Content-length', None)
 235                 data_len_str = self.format_bytes(data_len)
 236                 byte_counter = 0
 237                 block_size = 1024
 238                 start = time.time()
 239                 while True:
 240                         percent_str = self.calc_percent(byte_counter, data_len)
 241                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 242                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 243                         self.to_stdout('\r[download] %s of %s at %s ETA %s' %
 244                                         (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 245
 246                         before = time.time()
 247                         data_block = data.read(block_size)
 248                         after = time.time()
 249                         data_block_len = len(data_block)
 250                         if data_block_len == 0:
 251                                 break
 252                         byte_counter += data_block_len
 253                         stream.write(data_block)
 254                         block_size = self.best_block_size(after - before, data_block_len)
 255
 256                 self.to_stdout('')
 257                 if data_len is not None and str(byte_counter) != data_len:
 258                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 259
 260 class InfoExtractor(object):
 261         """Information Extractor class.
 262
 263         Information extractors are the classes that, given a URL, extract
 264         information from the video (or videos) the URL refers to. This
 265         information includes the real video URL, the video title and simplified
 266         title, author and others. It is returned in a list of dictionaries when
 267         calling its extract() method. It is a list because a URL can refer to
 268         more than one video (think of playlists). The dictionaries must include
 269         the following fields:
 270
 271         id:             Video identifier.
 272         url:            Final video URL.
 273         uploader:       Nickname of the video uploader.
 274         title:          Literal title.
 275         stitle:         Simplified title.
 276         ext:            Video filename extension.
 277
 278         Subclasses of this one should re-define the _real_initialize() and
 279         _real_extract() methods, as well as the suitable() static method.
 280         Probably, they should also be instantiated and added to the main
 281         downloader.
 282         """
 283
 284         _ready = False
 285         _downloader = None
 286
 287         def __init__(self, downloader=None):
 288                 """Constructor. Receives an optional downloader."""
 289                 self._ready = False
 290                 self.set_downloader(downloader)
 291
 292         @staticmethod
 293         def suitable(url):
 294                 """Receives a URL and returns True if suitable for this IE."""
 295                 return True
 296
 297         def initialize(self):
 298                 """Initializes an instance (login, etc)."""
 299                 if not self._ready:
 300                         self._real_initialize()
 301                         self._ready = True
 302
 303         def extract(self, url):
 304                 """Extracts URL information and returns it in list of dicts."""
 305                 self.initialize()
 306                 return self._real_extract(url)
 307
 308         def set_downloader(self, downloader):
 309                 """Sets the downloader for this IE."""
 310                 self._downloader = downloader
 311
 312         def to_stdout(self, message):
 313                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 314                         print message
 315
 316         def to_stderr(self, message):
 317                 sys.stderr.write('%s\n' % message)
 318
 319         def _real_initialize(self):
 320                 """Real initialization process. Redefine in subclasses."""
 321                 pass
 322
 323         def _real_extract(self, url):
 324                 """Real extraction process. Redefine in subclasses."""
 325                 pass
 326
 327 class YoutubeIE(InfoExtractor):
 328         """Information extractor for youtube.com."""
 329
 330         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 331         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 332         _NETRC_MACHINE = 'youtube'
 333
 334         def _real_initialize(self):
 335                 if self._downloader is None:
 336                         return
 337
 338                 username = None
 339                 password = None
 340                 downloader_params = self._downloader.get_params()
 341
 342                 # Attempt to use provided username and password or .netrc data
 343                 if downloader_params.get('username', None) is not None:
 344                         username = downloader_params['username']
 345                         password = downloader_params['password']
 346                 elif downloader_params.get('usenetrc', False):
 347                         try:
 348                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 349                                 if info is not None:
 350                                         username = info[0]
 351                                         password = info[2]
 352                                 else:
 353                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 354                         except (IOError, netrc.NetrcParseError), err:
 355                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 356                                 return
 357
 358                 if username is None:
 359                         return
 360
 361                 # Log in
 362                 login_form = {
 363                                 'current_form': 'loginForm',
 364                                 'next':         '/',
 365                                 'action_login': 'Log In',
 366                                 'username':     username,
 367                                 'password':     password,
 368                                 }
 369                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 370                 try:
 371                         self.to_stdout('[youtube] Logging in')
 372                         login_results = urllib2.urlopen(request).read()
 373                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 374                                 self.to_stderr('WARNING: unable to log in: bad username or password')
 375                                 return
 376                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 377                         self.to_stderr('WARNING: unable to log in: %s' % str(err))
 378                         return
 379
 380                 # Confirm age
 381                 age_form = {
 382                                 'next_url':             '/',
 383                                 'action_confirm':       'Confirm',
 384                                 }
 385                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 386                 try:
 387                         self.to_stdout('[youtube] Confirming age')
 388                         age_results = urllib2.urlopen(request).read()
 389                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 390                         sys.exit('ERROR: unable to confirm age: %s' % str(err))
 391
 392         def _real_extract(self, url):
 393                 # Extract video id from URL
 394                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 395                 if mobj is None:
 396                         self.to_stderr('ERROR: invalid URL: %s' % url)
 397                         return [None]
 398                 video_id = mobj.group(2)
 399
 400                 # Downloader parameters
 401                 format_param = None
 402                 if self._downloader is not None:
 403                         params = self._downloader.get_params()
 404                         format_param = params.get('format', None)
 405
 406                 # Extension
 407                 video_extension = {'18': 'mp4'}.get(format_param, 'flv')
 408
 409                 # Normalize URL, including format
 410                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 411                 if format_param is not None:
 412                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 413                 request = urllib2.Request(normalized_url, None, std_headers)
 414                 try:
 415                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 416                         video_webpage = urllib2.urlopen(request).read()
 417                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 418                         sys.exit('ERROR: unable to download video: %s' % str(err))
 419                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 420
 421                 # "t" param
 422                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 423                 if mobj is None:
 424                         self.to_stderr('ERROR: unable to extract "t" parameter')
 425                         return [None]
 426                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 427                 if format_param is not None:
 428                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 429                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
 430
 431                 # uploader
 432                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 433                 if mobj is None:
 434                         self.to_stderr('ERROR: unable to extract uploader nickname')
 435                         return [None]
 436                 video_uploader = mobj.group(1)
 437
 438                 # title
 439                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 440                 if mobj is None:
 441                         self.to_stderr('ERROR: unable to extract video title')
 442                         return [None]
 443                 video_title = mobj.group(1).decode('utf-8')
 444                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 445                 video_title = video_title.replace(os.sep, u'%')
 446
 447                 # simplified title
 448                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 449                 simple_title = simple_title.strip(ur'_')
 450
 451                 # Return information
 452                 return [{
 453                         'id':           video_id,
 454                         'url':          video_real_url,
 455                         'uploader':     video_uploader,
 456                         'title':        video_title,
 457                         'stitle':       simple_title,
 458                         'ext':          video_extension,
 459                         }]
 460
 461 if __name__ == '__main__':
 462         try:
 463                 # Modules needed only when running the main program
 464                 import getpass
 465                 import optparse
 466
 467                 # General configuration
 468                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 469                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 470                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 471
 472                 # Parse command line
 473                 parser = optparse.OptionParser(
 474                                 usage='Usage: %prog [options] url...',
 475                                 version='INTERNAL',
 476                                 conflict_handler='resolve',
 477                                 )
 478                 parser.add_option('-h', '--help',
 479                                 action='help', help='print this help text and exit')
 480                 parser.add_option('-v', '--version',
 481                                 action='version', help='print program version and exit')
 482                 parser.add_option('-u', '--username',
 483                                 dest='username', metavar='UN', help='account username')
 484                 parser.add_option('-p', '--password',
 485                                 dest='password', metavar='PW', help='account password')
 486                 parser.add_option('-o', '--output',
 487                                 dest='outtmpl', metavar='TPL', help='output filename template')
 488                 parser.add_option('-q', '--quiet',
 489                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
 490                 parser.add_option('-s', '--simulate',
 491                                 action='store_true', dest='simulate', help='do not download video', default=False)
 492                 parser.add_option('-t', '--title',
 493                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
 494                 parser.add_option('-l', '--literal',
 495                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 496                 parser.add_option('-n', '--netrc',
 497                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 498                 parser.add_option('-g', '--get-url',
 499                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 500                 parser.add_option('-e', '--get-title',
 501                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 502                 parser.add_option('-f', '--format',
 503                                 dest='format', metavar='FMT', help='video format code')
 504                 parser.add_option('-b', '--best-quality',
 505                                 action='store_const', dest='video_format', help='alias for -f 18', const='18')
 506                 (opts, args) = parser.parse_args()
 507
 508                 # Conflicting, missing and erroneous options
 509                 if len(args) < 1:
 510                         sys.exit('ERROR: you must provide at least one URL')
 511                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
 512                         sys.exit('ERROR: using .netrc conflicts with giving username/password')
 513                 if opts.password is not None and opts.username is None:
 514                         sys.exit('ERROR: account username missing')
 515                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
 516                         sys.exit('ERROR: using output template conflicts with using title or literal title')
 517                 if opts.usetitle and opts.useliteral:
 518                         sys.exit('ERROR: using title conflicts with using literal title')
 519                 if opts.username is not None and opts.password is None:
 520                         opts.password = getpass.getpass('Type account password and press return:')
 521
 522                 # Information extractors
 523                 youtube_ie = YoutubeIE()
 524
 525                 # File downloader
 526                 fd = FileDownloader({
 527                         'usenetrc': opts.usenetrc,
 528                         'username': opts.username,
 529                         'password': opts.password,
 530                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
 531                         'forceurl': opts.geturl,
 532                         'forcetitle': opts.gettitle,
 533                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
 534                         'format': opts.format,
 535                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
 536                                 or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
 537                                 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
 538                                 or '%(id)s.%(ext)s'),
 539                         })
 540                 fd.add_info_extractor(youtube_ie)
 541                 retcode = fd.download(args)
 542                 sys.exit(retcode)
 543
 544         except KeyboardInterrupt:
 545                 sys.exit('\nERROR: Interrupted by user')