youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports being able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         simulate:       Do not download the video files.
  59         format:         Video format code.
  60         outtmpl:        Template for output names.
  61         """
  62
  63         _params = None
  64         _ies = []
  65
  66         def __init__(self, params):
  67                 self._ies = []
  68                 self.set_params(params)
  69
  70         @staticmethod
  71         def pmkdir(filename):
  72                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  73                 components = filename.split(os.sep)
  74                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  75                 for dir in aggregate:
  76                         if not os.path.exists(dir):
  77                                 os.mkdir(dir)
  78
  79         @staticmethod
  80         def format_bytes(bytes):
  81                 if bytes is None:
  82                         return 'N/A'
  83                 if bytes == 0:
  84                         exponent = 0
  85                 else:
  86                         exponent = long(math.log(float(bytes), 1024.0))
  87                 suffix = 'bkMGTPEZY'[exponent]
  88                 converted = float(bytes) / float(1024**exponent)
  89                 return '%.2f%s' % (converted, suffix)
  90
  91         @staticmethod
  92         def calc_percent(byte_counter, data_len):
  93                 if data_len is None:
  94                         return '---.-%'
  95                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  96
  97         @staticmethod
  98         def calc_eta(start, now, total, current):
  99                 if total is None:
 100                         return '--:--'
 101                 dif = now - start
 102                 if current == 0 or dif < 0.001: # One millisecond
 103                         return '--:--'
 104                 rate = float(current) / dif
 105                 eta = long((float(total) - float(current)) / rate)
 106                 (eta_mins, eta_secs) = divmod(eta, 60)
 107                 if eta_mins > 99:
 108                         return '--:--'
 109                 return '%02d:%02d' % (eta_mins, eta_secs)
 110
 111         @staticmethod
 112         def calc_speed(start, now, bytes):
 113                 dif = now - start
 114                 if bytes == 0 or dif < 0.001: # One millisecond
 115                         return '%10s' % '---b/s'
 116                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 117
 118         @staticmethod
 119         def best_block_size(elapsed_time, bytes):
 120                 new_min = max(bytes / 2.0, 1.0)
 121                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 122                 if elapsed_time < 0.001:
 123                         return int(new_max)
 124                 rate = bytes / elapsed_time
 125                 if rate > new_max:
 126                         return int(new_max)
 127                 if rate < new_min:
 128                         return int(new_min)
 129                 return int(rate)
 130
 131         def set_params(self, params):
 132                 """Sets parameters."""
 133                 if type(params) != dict:
 134                         raise ValueError('params: dictionary expected')
 135                 self._params = params
 136
 137         def get_params(self):
 138                 """Get parameters."""
 139                 return self._params
 140
 141         def add_info_extractor(self, ie):
 142                 """Add an InfoExtractor object to the end of the list."""
 143                 self._ies.append(ie)
 144                 ie.set_downloader(self)
 145
 146         def to_stdout(self, message, skip_eol=False):
 147                 """Print message to stdout if not in quiet mode."""
 148                 if not self._params.get('quiet', False):
 149                         sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
 150                         sys.stdout.flush()
 151
 152         def download(self, url_list):
 153                 """Download a given list of URLs."""
 154                 for url in url_list:
 155                         suitable_found = False
 156                         for ie in self._ies:
 157                                 if not ie.suitable(url):
 158                                         continue
 159                                 # Suitable InfoExtractor found
 160                                 suitable_found = True
 161                                 results = [x for x in ie.extract(url) if x is not None]
 162
 163                                 if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
 164                                         sys.exit('ERROR: fixed output name but more than one file to download')
 165
 166                                 if self._params.get('simulate', False):
 167                                         continue
 168
 169                                 for result in results:
 170                                         try:
 171                                                 filename = self._params['outtmpl'] % result
 172                                         except (KeyError), err:
 173                                                 sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
 174                                                 continue
 175                                         try:
 176                                                 self.pmkdir(filename)
 177                                         except (OSError, IOError), err:
 178                                                 sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
 179                                                 continue
 180                                         try:
 181                                                 outstream = open(filename, 'wb')
 182                                         except (OSError, IOError), err:
 183                                                 sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
 184                                                 continue
 185                                         try:
 186                                                 self._do_download(outstream, result['url'])
 187                                                 outstream.close()
 188                                         except (OSError, IOError), err:
 189                                                 sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))
 190                                                 continue
 191                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 192                                                 sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
 193                                                 continue
 194                                 break
 195                         if not suitable_found:
 196                                 sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)
 197
 198         def _do_download(self, stream, url):
 199                 request = urllib2.Request(url, None, std_headers)
 200                 data = urllib2.urlopen(request)
 201                 data_len = data.info().get('Content-length', None)
 202                 data_len_str = self.format_bytes(data_len)
 203                 byte_counter = 0
 204                 block_size = 1024
 205                 start = time.time()
 206                 while True:
 207                         percent_str = self.calc_percent(byte_counter, data_len)
 208                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 209                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 210                         self.to_stdout('\r[download] %s of %s at %s ETA %s' %
 211                                         (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 212
 213                         before = time.time()
 214                         data_block = data.read(block_size)
 215                         after = time.time()
 216                         data_block_len = len(data_block)
 217                         if data_block_len == 0:
 218                                 break
 219                         byte_counter += data_block_len
 220                         stream.write(data_block)
 221                         block_size = self.best_block_size(after - before, data_block_len)
 222
 223                 self.to_stdout('')
 224                 if data_len is not None and str(byte_counter) != data_len:
 225                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 226
 227 class InfoExtractor(object):
 228         """Information Extractor class.
 229
 230         Information extractors are the classes that, given a URL, extract
 231         information from the video (or videos) the URL refers to. This
 232         information includes the real video URL, the video title and simplified
 233         title, author and others. It is returned in a list of dictionaries when
 234         calling its extract() method. It is a list because a URL can refer to
 235         more than one video (think of playlists). The dictionaries must include
 236         the following fields:
 237
 238         id:             Video identifier.
 239         url:            Final video URL.
 240         uploader:       Nickname of the video uploader.
 241         title:          Literal title.
 242         stitle:         Simplified title.
 243         ext:            Video filename extension.
 244
 245         Subclasses of this one should re-define the _real_initialize() and
 246         _real_extract() methods, as well as the suitable() static method.
 247         Probably, they should also be instantiated and added to the main
 248         downloader.
 249         """
 250
 251         _ready = False
 252         _downloader = None
 253
 254         def __init__(self, downloader=None):
 255                 """Constructor. Receives an optional downloader."""
 256                 self._ready = False
 257                 self.set_downloader(downloader)
 258
 259         @staticmethod
 260         def suitable(url):
 261                 """Receives a URL and returns True if suitable for this IE."""
 262                 return True
 263
 264         def initialize(self):
 265                 """Initializes an instance (login, etc)."""
 266                 if not self._ready:
 267                         self._real_initialize()
 268                         self._ready = True
 269
 270         def extract(self, url):
 271                 """Extracts URL information and returns it in list of dicts."""
 272                 self.initialize()
 273                 return self._real_extract(url)
 274
 275         def set_downloader(self, downloader):
 276                 """Sets the downloader for this IE."""
 277                 self._downloader = downloader
 278
 279         def to_stdout(self, message):
 280                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 281                         print message
 282
 283         def to_stderr(self, message):
 284                 sys.stderr.write('%s\n' % message)
 285
 286         def _real_initialize(self):
 287                 """Real initialization process. Redefine in subclasses."""
 288                 pass
 289
 290         def _real_extract(self, url):
 291                 """Real extraction process. Redefine in subclasses."""
 292                 pass
 293
 294 class YoutubeIE(InfoExtractor):
 295         """Information extractor for youtube.com."""
 296
 297         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 298         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 299         _NETRC_MACHINE = 'youtube'
 300
 301         def _real_initialize(self):
 302                 if self._downloader is None:
 303                         return
 304
 305                 username = None
 306                 password = None
 307                 downloader_params = self._downloader.get_params()
 308
 309                 # Attempt to use provided username and password or .netrc data
 310                 if downloader_params.get('username', None) is not None:
 311                         username = downloader_params['username']
 312                         password = downloader_params['password']
 313                 elif downloader_params.get('usenetrc', False):
 314                         try:
 315                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 316                                 if info is not None:
 317                                         username = info[0]
 318                                         password = info[2]
 319                                 else:
 320                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 321                         except (IOError, netrc.NetrcParseError), err:
 322                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 323                                 return
 324
 325                 if username is None:
 326                         return
 327
 328                 # Log in
 329                 login_form = {
 330                                 'current_form': 'loginForm',
 331                                 'next':         '/',
 332                                 'action_login': 'Log In',
 333                                 'username':     username,
 334                                 'password':     password,
 335                                 }
 336                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 337                 try:
 338                         self.to_stdout('[youtube] Logging in')
 339                         login_results = urllib2.urlopen(request).read()
 340                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 341                                 self.to_stderr('WARNING: Unable to log in: bad username or password')
 342                                 return
 343                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 344                         self.to_stderr('WARNING: Unable to log in: %s' % str(err))
 345                         return
 346
 347                 # Confirm age
 348                 age_form = {
 349                                 'next_url':             '/',
 350                                 'action_confirm':       'Confirm',
 351                                 }
 352                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 353                 try:
 354                         self.to_stdout('[youtube] Confirming age')
 355                         age_results = urllib2.urlopen(request).read()
 356                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 357                         sys.exit('ERROR: Unable to confirm age: %s' % str(err))
 358
 359         def _real_extract(self, url):
 360                 # Extract video id from URL
 361                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 362                 if mobj is None:
 363                         self.to_stderr('ERROR: Invalid URL: %s' % url)
 364                         return [None]
 365                 video_id = mobj.group(2)
 366
 367                 # Downloader parameters
 368                 format_param = None
 369                 if self._downloader is not None:
 370                         params = self._downloader.get_params()
 371                         format_param = params.get('format', None)
 372
 373                 # Extension
 374                 video_extension = {18: 'mp4'}.get(format_param, 'flv')
 375
 376                 # Normalize URL, including format
 377                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 378                 if format_param is not None:
 379                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 380                 request = urllib2.Request(normalized_url, None, std_headers)
 381                 try:
 382                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 383                         video_webpage = urllib2.urlopen(request).read()
 384                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 385                         sys.exit('ERROR: Unable to download video: %s' % str(err))
 386                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 387
 388                 # "t" param
 389                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 390                 if mobj is None:
 391                         self.to_stderr('ERROR: Unable to extract "t" parameter')
 392                         return [None]
 393                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 394                 if format_param is not None:
 395                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 396                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
 397
 398                 # uploader
 399                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 400                 if mobj is None:
 401                         self.to_stderr('ERROR: Unable to extract uploader nickname')
 402                         return [None]
 403                 video_uploader = mobj.group(1)
 404
 405                 # title
 406                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 407                 if mobj is None:
 408                         self.to_stderr('ERROR: Unable to extract video title')
 409                         return [None]
 410                 video_title = mobj.group(1).decode('utf-8')
 411                 video_title = re.sub(u'&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 412
 413                 # simplified title
 414                 simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
 415                 simple_title = simple_title.strip(u'_')
 416
 417                 # Return information
 418                 return [{
 419                         'id':           video_id,
 420                         'url':          video_real_url,
 421                         'uploader':     video_uploader,
 422                         'title':        video_title,
 423                         'stitle':       simple_title,
 424                         'ext':          video_extension,
 425                         }]
 426
 427 if __name__ == '__main__':
 428         try:
 429                 # General configuration
 430                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 431                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 432
 433                 # Information extractors
 434                 youtube_ie = YoutubeIE()
 435
 436                 # File downloader
 437                 fd = FileDownloader({
 438                         'usenetrc': False,
 439                         'username': None,
 440                         'password': None,
 441                         'quiet': False,
 442                         'simulate': True,
 443                         'format': None,
 444                         'outtmpl': '%(id)s.%(ext)s'
 445                         })
 446                 fd.add_info_extractor(youtube_ie)
 447                 fd.download([
 448                         'http://www.youtube.com/watch?v=t7qdwI7TVe8',
 449                         'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
 450                         'http://www.youtube.com/watch?v=DZRXe1wtC-M',
 451                         ])
 452
 453         except KeyboardInterrupt:
 454                 sys.exit('\nERROR: Interrupted by user')