youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports being able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         format:         Video format code.
  59         outtmpl:        Template for output names.
  60         """
  61
  62         _params = None
  63         _ies = []
  64
  65         def __init__(self, params):
  66                 self._ies = []
  67                 self.set_params(params)
  68
  69         @staticmethod
  70         def pmkdir(filename):
  71                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  72                 components = filename.split(os.sep)
  73                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  74                 for dir in aggregate:
  75                         if not os.path.exists(dir):
  76                                 os.mkdir(dir)
  77
  78         @staticmethod
  79         def format_bytes(bytes):
  80                 if bytes is None:
  81                         return 'N/A'
  82                 if bytes == 0:
  83                         exponent = 0
  84                 else:
  85                         exponent = long(math.log(float(bytes), 1024.0))
  86                 suffix = 'bkMGTPEZY'[exponent]
  87                 converted = float(bytes) / float(1024**exponent)
  88                 return '%.2f%s' % (converted, suffix)
  89
  90         @staticmethod
  91         def calc_percent(byte_counter, data_len):
  92                 if data_len is None:
  93                         return '---.-%'
  94                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  95
  96         @staticmethod
  97         def calc_eta(start, now, total, current):
  98                 if total is None:
  99                         return '--:--'
 100                 dif = now - start
 101                 if current == 0 or dif < 0.001: # One millisecond
 102                         return '--:--'
 103                 rate = float(current) / dif
 104                 eta = long((float(total) - float(current)) / rate)
 105                 (eta_mins, eta_secs) = divmod(eta, 60)
 106                 if eta_mins > 99:
 107                         return '--:--'
 108                 return '%02d:%02d' % (eta_mins, eta_secs)
 109
 110         @staticmethod
 111         def calc_speed(start, now, bytes):
 112                 dif = now - start
 113                 if bytes == 0 or dif < 0.001: # One millisecond
 114                         return '%10s' % '---b/s'
 115                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 116
 117         @staticmethod
 118         def best_block_size(elapsed_time, bytes):
 119                 new_min = max(bytes / 2.0, 1.0)
 120                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 121                 if elapsed_time < 0.001:
 122                         return int(new_max)
 123                 rate = bytes / elapsed_time
 124                 if rate > new_max:
 125                         return int(new_max)
 126                 if rate < new_min:
 127                         return int(new_min)
 128                 return int(rate)
 129
 130         def set_params(self, params):
 131                 """Sets parameters."""
 132                 if type(params) != dict:
 133                         raise ValueError('params: dictionary expected')
 134                 self._params = params
 135
 136         def get_params(self):
 137                 """Get parameters."""
 138                 return self._params
 139
 140         def add_info_extractor(self, ie):
 141                 """Add an InfoExtractor object to the end of the list."""
 142                 self._ies.append(ie)
 143                 ie.set_downloader(self)
 144
 145         def to_stdout(self, message, skip_eol=False):
 146                 """Print message to stdout if not in quiet mode."""
 147                 if not self._params.get('quiet', False):
 148                         sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
 149                         sys.stdout.flush()
 150
 151         def download(self, url_list):
 152                 """Download a given list of URLs."""
 153                 for url in url_list:
 154                         suitable_found = False
 155                         for ie in self._ies:
 156                                 if not ie.suitable(url):
 157                                         continue
 158                                 # Suitable InfoExtractor found
 159                                 suitable_found = True
 160                                 results = [x for x in ie.extract(url) if x is not None]
 161
 162                                 if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
 163                                         sys.exit('ERROR: fixed output name but more than one file to download')
 164
 165                                 for result in results:
 166                                         try:
 167                                                 filename = self._params['outtmpl'] % result
 168                                         except (KeyError), err:
 169                                                 sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
 170                                                 continue
 171                                         try:
 172                                                 self.pmkdir(filename)
 173                                         except (OSError, IOError), err:
 174                                                 sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
 175                                                 continue
 176                                         try:
 177                                                 outstream = open(filename, 'wb')
 178                                         except (OSError, IOError), err:
 179                                                 sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
 180                                                 continue
 181                                         try:
 182                                                 self._do_download(outstream, result['url'])
 183                                                 outstream.close()
 184                                         except (OSError, IOError), err:
 185                                                 sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))
 186                                                 continue
 187                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 188                                                 sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
 189                                                 continue
 190                                 break
 191                         if not suitable_found:
 192                                 sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)
 193
 194         def _do_download(self, stream, url):
 195                 request = urllib2.Request(url, None, std_headers)
 196                 data = urllib2.urlopen(request)
 197                 data_len = data.info().get('Content-length', None)
 198                 data_len_str = self.format_bytes(data_len)
 199                 byte_counter = 0
 200                 block_size = 1024
 201                 start = time.time()
 202                 while True:
 203                         percent_str = self.calc_percent(byte_counter, data_len)
 204                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 205                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 206                         self.to_stdout('\r[download] %s of %s at %s ETA %s' %
 207                                         (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 208
 209                         before = time.time()
 210                         data_block = data.read(block_size)
 211                         after = time.time()
 212                         data_block_len = len(data_block)
 213                         if data_block_len == 0:
 214                                 break
 215                         byte_counter += data_block_len
 216                         stream.write(data_block)
 217                         block_size = self.best_block_size(after - before, data_block_len)
 218
 219                 self.to_stdout('')
 220                 if data_len is not None and str(byte_counter) != data_len:
 221                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 222
 223 class InfoExtractor(object):
 224         """Information Extractor class.
 225
 226         Information extractors are the classes that, given a URL, extract
 227         information from the video (or videos) the URL refers to. This
 228         information includes the real video URL, the video title and simplified
 229         title, author and others. It is returned in a list of dictionaries when
 230         calling its extract() method. It is a list because a URL can refer to
 231         more than one video (think of playlists). The dictionaries must include
 232         the following fields:
 233
 234         id:             Video identifier.
 235         url:            Final video URL.
 236         uploader:       Nickname of the video uploader.
 237         title:          Literal title.
 238         stitle:         Simplified title.
 239         ext:            Video filename extension.
 240
 241         Subclasses of this one should re-define the _real_initialize() and
 242         _real_extract() methods, as well as the suitable() static method.
 243         Probably, they should also be instantiated and added to the main
 244         downloader.
 245         """
 246
 247         _ready = False
 248         _downloader = None
 249
 250         def __init__(self, downloader=None):
 251                 """Constructor. Receives an optional downloader."""
 252                 self._ready = False
 253                 self.set_downloader(downloader)
 254
 255         @staticmethod
 256         def suitable(url):
 257                 """Receives a URL and returns True if suitable for this IE."""
 258                 return True
 259
 260         def initialize(self):
 261                 """Initializes an instance (login, etc)."""
 262                 if not self._ready:
 263                         self._real_initialize()
 264                         self._ready = True
 265
 266         def extract(self, url):
 267                 """Extracts URL information and returns it in list of dicts."""
 268                 self.initialize()
 269                 return self._real_extract(url)
 270
 271         def set_downloader(self, downloader):
 272                 """Sets the downloader for this IE."""
 273                 self._downloader = downloader
 274
 275         def to_stdout(self, message):
 276                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 277                         print message
 278
 279         def to_stderr(self, message):
 280                 sys.stderr.write('%s\n' % message)
 281
 282         def _real_initialize(self):
 283                 """Real initialization process. Redefine in subclasses."""
 284                 pass
 285
 286         def _real_extract(self, url):
 287                 """Real extraction process. Redefine in subclasses."""
 288                 pass
 289
 290 class YoutubeIE(InfoExtractor):
 291         """Information extractor for youtube.com."""
 292
 293         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 294         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 295         _NETRC_MACHINE = 'youtube'
 296
 297         def _real_initialize(self):
 298                 if self._downloader is None:
 299                         return
 300
 301                 username = None
 302                 password = None
 303                 downloader_params = self._downloader.get_params()
 304
 305                 # Attempt to use provided username and password or .netrc data
 306                 if downloader_params.get('username', None) is not None:
 307                         username = downloader_params['username']
 308                         password = downloader_params['password']
 309                 elif downloader_params.get('usenetrc', False):
 310                         try:
 311                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 312                                 if info is not None:
 313                                         username = info[0]
 314                                         password = info[2]
 315                                 else:
 316                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 317                         except (IOError, netrc.NetrcParseError), err:
 318                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 319                                 return
 320
 321                 if username is None:
 322                         return
 323
 324                 # Log in
 325                 login_form = {
 326                                 'current_form': 'loginForm',
 327                                 'next':         '/',
 328                                 'action_login': 'Log In',
 329                                 'username':     username,
 330                                 'password':     password,
 331                                 }
 332                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 333                 try:
 334                         self.to_stdout('[youtube] Logging in')
 335                         login_results = urllib2.urlopen(request).read()
 336                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 337                                 self.to_stderr('WARNING: Unable to log in: bad username or password')
 338                                 return
 339                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 340                         self.to_stderr('WARNING: Unable to log in: %s' % str(err))
 341                         return
 342
 343                 # Confirm age
 344                 age_form = {
 345                                 'next_url':             '/',
 346                                 'action_confirm':       'Confirm',
 347                                 }
 348                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 349                 try:
 350                         self.to_stdout('[youtube] Confirming age')
 351                         age_results = urllib2.urlopen(request).read()
 352                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 353                         sys.exit('ERROR: Unable to confirm age: %s' % str(err))
 354
 355         def _real_extract(self, url):
 356                 # Extract video id from URL
 357                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 358                 if mobj is None:
 359                         self.to_stderr('ERROR: Invalid URL: %s' % url)
 360                         return [None]
 361                 video_id = mobj.group(2)
 362
 363                 # Downloader parameters
 364                 format_param = None
 365                 if self._downloader is not None:
 366                         params = self._downloader.get_params()
 367                         format_param = params.get('format', None)
 368
 369                 # Extension
 370                 video_extension = {18: 'mp4'}.get(format_param, 'flv')
 371
 372                 # Normalize URL, including format
 373                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 374                 if format_param is not None:
 375                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 376                 request = urllib2.Request(normalized_url, None, std_headers)
 377                 try:
 378                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 379                         video_webpage = urllib2.urlopen(request).read()
 380                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 381                         sys.exit('ERROR: Unable to download video: %s' % str(err))
 382                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 383
 384                 # "t" param
 385                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 386                 if mobj is None:
 387                         self.to_stderr('ERROR: Unable to extract "t" parameter')
 388                         return [None]
 389                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 390                 if format_param is not None:
 391                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 392                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
 393
 394                 # uploader
 395                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 396                 if mobj is None:
 397                         self.to_stderr('ERROR: Unable to extract uploader nickname')
 398                         return [None]
 399                 video_uploader = mobj.group(1)
 400
 401                 # title
 402                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 403                 if mobj is None:
 404                         self.to_stderr('ERROR: Unable to extract video title')
 405                         return [None]
 406                 video_title = mobj.group(1).decode('utf-8')
 407                 video_title = re.sub(u'&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 408
 409                 # simplified title
 410                 simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
 411                 simple_title = simple_title.strip(u'_')
 412
 413                 # Return information
 414                 return [{
 415                         'id':           video_id,
 416                         'url':          video_real_url,
 417                         'uploader':     video_uploader,
 418                         'title':        video_title,
 419                         'stitle':       simple_title,
 420                         'ext':          video_extension,
 421                         }]
 422
 423 if __name__ == '__main__':
 424         try:
 425                 # General configuration
 426                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 427                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 428
 429                 # Information extractors
 430                 youtube_ie = YoutubeIE()
 431
 432                 # File downloader
 433                 fd = FileDownloader({
 434                         'usenetrc': False,
 435                         'username': None,
 436                         'password': None,
 437                         'quiet': False,
 438                         'format': None,
 439                         'outtmpl': '%(id)s.%(ext)s'
 440                         })
 441                 fd.add_info_extractor(youtube_ie)
 442                 fd.download([
 443                         'http://www.youtube.com/watch?v=t7qdwI7TVe8',
 444                         'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
 445                         'http://www.youtube.com/watch?v=DZRXe1wtC-M',
 446                         ])
 447
 448         except KeyboardInterrupt:
 449                 sys.exit('\nERROR: Interrupted by user')