youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports being able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         simulate:       Do not download the video files.
  59         format:         Video format code.
  60         outtmpl:        Template for output names.
  61         """
  62
  63         _params = None
  64         _ies = []
  65
  66         def __init__(self, params):
  67                 self._ies = []
  68                 self.set_params(params)
  69
  70         @staticmethod
  71         def pmkdir(filename):
  72                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  73                 components = filename.split(os.sep)
  74                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  75                 for dir in aggregate:
  76                         if not os.path.exists(dir):
  77                                 os.mkdir(dir)
  78
  79         @staticmethod
  80         def format_bytes(bytes):
  81                 if bytes is None:
  82                         return 'N/A'
  83                 if bytes == 0:
  84                         exponent = 0
  85                 else:
  86                         exponent = long(math.log(float(bytes), 1024.0))
  87                 suffix = 'bkMGTPEZY'[exponent]
  88                 converted = float(bytes) / float(1024**exponent)
  89                 return '%.2f%s' % (converted, suffix)
  90
  91         @staticmethod
  92         def calc_percent(byte_counter, data_len):
  93                 if data_len is None:
  94                         return '---.-%'
  95                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  96
  97         @staticmethod
  98         def calc_eta(start, now, total, current):
  99                 if total is None:
 100                         return '--:--'
 101                 dif = now - start
 102                 if current == 0 or dif < 0.001: # One millisecond
 103                         return '--:--'
 104                 rate = float(current) / dif
 105                 eta = long((float(total) - float(current)) / rate)
 106                 (eta_mins, eta_secs) = divmod(eta, 60)
 107                 if eta_mins > 99:
 108                         return '--:--'
 109                 return '%02d:%02d' % (eta_mins, eta_secs)
 110
 111         @staticmethod
 112         def calc_speed(start, now, bytes):
 113                 dif = now - start
 114                 if bytes == 0 or dif < 0.001: # One millisecond
 115                         return '%10s' % '---b/s'
 116                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 117
 118         @staticmethod
 119         def best_block_size(elapsed_time, bytes):
 120                 new_min = max(bytes / 2.0, 1.0)
 121                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 122                 if elapsed_time < 0.001:
 123                         return int(new_max)
 124                 rate = bytes / elapsed_time
 125                 if rate > new_max:
 126                         return int(new_max)
 127                 if rate < new_min:
 128                         return int(new_min)
 129                 return int(rate)
 130
 131         def set_params(self, params):
 132                 """Sets parameters."""
 133                 if type(params) != dict:
 134                         raise ValueError('params: dictionary expected')
 135                 self._params = params
 136
 137         def get_params(self):
 138                 """Get parameters."""
 139                 return self._params
 140
 141         def add_info_extractor(self, ie):
 142                 """Add an InfoExtractor object to the end of the list."""
 143                 self._ies.append(ie)
 144                 ie.set_downloader(self)
 145
 146         def to_stdout(self, message, skip_eol=False):
 147                 """Print message to stdout if not in quiet mode."""
 148                 if not self._params.get('quiet', False):
 149                         sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
 150                         sys.stdout.flush()
 151
 152         def to_stderr(self, message):
 153                 """Print message to stderr."""
 154                 sys.stderr.write('%s\n' % message)
 155
 156         def download(self, url_list):
 157                 """Download a given list of URLs."""
 158                 for url in url_list:
 159                         suitable_found = False
 160                         for ie in self._ies:
 161                                 if not ie.suitable(url):
 162                                         continue
 163                                 # Suitable InfoExtractor found
 164                                 suitable_found = True
 165                                 results = [x for x in ie.extract(url) if x is not None]
 166
 167                                 if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
 168                                         sys.exit('ERROR: fixed output name but more than one file to download')
 169
 170                                 if self._params.get('simulate', False):
 171                                         continue
 172
 173                                 for result in results:
 174                                         try:
 175                                                 filename = self._params['outtmpl'] % result
 176                                         except (KeyError), err:
 177                                                 self.to_stderr('ERROR: invalid output template: %s' % str(err))
 178                                                 continue
 179                                         try:
 180                                                 self.pmkdir(filename)
 181                                         except (OSError, IOError), err:
 182                                                 self.to_stderr('ERROR: unable to create directories: %s' % str(err))
 183                                                 continue
 184                                         try:
 185                                                 outstream = open(filename, 'wb')
 186                                         except (OSError, IOError), err:
 187                                                 self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
 188                                                 continue
 189                                         try:
 190                                                 self._do_download(outstream, result['url'])
 191                                                 outstream.close()
 192                                         except (OSError, IOError), err:
 193                                                 self.to_stderr('ERROR: unable to write video data: %s' % str(err))
 194                                                 continue
 195                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 196                                                 self.to_stderr('ERROR: unable to download video data: %s' % str(err))
 197                                                 continue
 198                                 break
 199                         if not suitable_found:
 200                                 self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
 201
 202         def _do_download(self, stream, url):
 203                 request = urllib2.Request(url, None, std_headers)
 204                 data = urllib2.urlopen(request)
 205                 data_len = data.info().get('Content-length', None)
 206                 data_len_str = self.format_bytes(data_len)
 207                 byte_counter = 0
 208                 block_size = 1024
 209                 start = time.time()
 210                 while True:
 211                         percent_str = self.calc_percent(byte_counter, data_len)
 212                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 213                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 214                         self.to_stdout('\r[download] %s of %s at %s ETA %s' %
 215                                         (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 216
 217                         before = time.time()
 218                         data_block = data.read(block_size)
 219                         after = time.time()
 220                         data_block_len = len(data_block)
 221                         if data_block_len == 0:
 222                                 break
 223                         byte_counter += data_block_len
 224                         stream.write(data_block)
 225                         block_size = self.best_block_size(after - before, data_block_len)
 226
 227                 self.to_stdout('')
 228                 if data_len is not None and str(byte_counter) != data_len:
 229                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 230
 231 class InfoExtractor(object):
 232         """Information Extractor class.
 233
 234         Information extractors are the classes that, given a URL, extract
 235         information from the video (or videos) the URL refers to. This
 236         information includes the real video URL, the video title and simplified
 237         title, author and others. It is returned in a list of dictionaries when
 238         calling its extract() method. It is a list because a URL can refer to
 239         more than one video (think of playlists). The dictionaries must include
 240         the following fields:
 241
 242         id:             Video identifier.
 243         url:            Final video URL.
 244         uploader:       Nickname of the video uploader.
 245         title:          Literal title.
 246         stitle:         Simplified title.
 247         ext:            Video filename extension.
 248
 249         Subclasses of this one should re-define the _real_initialize() and
 250         _real_extract() methods, as well as the suitable() static method.
 251         Probably, they should also be instantiated and added to the main
 252         downloader.
 253         """
 254
 255         _ready = False
 256         _downloader = None
 257
 258         def __init__(self, downloader=None):
 259                 """Constructor. Receives an optional downloader."""
 260                 self._ready = False
 261                 self.set_downloader(downloader)
 262
 263         @staticmethod
 264         def suitable(url):
 265                 """Receives a URL and returns True if suitable for this IE."""
 266                 return True
 267
 268         def initialize(self):
 269                 """Initializes an instance (login, etc)."""
 270                 if not self._ready:
 271                         self._real_initialize()
 272                         self._ready = True
 273
 274         def extract(self, url):
 275                 """Extracts URL information and returns it in list of dicts."""
 276                 self.initialize()
 277                 return self._real_extract(url)
 278
 279         def set_downloader(self, downloader):
 280                 """Sets the downloader for this IE."""
 281                 self._downloader = downloader
 282
 283         def to_stdout(self, message):
 284                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 285                         print message
 286
 287         def to_stderr(self, message):
 288                 sys.stderr.write('%s\n' % message)
 289
 290         def _real_initialize(self):
 291                 """Real initialization process. Redefine in subclasses."""
 292                 pass
 293
 294         def _real_extract(self, url):
 295                 """Real extraction process. Redefine in subclasses."""
 296                 pass
 297
 298 class YoutubeIE(InfoExtractor):
 299         """Information extractor for youtube.com."""
 300
 301         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 302         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 303         _NETRC_MACHINE = 'youtube'
 304
 305         def _real_initialize(self):
 306                 if self._downloader is None:
 307                         return
 308
 309                 username = None
 310                 password = None
 311                 downloader_params = self._downloader.get_params()
 312
 313                 # Attempt to use provided username and password or .netrc data
 314                 if downloader_params.get('username', None) is not None:
 315                         username = downloader_params['username']
 316                         password = downloader_params['password']
 317                 elif downloader_params.get('usenetrc', False):
 318                         try:
 319                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 320                                 if info is not None:
 321                                         username = info[0]
 322                                         password = info[2]
 323                                 else:
 324                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 325                         except (IOError, netrc.NetrcParseError), err:
 326                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 327                                 return
 328
 329                 if username is None:
 330                         return
 331
 332                 # Log in
 333                 login_form = {
 334                                 'current_form': 'loginForm',
 335                                 'next':         '/',
 336                                 'action_login': 'Log In',
 337                                 'username':     username,
 338                                 'password':     password,
 339                                 }
 340                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 341                 try:
 342                         self.to_stdout('[youtube] Logging in')
 343                         login_results = urllib2.urlopen(request).read()
 344                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 345                                 self.to_stderr('WARNING: Unable to log in: bad username or password')
 346                                 return
 347                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 348                         self.to_stderr('WARNING: Unable to log in: %s' % str(err))
 349                         return
 350
 351                 # Confirm age
 352                 age_form = {
 353                                 'next_url':             '/',
 354                                 'action_confirm':       'Confirm',
 355                                 }
 356                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 357                 try:
 358                         self.to_stdout('[youtube] Confirming age')
 359                         age_results = urllib2.urlopen(request).read()
 360                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 361                         sys.exit('ERROR: Unable to confirm age: %s' % str(err))
 362
 363         def _real_extract(self, url):
 364                 # Extract video id from URL
 365                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 366                 if mobj is None:
 367                         self.to_stderr('ERROR: Invalid URL: %s' % url)
 368                         return [None]
 369                 video_id = mobj.group(2)
 370
 371                 # Downloader parameters
 372                 format_param = None
 373                 if self._downloader is not None:
 374                         params = self._downloader.get_params()
 375                         format_param = params.get('format', None)
 376
 377                 # Extension
 378                 video_extension = {18: 'mp4'}.get(format_param, 'flv')
 379
 380                 # Normalize URL, including format
 381                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 382                 if format_param is not None:
 383                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 384                 request = urllib2.Request(normalized_url, None, std_headers)
 385                 try:
 386                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 387                         video_webpage = urllib2.urlopen(request).read()
 388                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 389                         sys.exit('ERROR: Unable to download video: %s' % str(err))
 390                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 391
 392                 # "t" param
 393                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 394                 if mobj is None:
 395                         self.to_stderr('ERROR: Unable to extract "t" parameter')
 396                         return [None]
 397                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 398                 if format_param is not None:
 399                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 400                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
 401
 402                 # uploader
 403                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 404                 if mobj is None:
 405                         self.to_stderr('ERROR: Unable to extract uploader nickname')
 406                         return [None]
 407                 video_uploader = mobj.group(1)
 408
 409                 # title
 410                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 411                 if mobj is None:
 412                         self.to_stderr('ERROR: Unable to extract video title')
 413                         return [None]
 414                 video_title = mobj.group(1).decode('utf-8')
 415                 video_title = re.sub(u'&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 416
 417                 # simplified title
 418                 simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
 419                 simple_title = simple_title.strip(u'_')
 420
 421                 # Return information
 422                 return [{
 423                         'id':           video_id,
 424                         'url':          video_real_url,
 425                         'uploader':     video_uploader,
 426                         'title':        video_title,
 427                         'stitle':       simple_title,
 428                         'ext':          video_extension,
 429                         }]
 430
 431 if __name__ == '__main__':
 432         try:
 433                 # General configuration
 434                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 435                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 436
 437                 # Information extractors
 438                 youtube_ie = YoutubeIE()
 439
 440                 # File downloader
 441                 fd = FileDownloader({
 442                         'usenetrc': False,
 443                         'username': None,
 444                         'password': None,
 445                         'quiet': False,
 446                         'simulate': True,
 447                         'format': None,
 448                         'outtmpl': '%(id)s.%(ext)s'
 449                         })
 450                 fd.add_info_extractor(youtube_ie)
 451                 fd.download([
 452                         'http://www.youtube.com/watch?v=t7qdwI7TVe8',
 453                         'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
 454                         'http://www.youtube.com/watch?v=DZRXe1wtC-M',
 455                         ])
 456
 457         except KeyboardInterrupt:
 458                 sys.exit('\nERROR: Interrupted by user')