youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports being able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         format:         Video format code.
  59         outtmpl:        Template for output names.
  60         """
  61
  62         _params = None
  63         _ies = []
  64
  65         def __init__(self, params):
  66                 self._ies = []
  67                 self.set_params(params)
  68
  69         @staticmethod
  70         def pmkdir(filename):
  71                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  72                 components = filename.split(os.sep)
  73                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  74                 for dir in aggregate:
  75                         if not os.path.exists(dir):
  76                                 os.mkdir(dir)
  77
  78         @staticmethod
  79         def format_bytes(bytes):
  80                 if bytes is None:
  81                         return 'N/A'
  82                 if bytes == 0:
  83                         exponent = 0
  84                 else:
  85                         exponent = long(math.log(float(bytes), 1024.0))
  86                 suffix = 'bkMGTPEZY'[exponent]
  87                 if exponent == 0:
  88                         return '%s%s' % (bytes, suffix)
  89                 converted = float(bytes) / float(1024**exponent)
  90                 return '%.2f%s' % (converted, suffix)
  91
  92         @staticmethod
  93         def calc_percent(byte_counter, data_len):
  94                 if data_len is None:
  95                         return '---.-%'
  96                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  97
  98         @staticmethod
  99         def calc_eta(start, now, total, current):
 100                 if total is None:
 101                         return '--:--'
 102                 dif = now - start
 103                 if current == 0 or dif < 0.001: # One millisecond
 104                         return '--:--'
 105                 rate = float(current) / dif
 106                 eta = long((float(total) - float(current)) / rate)
 107                 (eta_mins, eta_secs) = divmod(eta, 60)
 108                 if eta_mins > 99:
 109                         return '--:--'
 110                 return '%02d:%02d' % (eta_mins, eta_secs)
 111
 112         @staticmethod
 113         def calc_speed(start, now, bytes):
 114                 dif = now - start
 115                 if bytes == 0 or dif < 0.001: # One millisecond
 116                         return '%9s' % 'N/A b/s'
 117                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 118
 119         @staticmethod
 120         def best_block_size(elapsed_time, bytes):
 121                 new_min = max(bytes / 2.0, 1.0)
 122                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 123                 if elapsed_time < 0.001:
 124                         return int(new_max)
 125                 rate = bytes / elapsed_time
 126                 if rate > new_max:
 127                         return int(new_max)
 128                 if rate < new_min:
 129                         return int(new_min)
 130                 return int(rate)
 131
 132         def set_params(self, params):
 133                 """Sets parameters."""
 134                 if type(params) != dict:
 135                         raise ValueError('params: dictionary expected')
 136                 self._params = params
 137
 138         def get_params(self):
 139                 """Get parameters."""
 140                 return self._params
 141
 142         def add_info_extractor(self, ie):
 143                 """Add an InfoExtractor object to the end of the list."""
 144                 self._ies.append(ie)
 145                 ie.set_downloader(self)
 146
 147         def download(self, url_list):
 148                 """Download a given list of URLs."""
 149                 for url in url_list:
 150                         suitable_found = False
 151                         for ie in self._ies:
 152                                 if not ie.suitable(url):
 153                                         continue
 154                                 # Suitable InfoExtractor found
 155                                 suitable_found = True
 156                                 results = [x for x in ie.extract(url) if x is not None]
 157
 158                                 if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
 159                                         sys.exit('ERROR: fixed output name but more than one file to download')
 160
 161                                 for result in results:
 162                                         try:
 163                                                 filename = self._params['outtmpl'] % result
 164                                         except (KeyError), err:
 165                                                 sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
 166                                                 continue
 167                                         try:
 168                                                 self.pmkdir(filename)
 169                                         except (OSError, IOError), err:
 170                                                 sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
 171                                                 continue
 172                                         try:
 173                                                 outstream = open(filename, 'wb')
 174                                         except (OSError, IOError), err:
 175                                                 sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
 176                                                 continue
 177                                         try:
 178                                                 self._do_download(outstream, result['url'])
 179                                                 outstream.close()
 180                                         except (OSError, IOError), err:
 181                                                 sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))
 182                                                 continue
 183                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 184                                                 sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
 185                                                 continue
 186                                 break
 187                         if not suitable_found:
 188                                 sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)
 189
 190         def _do_download(self, stream, url):
 191                 request = urllib2.Request(url, None, std_headers)
 192                 data = urllib2.urlopen(request)
 193                 data_len = data.info().get('Content-length', None)
 194                 data_len_str = self.format_bytes(data_len)
 195                 byte_counter = 0
 196                 block_size = 1024
 197                 start = time.time()
 198                 while True:
 199                         percent_str = self.calc_percent(byte_counter, data_len)
 200                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 201                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 202
 203                         if not self._params.get('quiet', False):
 204                                 sys.stdout.write('\r[download] %s of %s at %s ETA %s' %
 205                                                 (percent_str, data_len_str, speed_str, eta_str))
 206                                 sys.stdout.flush()
 207
 208                         before = time.time()
 209                         data_block = data.read(block_size)
 210                         after = time.time()
 211                         data_block_len = len(data_block)
 212                         if data_block_len == 0:
 213                                 break
 214                         byte_counter += data_block_len
 215                         stream.write(data_block)
 216                         block_size = self.best_block_size(after - before, data_block_len)
 217
 218                 if not self._params.get('quiet', False):
 219                         print
 220
 221                 if data_len is not None and str(byte_counter) != data_len:
 222                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 223
 224 class InfoExtractor(object):
 225         """Information Extractor class.
 226
 227         Information extractors are the classes that, given a URL, extract
 228         information from the video (or videos) the URL refers to. This
 229         information includes the real video URL, the video title and simplified
 230         title, author and others. It is returned in a list of dictionaries when
 231         calling its extract() method. It is a list because a URL can refer to
 232         more than one video (think of playlists). The dictionaries must include
 233         the following fields:
 234
 235         id:             Video identifier.
 236         url:            Final video URL.
 237         uploader:       Nickname of the video uploader.
 238         title:          Literal title.
 239         stitle:         Simplified title.
 240         ext:            Video filename extension.
 241
 242         Subclasses of this one should re-define the _real_initialize() and
 243         _real_extract() methods, as well as the suitable() static method.
 244         Probably, they should also be instantiated and added to the main
 245         downloader.
 246         """
 247
 248         _ready = False
 249         _downloader = None
 250
 251         def __init__(self, downloader=None):
 252                 """Constructor. Receives an optional downloader."""
 253                 self._ready = False
 254                 self.set_downloader(downloader)
 255
 256         @staticmethod
 257         def suitable(url):
 258                 """Receives a URL and returns True if suitable for this IE."""
 259                 return True
 260
 261         def initialize(self):
 262                 """Initializes an instance (login, etc)."""
 263                 if not self._ready:
 264                         self._real_initialize()
 265                         self._ready = True
 266
 267         def extract(self, url):
 268                 """Extracts URL information and returns it in list of dicts."""
 269                 self.initialize()
 270                 return self._real_extract(url)
 271
 272         def set_downloader(self, downloader):
 273                 """Sets the downloader for this IE."""
 274                 self._downloader = downloader
 275
 276         def to_stdout(self, message):
 277                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 278                         print message
 279
 280         def to_stderr(self, message):
 281                 sys.stderr.write('%s\n' % message)
 282
 283         def _real_initialize(self):
 284                 """Real initialization process. Redefine in subclasses."""
 285                 pass
 286
 287         def _real_extract(self, url):
 288                 """Real extraction process. Redefine in subclasses."""
 289                 pass
 290
 291 class YoutubeIE(InfoExtractor):
 292         """Information extractor for youtube.com."""
 293
 294         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 295         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 296         _NETRC_MACHINE = 'youtube'
 297
 298         def _real_initialize(self):
 299                 if self._downloader is None:
 300                         return
 301
 302                 username = None
 303                 password = None
 304                 downloader_params = self._downloader.get_params()
 305
 306                 # Attempt to use provided username and password or .netrc data
 307                 if downloader_params.get('username', None) is not None:
 308                         username = downloader_params['username']
 309                         password = downloader_params['password']
 310                 elif downloader_params.get('usenetrc', False):
 311                         try:
 312                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 313                                 if info is not None:
 314                                         username = info[0]
 315                                         password = info[2]
 316                                 else:
 317                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 318                         except (IOError, netrc.NetrcParseError), err:
 319                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 320                                 return
 321
 322                 if username is None:
 323                         return
 324
 325                 # Log in
 326                 login_form = {  'current_form': 'loginForm',
 327                                 'next':         '/',
 328                                 'action_login': 'Log In',
 329                                 'username':     username,
 330                                 'password':     password,       }
 331                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 332                 try:
 333                         self.to_stdout('[youtube] Logging in')
 334                         login_results = urllib2.urlopen(request).read()
 335                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 336                                 self.to_stderr('WARNING: Unable to log in: bad username or password')
 337                                 return
 338                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 339                         self.to_stderr('WARNING: Unable to log in: %s' % str(err))
 340                         return
 341
 342                 # Confirm age
 343                 age_form = {    'next_url':             '/',
 344                                 'action_confirm':       'Confirm',      }
 345                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 346                 try:
 347                         self.to_stdout('[youtube] Confirming age')
 348                         age_results = urllib2.urlopen(request).read()
 349                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 350                         sys.exit('ERROR: Unable to confirm age: %s' % str(err))
 351
 352         def _real_extract(self, url):
 353                 # Extract video id from URL
 354                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 355                 if mobj is None:
 356                         self.to_stderr('ERROR: Invalid URL: %s' % url)
 357                         return [None]
 358                 video_id = mobj.group(2)
 359
 360                 # Downloader parameters
 361                 format_param = None
 362                 if self._downloader is not None:
 363                         params = self._downloader.get_params()
 364                         format_param = params.get('format', None)
 365
 366                 # Extension
 367                 video_extension = {18: 'mp4'}.get(format_param, 'flv')
 368
 369                 # Normalize URL, including format
 370                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 371                 if format_param is not None:
 372                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 373                 request = urllib2.Request(normalized_url, None, std_headers)
 374                 try:
 375                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 376                         video_webpage = urllib2.urlopen(request).read()
 377                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 378                         sys.exit('ERROR: Unable to download video: %s' % str(err))
 379                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 380
 381                 # "t" param
 382                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 383                 if mobj is None:
 384                         self.to_stderr('ERROR: Unable to extract "t" parameter')
 385                         return [None]
 386                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 387                 if format_param is not None:
 388                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 389
 390                 # uploader
 391                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 392                 if mobj is None:
 393                         self.to_stderr('ERROR: Unable to extract uploader nickname')
 394                         return [None]
 395                 video_uploader = mobj.group(1)
 396
 397                 # title
 398                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 399                 if mobj is None:
 400                         self.to_stderr('ERROR: Unable to extract video title')
 401                         return [None]
 402                 video_title = mobj.group(1).decode('utf-8')
 403                 video_title = re.sub(u'&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 404
 405                 # simplified title
 406                 simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
 407                 simple_title = simple_title.strip(u'_')
 408
 409                 # Return information
 410                 return [{       'id':           video_id,
 411                                 'url':          video_real_url,
 412                                 'uploader':     video_uploader,
 413                                 'title':        video_title,
 414                                 'stitle':       simple_title,
 415                                 'ext':          video_extension,
 416                                 }]
 417
 418 if __name__ == '__main__':
 419         try:
 420                 # General configuration
 421                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 422                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 423
 424                 # Information extractors
 425                 youtube_ie = YoutubeIE()
 426
 427                 # File downloader
 428                 fd = FileDownloader({   'usenetrc': False,
 429                                         'username': None,
 430                                         'password': None,
 431                                         'quiet': False,
 432                                         'format': None,
 433                                         'outtmpl': '%(id)s.%(ext)s'
 434                                         })
 435                 fd.add_info_extractor(youtube_ie)
 436                 fd.download([   'http://www.youtube.com/watch?v=t7qdwI7TVe8',
 437                                 'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
 438                                 'http://www.youtube.com/watch?v=DZRXe1wtC-M',   ])
 439
 440         except KeyboardInterrupt:
 441                 sys.exit('\nERROR: Interrupted by user')