gitweb @ CieloNegro.org — repository youtube-dl.git, blob view of the file "youtube-dl"
Commit message: "Put Danny Colligan as an author in the script itself"
Path: [youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
# Default HTTP headers sent with every request; they imitate a regular
# Firefox 3 browser so that sites serve their normal web pages.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept-Language': 'en-us,en;q=0.5',
}
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when a download problem occurs and
        they have not been configured to continue on errors; carries the
        relevant error message.
        """
        pass
38
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that several
        downloads would end up writing to the very same file on disk.
        """
        pass
46
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised from a PostProcessor's .run() method to signal that the
        postprocessing task failed.
        """
        pass
54
55 class FileDownloader(object):
56         """File Downloader class.
57
58         File downloader objects are the ones responsible of downloading the
59         actual video file and writing it to disk if the user has requested
60         it, among some other tasks. In most cases there should be one per
61         program. As, given a video URL, the downloader doesn't know how to
62         extract all the needed information, task that InfoExtractors do, it
63         has to pass the URL to one of them.
64
65         For this, file downloader objects have a method that allows
66         InfoExtractors to be registered in a given order. When it is passed
67         a URL, the file downloader handles it to the first InfoExtractor it
68         finds that reports being able to handle it. The InfoExtractor returns
69         all the information to the FileDownloader and the latter downloads the
70         file or does whatever it's instructed to do.
71
72         File downloaders accept a lot of parameters. In order not to saturate
73         the object constructor with arguments, it receives a dictionary of
74         options instead. These options are available through the get_params()
75         method for the InfoExtractors to use. The FileDownloader also registers
76         itself as the downloader in charge for the InfoExtractors that are
77         added to it, so this is a "mutual registration".
78
79         Available options:
80
81         username:       Username for authentication purposes.
82         password:       Password for authentication purposes.
83         usenetrc:       Use netrc for authentication instead.
84         quiet:          Do not print messages to stdout.
85         forceurl:       Force printing final URL.
86         forcetitle:     Force printing title.
87         simulate:       Do not download the video files.
88         format:         Video format code.
89         outtmpl:        Template for output names.
90         ignoreerrors:   Do not stop on download errors.
91         ratelimit:      Download speed limit, in bytes/sec.
92         """
93
94         _params = None
95         _ies = []
96         _pps = []
97
98         def __init__(self, params):
99                 """Create a FileDownloader object with the given options."""
100                 self._ies = []
101                 self._pps = []
102                 self.set_params(params)
103         
104         @staticmethod
105         def pmkdir(filename):
106                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
107                 components = filename.split(os.sep)
108                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
109                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
110                 for dir in aggregate:
111                         if not os.path.exists(dir):
112                                 os.mkdir(dir)
113         
114         @staticmethod
115         def format_bytes(bytes):
116                 if bytes is None:
117                         return 'N/A'
118                 if bytes == 0:
119                         exponent = 0
120                 else:
121                         exponent = long(math.log(float(bytes), 1024.0))
122                 suffix = 'bkMGTPEZY'[exponent]
123                 converted = float(bytes) / float(1024**exponent)
124                 return '%.2f%s' % (converted, suffix)
125
126         @staticmethod
127         def calc_percent(byte_counter, data_len):
128                 if data_len is None:
129                         return '---.-%'
130                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
131
132         @staticmethod
133         def calc_eta(start, now, total, current):
134                 if total is None:
135                         return '--:--'
136                 dif = now - start
137                 if current == 0 or dif < 0.001: # One millisecond
138                         return '--:--'
139                 rate = float(current) / dif
140                 eta = long((float(total) - float(current)) / rate)
141                 (eta_mins, eta_secs) = divmod(eta, 60)
142                 if eta_mins > 99:
143                         return '--:--'
144                 return '%02d:%02d' % (eta_mins, eta_secs)
145
146         @staticmethod
147         def calc_speed(start, now, bytes):
148                 dif = now - start
149                 if bytes == 0 or dif < 0.001: # One millisecond
150                         return '%10s' % '---b/s'
151                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
152
153         @staticmethod
154         def best_block_size(elapsed_time, bytes):
155                 new_min = max(bytes / 2.0, 1.0)
156                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
157                 if elapsed_time < 0.001:
158                         return int(new_max)
159                 rate = bytes / elapsed_time
160                 if rate > new_max:
161                         return int(new_max)
162                 if rate < new_min:
163                         return int(new_min)
164                 return int(rate)
165
166         @staticmethod
167         def parse_bytes(bytestr):
168                 """Parse a string indicating a byte quantity into a long integer."""
169                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
170                 if matchobj is None:
171                         return None
172                 number = float(matchobj.group(1))
173                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
174                 return long(round(number * multiplier))
175
176         def set_params(self, params):
177                 """Sets parameters."""
178                 if type(params) != dict:
179                         raise ValueError('params: dictionary expected')
180                 self._params = params
181         
182         def get_params(self):
183                 """Get parameters."""
184                 return self._params
185
186         def add_info_extractor(self, ie):
187                 """Add an InfoExtractor object to the end of the list."""
188                 self._ies.append(ie)
189                 ie.set_downloader(self)
190         
191         def add_post_processor(self, pp):
192                 """Add a PostProcessor object to the end of the chain."""
193                 self._pps.append(pp)
194                 pp.set_downloader(self)
195         
196         def to_stdout(self, message, skip_eol=False):
197                 """Print message to stdout if not in quiet mode."""
198                 if not self._params.get('quiet', False):
199                         print u'%s%s' % (message, [u'\n', u''][skip_eol]),
200                         sys.stdout.flush()
201         
202         def to_stderr(self, message):
203                 """Print message to stderr."""
204                 print >>sys.stderr, message
205         
206         def fixed_template(self):
207                 """Checks if the output template is fixed."""
208                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
209
210         def trouble(self, message=None):
211                 """Determine action to take when a download problem appears.
212
213                 Depending on if the downloader has been configured to ignore
214                 download errors or not, this method may throw an exception or
215                 not when errors are found, after printing the message. If it
216                 doesn't raise, it returns an error code suitable to be returned
217                 later as a program exit code to indicate error.
218                 """
219                 if message is not None:
220                         self.to_stderr(message)
221                 if not self._params.get('ignoreerrors', False):
222                         raise DownloadError(message)
223                 return 1
224
225         def slow_down(self, start_time, byte_counter):
226                 """Sleep if the download speed is over the rate limit."""
227                 rate_limit = self._params.get('ratelimit', None)
228                 if rate_limit is None or byte_counter == 0:
229                         return
230                 now = time.time()
231                 elapsed = now - start_time
232                 if elapsed <= 0.0:
233                         return
234                 speed = float(byte_counter) / elapsed
235                 if speed > rate_limit:
236                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
237
238         def report_destination(self, filename):
239                 """Report destination filename."""
240                 self.to_stdout(u'[download] Destination: %s' % filename)
241         
242         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
243                 """Report download progress."""
244                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
245                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
246         
247         def report_finish(self):
248                 """Report download finished."""
249                 self.to_stdout(u'')
250
251         def download(self, url_list):
252                 """Download a given list of URLs."""
253                 retcode = 0
254                 if len(url_list) > 1 and self.fixed_template():
255                         raise SameFileError(self._params['outtmpl'])
256
257                 for url in url_list:
258                         suitable_found = False
259                         for ie in self._ies:
260                                 if not ie.suitable(url):
261                                         continue
262                                 # Suitable InfoExtractor found
263                                 suitable_found = True
264                                 all_results = ie.extract(url)
265                                 results = [x for x in all_results if x is not None]
266                                 if len(results) != len(all_results):
267                                         retcode = self.trouble()
268
269                                 if len(results) > 1 and self.fixed_template():
270                                         raise SameFileError(self._params['outtmpl'])
271
272                                 for result in results:
273                                         # Forced printings
274                                         if self._params.get('forcetitle', False):
275                                                 print result['title']
276                                         if self._params.get('forceurl', False):
277                                                 print result['url']
278                                                 
279                                         # Do nothing else if in simulate mode
280                                         if self._params.get('simulate', False):
281                                                 continue
282
283                                         try:
284                                                 filename = self._params['outtmpl'] % result
285                                                 self.report_destination(filename)
286                                         except (ValueError, KeyError), err:
287                                                 retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
288                                                 continue
289                                         try:
290                                                 self.pmkdir(filename)
291                                         except (OSError, IOError), err:
292                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
293                                                 continue
294                                         try:
295                                                 outstream = open(filename, 'wb')
296                                         except (OSError, IOError), err:
297                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
298                                                 continue
299                                         try:
300                                                 self._do_download(outstream, result['url'])
301                                                 outstream.close()
302                                         except (OSError, IOError), err:
303                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
304                                                 continue
305                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
306                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
307                                                 continue
308                                         try:
309                                                 self.post_process(filename, result)
310                                         except (PostProcessingError), err:
311                                                 retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
312                                                 continue
313
314                                 break
315                         if not suitable_found:
316                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
317
318                 return retcode
319
320         def post_process(self, filename, ie_info):
321                 """Run the postprocessing chain on the given file."""
322                 info = dict(ie_info)
323                 info['filepath'] = filename
324                 for pp in self._pps:
325                         info = pp.run(info)
326                         if info is None:
327                                 break
328         
329         def _do_download(self, stream, url):
330                 request = urllib2.Request(url, None, std_headers)
331                 data = urllib2.urlopen(request)
332                 data_len = data.info().get('Content-length', None)
333                 data_len_str = self.format_bytes(data_len)
334                 byte_counter = 0
335                 block_size = 1024
336                 start = time.time()
337                 while True:
338                         # Progress message
339                         percent_str = self.calc_percent(byte_counter, data_len)
340                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
341                         speed_str = self.calc_speed(start, time.time(), byte_counter)
342                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
343
344                         # Download and write
345                         before = time.time()
346                         data_block = data.read(block_size)
347                         after = time.time()
348                         data_block_len = len(data_block)
349                         if data_block_len == 0:
350                                 break
351                         byte_counter += data_block_len
352                         stream.write(data_block)
353                         block_size = self.best_block_size(after - before, data_block_len)
354
355                         # Apply rate limit
356                         self.slow_down(start, byte_counter)
357
358                 self.report_finish()
359                 if data_len is not None and str(byte_counter) != data_len:
360                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
361
class InfoExtractor(object):
        """Information Extractor class.

        Information extractors are the classes that, given a URL, extract
        information from the video (or videos) the URL refers to. This
        information includes the real video URL, the video title and simplified
        title, author and others. It is returned in a list of dictionaries when
        calling its extract() method. It is a list because a URL can refer to
        more than one video (think of playlists). The dictionaries must include
        the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
        Probably, they should also be instantiated and added to the main
        downloader.
        """

        # Class-level defaults; rebound per instance in __init__ and
        # set_downloader.
        _ready = False          # True once _real_initialize() has run
        _downloader = None      # FileDownloader in charge of this IE, if any

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                # The base class accepts nothing; subclasses override this.
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                # Lazy one-shot initialization: _real_initialize() runs at
                # most once per instance.
                if not self._ready:
                        self._real_initialize()
                        self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message to stdout if downloader is not in quiet mode."""
                # With no downloader attached there is no 'quiet' option, so
                # the message is always printed.
                if self._downloader is None or not self._downloader.get_params().get('quiet', False):
                        print message

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
430
431 class YoutubeIE(InfoExtractor):
432         """Information extractor for youtube.com."""
433
434         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
435         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
436         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
437         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
438         _NETRC_MACHINE = 'youtube'
439
440         @staticmethod
441         def suitable(url):
442                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
443
444         def report_lang(self):
445                 """Report attempt to set language."""
446                 self.to_stdout(u'[youtube] Setting language')
447
448         def report_login(self):
449                 """Report attempt to log in."""
450                 self.to_stdout(u'[youtube] Logging in')
451         
452         def report_age_confirmation(self):
453                 """Report attempt to confirm age."""
454                 self.to_stdout(u'[youtube] Confirming age')
455         
456         def report_webpage_download(self, video_id):
457                 """Report attempt to download webpage."""
458                 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
459         
460         def report_information_extraction(self, video_id):
461                 """Report attempt to extract video information."""
462                 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
463         
464         def report_video_url(self, video_id, video_real_url):
465                 """Report extracted video URL."""
466                 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
467
468         def _real_initialize(self):
469                 if self._downloader is None:
470                         return
471
472                 username = None
473                 password = None
474                 downloader_params = self._downloader.get_params()
475
476                 # Attempt to use provided username and password or .netrc data
477                 if downloader_params.get('username', None) is not None:
478                         username = downloader_params['username']
479                         password = downloader_params['password']
480                 elif downloader_params.get('usenetrc', False):
481                         try:
482                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
483                                 if info is not None:
484                                         username = info[0]
485                                         password = info[2]
486                                 else:
487                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
488                         except (IOError, netrc.NetrcParseError), err:
489                                 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
490                                 return
491
492                 # No authentication to be performed
493                 if username is None:
494                         return
495
496                 # Set language
497                 request = urllib2.Request(self._LOGIN_URL, None, std_headers)
498                 try:
499                         self.report_lang()
500                         urllib2.urlopen(request).read()
501                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
502                         self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
503                         return
504
505                 # Log in
506                 login_form = {
507                                 'current_form': 'loginForm',
508                                 'next':         '/',
509                                 'action_login': 'Log In',
510                                 'username':     username,
511                                 'password':     password,
512                                 }
513                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
514                 try:
515                         self.report_login()
516                         login_results = urllib2.urlopen(request).read()
517                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
518                                 self.to_stderr(u'WARNING: unable to log in: bad username or password')
519                                 return
520                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
521                         self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
522                         return
523         
524                 # Confirm age
525                 age_form = {
526                                 'next_url':             '/',
527                                 'action_confirm':       'Confirm',
528                                 }
529                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
530                 try:
531                         self.report_age_confirmation()
532                         age_results = urllib2.urlopen(request).read()
533                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
534                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
535                         return
536
537         def _real_extract(self, url):
538                 # Extract video id from URL
539                 mobj = re.match(self._VALID_URL, url)
540                 if mobj is None:
541                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
542                         return [None]
543                 video_id = mobj.group(2)
544
545                 # Downloader parameters
546                 format_param = None
547                 if self._downloader is not None:
548                         params = self._downloader.get_params()
549                         format_param = params.get('format', None)
550
551                 # Extension
552                 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
553
554                 # Normalize URL, including format
555                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
556                 if format_param is not None:
557                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
558                 request = urllib2.Request(normalized_url, None, std_headers)
559                 try:
560                         self.report_webpage_download(video_id)
561                         video_webpage = urllib2.urlopen(request).read()
562                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
563                         self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
564                         return [None]
565                 self.report_information_extraction(video_id)
566                 
567                 # "t" param
568                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
569                 if mobj is None:
570                         self.to_stderr(u'ERROR: unable to extract "t" parameter')
571                         return [None]
572                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
573                 if format_param is not None:
574                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
575                 self.report_video_url(video_id, video_real_url)
576
577                 # uploader
578                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
579                 if mobj is None:
580                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
581                         return [None]
582                 video_uploader = mobj.group(1)
583
584                 # title
585                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
586                 if mobj is None:
587                         self.to_stderr(u'ERROR: unable to extract video title')
588                         return [None]
589                 video_title = mobj.group(1).decode('utf-8')
590                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
591                 video_title = video_title.replace(os.sep, u'%')
592
593                 # simplified title
594                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
595                 simple_title = simple_title.strip(ur'_')
596
597                 # Return information
598                 return [{
599                         'id':           video_id.decode('utf-8'),
600                         'url':          video_real_url.decode('utf-8'),
601                         'uploader':     video_uploader.decode('utf-8'),
602                         'title':        video_title,
603                         'stitle':       simple_title,
604                         'ext':          video_extension.decode('utf-8'),
605                         }]
606
607 class MetacafeIE(InfoExtractor):
608         """Information Extractor for metacafe.com."""
609
610         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
611         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
612         _youtube_ie = None
613
        def __init__(self, youtube_ie, downloader=None):
                """Constructor. Keeps the YoutubeIE used to delegate YouTube-hosted videos."""
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie
617
        @staticmethod
        def suitable(url):
                """Return True for metacafe.com watch URLs (see _VALID_URL)."""
                return (re.match(MetacafeIE._VALID_URL, url) is not None)
621
        def report_disclaimer(self):
                """Report disclaimer retrieval."""
                self.to_stdout(u'[metacafe] Retrieving disclaimer')
625
        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self.to_stdout(u'[metacafe] Confirming age')
629         
        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
633         
        def report_extraction(self, video_id):
                """Report information extraction."""
                self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
637
        def _real_initialize(self):
                """Fetch the family-filter disclaimer page and confirm age.

                Both steps only set server-side state (cookies, presumably —
                TODO confirm urllib2 retains them across requests here); the
                response bodies are read and discarded. Failures are reported
                and abort initialization without raising.
                """
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER, None, std_headers)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age by posting the family-filter form.
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return
660         
661         def _real_extract(self, url):
662                 # Extract id and simplified title from URL
663                 mobj = re.match(self._VALID_URL, url)
664                 if mobj is None:
665                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
666                         return [None]
667
668                 video_id = mobj.group(1)
669
670                 # Check if video comes from YouTube
671                 mobj2 = re.match(r'^yt-(.*)$', video_id)
672                 if mobj2 is not None:
673                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
674
675                 simple_title = mobj.group(2).decode('utf-8')
676                 video_extension = 'flv'
677
678                 # Retrieve video webpage to extract further information
679                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
680                 try:
681                         self.report_download_webpage(video_id)
682                         webpage = urllib2.urlopen(request).read()
683                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
684                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
685                         return [None]
686
687                 # Extract URL, uploader and title from webpage
688                 self.report_extraction(video_id)
689                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
690                 if mobj is None:
691                         self.to_stderr(u'ERROR: unable to extract media URL')
692                         return [None]
693                 mediaURL = mobj.group(1).replace('\\', '')
694
695                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
696                 if mobj is None:
697                         self.to_stderr(u'ERROR: unable to extract gdaKey')
698                         return [None]
699                 gdaKey = mobj.group(1)
700
701                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
702
703                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
704                 if mobj is None:
705                         self.to_stderr(u'ERROR: unable to extract title')
706                         return [None]
707                 video_title = mobj.group(1).decode('utf-8')
708
709                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
710                 if mobj is None:
711                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
712                         return [None]
713                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
714
715                 # Return information
716                 return [{
717                         'id':           video_id.decode('utf-8'),
718                         'url':          video_url.decode('utf-8'),
719                         'uploader':     video_uploader.decode('utf-8'),
720                         'title':        video_title,
721                         'stitle':       simple_title,
722                         'ext':          video_extension.decode('utf-8'),
723                         }]
724
725
726 class YoutubeSearchIE(InfoExtractor):
727         """Information Extractor for YouTube search queries."""
728         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
729         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
730         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
731         _MORE_PAGES_INDICATOR = r'>Next</a>'
732         _youtube_ie = None
733
734         def __init__(self, youtube_ie, downloader=None): 
735                 InfoExtractor.__init__(self, downloader)
736                 self._youtube_ie = youtube_ie
737         
738         @staticmethod
739         def suitable(url):
740                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
741
742         def report_download_page(self, query, pagenum):
743                 """Report attempt to download playlist page with given number."""
744                 self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
745
746         def _real_initialize(self):
747                 self._youtube_ie.initialize()
748         
749         def _real_extract(self, query):
750                 mobj = re.match(self._VALID_QUERY, query)
751                 if mobj is None:
752                         self.to_stderr(u'ERROR: invalid search query "%s"' % query)
753                         return [None]
754
755                 prefix, query = query.split(':')
756                 prefix = prefix[8:]
757                 if prefix == '': 
758                         return self._download_n_results(query, 1)
759                 elif prefix == 'all': 
760                         return self._download_n_results(query, -1)
761                 else: 
762                         try:
763                                 n = int(prefix)
764                                 if n <= 0:
765                                         self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
766                                         return [None]
767                                 return self._download_n_results(query, n)
768                         except ValueError: # parsing prefix as int fails
769                                 return self._download_n_results(query, 1)
770
771         def _download_n_results(self, query, n):
772                 """Downloads a specified number of results for a query"""
773
774                 video_ids = []
775                 already_seen = set()
776                 pagenum = 1
777
778                 while True:
779                         self.report_download_page(query, pagenum)
780                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
781                         request = urllib2.Request(result_url, None, std_headers)
782                         try:
783                                 page = urllib2.urlopen(request).read()
784                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
785                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
786                                 return [None]
787
788                         # Extract video identifiers
789                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
790                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
791                                 if video_id not in already_seen:
792                                         video_ids.append(video_id)
793                                         already_seen.add(video_id)
794                                         if len(video_ids) == n:
795                                                 # Specified n videos reached
796                                                 information = []
797                                                 for id in video_ids:
798                                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
799                                                 return information
800
801                         if self._MORE_PAGES_INDICATOR not in page:
802                                 information = []
803                                 for id in video_ids:
804                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
805                                 return information
806
807                         pagenum = pagenum + 1
808
809 class YoutubePlaylistIE(InfoExtractor):
810         """Information Extractor for YouTube playlists."""
811
812         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
813         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
814         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
815         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
816         _youtube_ie = None
817
818         def __init__(self, youtube_ie, downloader=None):
819                 InfoExtractor.__init__(self, downloader)
820                 self._youtube_ie = youtube_ie
821         
822         @staticmethod
823         def suitable(url):
824                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
825
826         def report_download_page(self, playlist_id, pagenum):
827                 """Report attempt to download playlist page with given number."""
828                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
829
830         def _real_initialize(self):
831                 self._youtube_ie.initialize()
832         
833         def _real_extract(self, url):
834                 # Extract playlist id
835                 mobj = re.match(self._VALID_URL, url)
836                 if mobj is None:
837                         self.to_stderr(u'ERROR: invalid url: %s' % url)
838                         return [None]
839
840                 # Download playlist pages
841                 playlist_id = mobj.group(1)
842                 video_ids = []
843                 pagenum = 1
844
845                 while True:
846                         self.report_download_page(playlist_id, pagenum)
847                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
848                         try:
849                                 page = urllib2.urlopen(request).read()
850                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
851                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
852                                 return [None]
853
854                         # Extract video identifiers
855                         ids_in_page = []
856                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
857                                 if mobj.group(1) not in ids_in_page:
858                                         ids_in_page.append(mobj.group(1))
859                         video_ids.extend(ids_in_page)
860
861                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
862                                 break
863                         pagenum = pagenum + 1
864
865                 information = []
866                 for id in video_ids:
867                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
868                 return information
869
870 class PostProcessor(object):
871         """Post Processor class.
872
873         PostProcessor objects can be added to downloaders with their
874         add_post_processor() method. When the downloader has finished a
875         successful download, it will take its internal chain of PostProcessors
876         and start calling the run() method on each one of them, first with
877         an initial argument and then with the returned value of the previous
878         PostProcessor.
879
880         The chain will be stopped if one of them ever returns None or the end
881         of the chain is reached.
882
883         PostProcessor objects follow a "mutual registration" process similar
884         to InfoExtractor objects.
885         """
886
887         _downloader = None
888
889         def __init__(self, downloader=None):
890                 self._downloader = downloader
891
892         def to_stdout(self, message):
893                 """Print message to stdout if downloader is not in quiet mode."""
894                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
895                         print message
896         
897         def to_stderr(self, message):
898                 """Print message to stderr."""
899                 print >>sys.stderr, message
900
901         def set_downloader(self, downloader):
902                 """Sets the downloader for this PP."""
903                 self._downloader = downloader
904         
905         def run(self, information):
906                 """Run the PostProcessor.
907
908                 The "information" argument is a dictionary like the ones
909                 returned by InfoExtractors. The only difference is that this
910                 one has an extra field called "filepath" that points to the
911                 downloaded file.
912
913                 When this method returns None, the postprocessing chain is
914                 stopped. However, this method may return an information
915                 dictionary that will be passed to the next postprocessing
916                 object in the chain. It can be the one it received after
917                 changing some fields.
918
919                 In addition, this method may raise a PostProcessingError
920                 exception that will be taken into account by the downloader
921                 it was called from.
922                 """
923                 return information # by default, do nothing
924         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# General configuration.
		# Fixed: previously two openers were installed back-to-back, so the
		# one carrying the explicit ProxyHandler was immediately discarded.
		# Build a single opener with both handlers instead.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
				usage='Usage: %prog [options] url...',
				version='2009.01.31',
				conflict_handler='resolve',
				)
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		parser.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		parser.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		parser.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		parser.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		parser.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		parser.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		parser.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		parser.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option('-f', '--format',
				dest='format', metavar='FMT', help='video format code')
		parser.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='alias for -f 18', const='18')
		parser.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		(opts, args) = parser.parse_args()

		# Batch file verification: URLs from the batch file come before
		# those on the command line.
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if len(all_urls) < 1:
			sys.exit(u'ERROR: you must provide at least one URL')
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			sys.exit(u'ERROR: account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			sys.exit(u'ERROR: using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			sys.exit(u'ERROR: using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				sys.exit(u'ERROR: invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)

		# File downloader. The output template is decoded with the locale
		# charset so non-ASCII -o arguments work; -g/-e imply quiet+simulate.
		charset = locale.getdefaultlocale()[1]
		if charset is None:
			charset = 'ascii'
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			})
		# More specific extractors first: search and playlist URLs must be
		# tried before the plain YouTube extractor.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')