youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # License: Public domain code
   6 import htmlentitydefs
   7 import httplib
   8 import locale
   9 import math
  10 import netrc
  11 import os
  12 import os.path
  13 import re
  14 import socket
  15 import string
  16 import sys
  17 import time
  18 import urllib
  19 import urllib2
  20
  21 std_headers = {
  22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
  23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  25         'Accept-Language': 'en-us,en;q=0.5',
  26 }
  27
  28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  29
  30 class DownloadError(Exception):
  31         """Download Error exception.
  32
  33         This exception may be thrown by FileDownloader objects if they are not
  34         configured to continue on errors. They will contain the appropriate
  35         error message.
  36         """
  37         pass
  38
  39 class SameFileError(Exception):
  40         """Same File exception.
  41
  42         This exception will be thrown by FileDownloader objects if they detect
  43         multiple files would have to be downloaded to the same file on disk.
  44         """
  45         pass
  46
  47 class PostProcessingError(Exception):
  48         """Post Processing exception.
  49
  50         This exception may be raised by PostProcessor's .run() method to
  51         indicate an error in the postprocessing task.
  52         """
  53         pass
  54
  55 class UnavailableFormatError(Exception):
  56         """Unavailable Format exception.
  57
  58         This exception will be thrown when a video is requested
  59         in a format that is not available for that video.
  60         """
  61         pass
  62
  63 class ContentTooShortError(Exception):
  64         """Content Too Short exception.
  65
  66         This exception may be raised by FileDownloader objects when a file they
  67         download is too small for what the server announced first, indicating
  68         the connection was probably interrupted.
  69         """
  70         # Both in bytes
  71         downloaded = None
  72         expected = None
  73
  74         def __init__(self, downloaded, expected):
  75                 self.downloaded = downloaded
  76                 self.expected = expected
  77
  78 class FileDownloader(object):
  79         """File Downloader class.
  80
  81         File downloader objects are the ones responsible of downloading the
  82         actual video file and writing it to disk if the user has requested
  83         it, among some other tasks. In most cases there should be one per
  84         program. As, given a video URL, the downloader doesn't know how to
  85         extract all the needed information, task that InfoExtractors do, it
  86         has to pass the URL to one of them.
  87
  88         For this, file downloader objects have a method that allows
  89         InfoExtractors to be registered in a given order. When it is passed
  90         a URL, the file downloader handles it to the first InfoExtractor it
  91         finds that reports being able to handle it. The InfoExtractor extracts
  92         all the information about the video or videos the URL refers to, and
  93         asks the FileDownloader to process the video information, possibly
  94         downloading the video.
  95
  96         File downloaders accept a lot of parameters. In order not to saturate
  97         the object constructor with arguments, it receives a dictionary of
  98         options instead. These options are available through the params
  99         attribute for the InfoExtractors to use. The FileDownloader also
 100         registers itself as the downloader in charge for the InfoExtractors
 101         that are added to it, so this is a "mutual registration".
 102
 103         Available options:
 104
 105         username:       Username for authentication purposes.
 106         password:       Password for authentication purposes.
 107         usenetrc:       Use netrc for authentication instead.
 108         quiet:          Do not print messages to stdout.
 109         forceurl:       Force printing final URL.
 110         forcetitle:     Force printing title.
 111         simulate:       Do not download the video files.
 112         format:         Video format code.
 113         outtmpl:        Template for output names.
 114         ignoreerrors:   Do not stop on download errors.
 115         ratelimit:      Download speed limit, in bytes/sec.
 116         nooverwrites:   Prevent overwriting files.
 117         """
 118
 119         params = None
 120         _ies = []
 121         _pps = []
 122         _download_retcode = None
 123
 124         def __init__(self, params):
 125                 """Create a FileDownloader object with the given options."""
 126                 self._ies = []
 127                 self._pps = []
 128                 self._download_retcode = 0
 129                 self.params = params
 130
 131         @staticmethod
 132         def pmkdir(filename):
 133                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 134                 components = filename.split(os.sep)
 135                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 136                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 137                 for dir in aggregate:
 138                         if not os.path.exists(dir):
 139                                 os.mkdir(dir)
 140
 141         @staticmethod
 142         def format_bytes(bytes):
 143                 if bytes is None:
 144                         return 'N/A'
 145                 if bytes == 0:
 146                         exponent = 0
 147                 else:
 148                         exponent = long(math.log(float(bytes), 1024.0))
 149                 suffix = 'bkMGTPEZY'[exponent]
 150                 converted = float(bytes) / float(1024**exponent)
 151                 return '%.2f%s' % (converted, suffix)
 152
 153         @staticmethod
 154         def calc_percent(byte_counter, data_len):
 155                 if data_len is None:
 156                         return '---.-%'
 157                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 158
 159         @staticmethod
 160         def calc_eta(start, now, total, current):
 161                 if total is None:
 162                         return '--:--'
 163                 dif = now - start
 164                 if current == 0 or dif < 0.001: # One millisecond
 165                         return '--:--'
 166                 rate = float(current) / dif
 167                 eta = long((float(total) - float(current)) / rate)
 168                 (eta_mins, eta_secs) = divmod(eta, 60)
 169                 if eta_mins > 99:
 170                         return '--:--'
 171                 return '%02d:%02d' % (eta_mins, eta_secs)
 172
 173         @staticmethod
 174         def calc_speed(start, now, bytes):
 175                 dif = now - start
 176                 if bytes == 0 or dif < 0.001: # One millisecond
 177                         return '%10s' % '---b/s'
 178                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 179
 180         @staticmethod
 181         def best_block_size(elapsed_time, bytes):
 182                 new_min = max(bytes / 2.0, 1.0)
 183                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 184                 if elapsed_time < 0.001:
 185                         return int(new_max)
 186                 rate = bytes / elapsed_time
 187                 if rate > new_max:
 188                         return int(new_max)
 189                 if rate < new_min:
 190                         return int(new_min)
 191                 return int(rate)
 192
 193         @staticmethod
 194         def parse_bytes(bytestr):
 195                 """Parse a string indicating a byte quantity into a long integer."""
 196                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 197                 if matchobj is None:
 198                         return None
 199                 number = float(matchobj.group(1))
 200                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 201                 return long(round(number * multiplier))
 202
 203         @staticmethod
 204         def verify_url(url):
 205                 """Verify a URL is valid and data could be downloaded."""
 206                 request = urllib2.Request(url, None, std_headers)
 207                 data = urllib2.urlopen(request)
 208                 data.read(1)
 209                 data.close()
 210
 211         def add_info_extractor(self, ie):
 212                 """Add an InfoExtractor object to the end of the list."""
 213                 self._ies.append(ie)
 214                 ie.set_downloader(self)
 215
 216         def add_post_processor(self, pp):
 217                 """Add a PostProcessor object to the end of the chain."""
 218                 self._pps.append(pp)
 219                 pp.set_downloader(self)
 220
 221         def to_stdout(self, message, skip_eol=False):
 222                 """Print message to stdout if not in quiet mode."""
 223                 if not self.params.get('quiet', False):
 224                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
 225                         sys.stdout.flush()
 226
 227         def to_stderr(self, message):
 228                 """Print message to stderr."""
 229                 print >>sys.stderr, message
 230
 231         def fixed_template(self):
 232                 """Checks if the output template is fixed."""
 233                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 234
 235         def trouble(self, message=None):
 236                 """Determine action to take when a download problem appears.
 237
 238                 Depending on if the downloader has been configured to ignore
 239                 download errors or not, this method may throw an exception or
 240                 not when errors are found, after printing the message.
 241                 """
 242                 if message is not None:
 243                         self.to_stderr(message)
 244                 if not self.params.get('ignoreerrors', False):
 245                         raise DownloadError(message)
 246                 self._download_retcode = 1
 247
 248         def slow_down(self, start_time, byte_counter):
 249                 """Sleep if the download speed is over the rate limit."""
 250                 rate_limit = self.params.get('ratelimit', None)
 251                 if rate_limit is None or byte_counter == 0:
 252                         return
 253                 now = time.time()
 254                 elapsed = now - start_time
 255                 if elapsed <= 0.0:
 256                         return
 257                 speed = float(byte_counter) / elapsed
 258                 if speed > rate_limit:
 259                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 260
 261         def report_destination(self, filename):
 262                 """Report destination filename."""
 263                 self.to_stdout(u'[download] Destination: %s' % filename)
 264
 265         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 266                 """Report download progress."""
 267                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 268                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 269
 270         def report_finish(self):
 271                 """Report download finished."""
 272                 self.to_stdout(u'')
 273
 274         def process_info(self, info_dict):
 275                 """Process a single dictionary returned by an InfoExtractor."""
 276                 # Do nothing else if in simulate mode
 277                 if self.params.get('simulate', False):
 278                         try:
 279                                 self.verify_url(info_dict['url'])
 280                         except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
 281                                 raise UnavailableFormatError
 282
 283                         # Forced printings
 284                         if self.params.get('forcetitle', False):
 285                                 print info_dict['title'].encode(locale.getpreferredencoding())
 286                         if self.params.get('forceurl', False):
 287                                 print info_dict['url'].encode(locale.getpreferredencoding())
 288
 289                         return
 290
 291                 try:
 292                         template_dict = dict(info_dict)
 293                         template_dict['epoch'] = unicode(long(time.time()))
 294                         filename = self.params['outtmpl'] % template_dict
 295                         self.report_destination(filename)
 296                 except (ValueError, KeyError), err:
 297                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 298                 if self.params['nooverwrites'] and os.path.exists(filename):
 299                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
 300                         return
 301
 302                 try:
 303                         self.pmkdir(filename)
 304                 except (OSError, IOError), err:
 305                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 306                         return
 307
 308                 try:
 309                         outstream = open(filename, 'wb')
 310                 except (OSError, IOError), err:
 311                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 312                         return
 313
 314                 try:
 315                         self._do_download(outstream, info_dict['url'])
 316                         outstream.close()
 317                 except (OSError, IOError), err:
 318                         outstream.close()
 319                         os.remove(filename)
 320                         raise UnavailableFormatError
 321                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 322                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 323                         return
 324                 except (ContentTooShortError, ), err:
 325                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 326                         return
 327
 328                 try:
 329                         self.post_process(filename, info_dict)
 330                 except (PostProcessingError), err:
 331                         self.trouble('ERROR: postprocessing: %s' % str(err))
 332                         return
 333
 334         def download(self, url_list):
 335                 """Download a given list of URLs."""
 336                 if len(url_list) > 1 and self.fixed_template():
 337                         raise SameFileError(self.params['outtmpl'])
 338
 339                 for url in url_list:
 340                         suitable_found = False
 341                         for ie in self._ies:
 342                                 # Go to next InfoExtractor if not suitable
 343                                 if not ie.suitable(url):
 344                                         continue
 345
 346                                 # Suitable InfoExtractor found
 347                                 suitable_found = True
 348
 349                                 # Extract information from URL and process it
 350                                 ie.extract(url)
 351
 352                                 # Suitable InfoExtractor had been found; go to next URL
 353                                 break
 354
 355                         if not suitable_found:
 356                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 357
 358                 return self._download_retcode
 359
 360         def post_process(self, filename, ie_info):
 361                 """Run the postprocessing chain on the given file."""
 362                 info = dict(ie_info)
 363                 info['filepath'] = filename
 364                 for pp in self._pps:
 365                         info = pp.run(info)
 366                         if info is None:
 367                                 break
 368
 369         def _do_download(self, stream, url):
 370                 request = urllib2.Request(url, None, std_headers)
 371                 data = urllib2.urlopen(request)
 372                 data_len = data.info().get('Content-length', None)
 373                 data_len_str = self.format_bytes(data_len)
 374                 byte_counter = 0
 375                 block_size = 1024
 376                 start = time.time()
 377                 while True:
 378                         # Progress message
 379                         percent_str = self.calc_percent(byte_counter, data_len)
 380                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 381                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 382                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 383
 384                         # Download and write
 385                         before = time.time()
 386                         data_block = data.read(block_size)
 387                         after = time.time()
 388                         data_block_len = len(data_block)
 389                         if data_block_len == 0:
 390                                 break
 391                         byte_counter += data_block_len
 392                         stream.write(data_block)
 393                         block_size = self.best_block_size(after - before, data_block_len)
 394
 395                         # Apply rate limit
 396                         self.slow_down(start, byte_counter)
 397
 398                 self.report_finish()
 399                 if data_len is not None and str(byte_counter) != data_len:
 400                         raise ContentTooShortError(byte_counter, long(data_len))
 401
 402 class InfoExtractor(object):
 403         """Information Extractor class.
 404
 405         Information extractors are the classes that, given a URL, extract
 406         information from the video (or videos) the URL refers to. This
 407         information includes the real video URL, the video title and simplified
 408         title, author and others. The information is stored in a dictionary
 409         which is then passed to the FileDownloader. The FileDownloader
 410         processes this information possibly downloading the video to the file
 411         system, among other possible outcomes. The dictionaries must include
 412         the following fields:
 413
 414         id:             Video identifier.
 415         url:            Final video URL.
 416         uploader:       Nickname of the video uploader.
 417         title:          Literal title.
 418         stitle:         Simplified title.
 419         ext:            Video filename extension.
 420
 421         Subclasses of this one should re-define the _real_initialize() and
 422         _real_extract() methods, as well as the suitable() static method.
 423         Probably, they should also be instantiated and added to the main
 424         downloader.
 425         """
 426
 427         _ready = False
 428         _downloader = None
 429
 430         def __init__(self, downloader=None):
 431                 """Constructor. Receives an optional downloader."""
 432                 self._ready = False
 433                 self.set_downloader(downloader)
 434
 435         @staticmethod
 436         def suitable(url):
 437                 """Receives a URL and returns True if suitable for this IE."""
 438                 return False
 439
 440         def initialize(self):
 441                 """Initializes an instance (authentication, etc)."""
 442                 if not self._ready:
 443                         self._real_initialize()
 444                         self._ready = True
 445
 446         def extract(self, url):
 447                 """Extracts URL information and returns it in list of dicts."""
 448                 self.initialize()
 449                 return self._real_extract(url)
 450
 451         def set_downloader(self, downloader):
 452                 """Sets the downloader for this IE."""
 453                 self._downloader = downloader
 454
 455         def _real_initialize(self):
 456                 """Real initialization process. Redefine in subclasses."""
 457                 pass
 458
 459         def _real_extract(self, url):
 460                 """Real extraction process. Redefine in subclasses."""
 461                 pass
 462
 463 class YoutubeIE(InfoExtractor):
 464         """Information extractor for youtube.com."""
 465
 466         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 467         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 468         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 469         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 470         _NETRC_MACHINE = 'youtube'
 471         _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
 472         _video_extensions = {
 473                 '13': '3gp',
 474                 '17': 'mp4',
 475                 '18': 'mp4',
 476                 '22': 'mp4',
 477         }
 478
 479         @staticmethod
 480         def suitable(url):
 481                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 482
 483         @staticmethod
 484         def htmlentity_transform(matchobj):
 485                 """Transforms an HTML entity to a Unicode character."""
 486                 entity = matchobj.group(1)
 487
 488                 # Known non-numeric HTML entity
 489                 if entity in htmlentitydefs.name2codepoint:
 490                         return unichr(htmlentitydefs.name2codepoint[entity])
 491
 492                 # Unicode character
 493                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
 494                 if mobj is not None:
 495                         numstr = mobj.group(1)
 496                         if numstr.startswith(u'x'):
 497                                 base = 16
 498                                 numstr = u'0%s' % numstr
 499                         else:
 500                                 base = 10
 501                         return unichr(long(numstr, base))
 502
 503                 # Unknown entity in name, return its literal representation
 504                 return (u'&%s;' % entity)
 505
 506         def report_lang(self):
 507                 """Report attempt to set language."""
 508                 self._downloader.to_stdout(u'[youtube] Setting language')
 509
 510         def report_login(self):
 511                 """Report attempt to log in."""
 512                 self._downloader.to_stdout(u'[youtube] Logging in')
 513
 514         def report_age_confirmation(self):
 515                 """Report attempt to confirm age."""
 516                 self._downloader.to_stdout(u'[youtube] Confirming age')
 517
 518         def report_webpage_download(self, video_id):
 519                 """Report attempt to download webpage."""
 520                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 521
 522         def report_information_extraction(self, video_id):
 523                 """Report attempt to extract video information."""
 524                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 525
 526         def report_video_url(self, video_id, video_real_url):
 527                 """Report extracted video URL."""
 528                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 529
 530         def report_unavailable_format(self, video_id, format):
 531                 """Report extracted video URL."""
 532                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 533
 534         def _real_initialize(self):
 535                 if self._downloader is None:
 536                         return
 537
 538                 username = None
 539                 password = None
 540                 downloader_params = self._downloader.params
 541
 542                 # Attempt to use provided username and password or .netrc data
 543                 if downloader_params.get('username', None) is not None:
 544                         username = downloader_params['username']
 545                         password = downloader_params['password']
 546                 elif downloader_params.get('usenetrc', False):
 547                         try:
 548                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 549                                 if info is not None:
 550                                         username = info[0]
 551                                         password = info[2]
 552                                 else:
 553                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 554                         except (IOError, netrc.NetrcParseError), err:
 555                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 556                                 return
 557
 558                 # Set language
 559                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 560                 try:
 561                         self.report_lang()
 562                         urllib2.urlopen(request).read()
 563                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 564                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 565                         return
 566
 567                 # No authentication to be performed
 568                 if username is None:
 569                         return
 570
 571                 # Log in
 572                 login_form = {
 573                                 'current_form': 'loginForm',
 574                                 'next':         '/',
 575                                 'action_login': 'Log In',
 576                                 'username':     username,
 577                                 'password':     password,
 578                                 }
 579                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 580                 try:
 581                         self.report_login()
 582                         login_results = urllib2.urlopen(request).read()
 583                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 584                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 585                                 return
 586                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 587                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 588                         return
 589
 590                 # Confirm age
 591                 age_form = {
 592                                 'next_url':             '/',
 593                                 'action_confirm':       'Confirm',
 594                                 }
 595                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 596                 try:
 597                         self.report_age_confirmation()
 598                         age_results = urllib2.urlopen(request).read()
 599                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 600                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 601                         return
 602
 603         def _real_extract(self, url):
 604                 # Extract video id from URL
 605                 mobj = re.match(self._VALID_URL, url)
 606                 if mobj is None:
 607                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 608                         return
 609                 video_id = mobj.group(2)
 610
 611                 # Downloader parameters
 612                 best_quality = False
 613                 format_param = None
 614                 quality_index = 0
 615                 if self._downloader is not None:
 616                         params = self._downloader.params
 617                         format_param = params.get('format', None)
 618                         if format_param == '0':
 619                                 format_param = self._available_formats[quality_index]
 620                                 best_quality = True
 621
 622                 while True:
 623                         # Extension
 624                         video_extension = self._video_extensions.get(format_param, 'flv')
 625
 626                         # Normalize URL, including format
 627                         normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
 628                         if format_param is not None:
 629                                 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 630                         request = urllib2.Request(normalized_url, None, std_headers)
 631                         try:
 632                                 self.report_webpage_download(video_id)
 633                                 video_webpage = urllib2.urlopen(request).read()
 634                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 635                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 636                                 return
 637                         self.report_information_extraction(video_id)
 638
 639                         # "t" param
 640                         mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 641                         if mobj is None:
 642                                 self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
 643                                 return
 644                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
 645                         if format_param is not None:
 646                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 647                         self.report_video_url(video_id, video_real_url)
 648
 649                         # uploader
 650                         mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
 651                         if mobj is None:
 652                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 653                                 return
 654                         video_uploader = mobj.group(1)
 655
 656                         # title
 657                         mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 658                         if mobj is None:
 659                                 self._downloader.trouble(u'ERROR: unable to extract video title')
 660                                 return
 661                         video_title = mobj.group(1).decode('utf-8')
 662                         video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
 663                         video_title = video_title.replace(os.sep, u'%')
 664
 665                         # simplified title
 666                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 667                         simple_title = simple_title.strip(ur'_')
 668
 669                         try:
 670                                 # Process video information
 671                                 self._downloader.process_info({
 672                                         'id':           video_id.decode('utf-8'),
 673                                         'url':          video_real_url.decode('utf-8'),
 674                                         'uploader':     video_uploader.decode('utf-8'),
 675                                         'title':        video_title,
 676                                         'stitle':       simple_title,
 677                                         'ext':          video_extension.decode('utf-8'),
 678                                 })
 679
 680                                 return
 681
 682                         except UnavailableFormatError, err:
 683                                 if best_quality:
 684                                         if quality_index == len(self._available_formats) - 1:
 685                                                 # I don't ever expect this to happen
 686                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 687                                                 return
 688                                         else:
 689                                                 self.report_unavailable_format(video_id, format_param)
 690                                                 quality_index += 1
 691                                                 format_param = self._available_formats[quality_index]
 692                                                 continue
 693                                 else:
 694                                         self._downloader.trouble('ERROR: format not available for video')
 695                                         return
 696
 697
 698 class MetacafeIE(InfoExtractor):
 699         """Information Extractor for metacafe.com."""
 700
 701         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 702         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 703         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 704         _youtube_ie = None
 705
 706         def __init__(self, youtube_ie, downloader=None):
 707                 InfoExtractor.__init__(self, downloader)
 708                 self._youtube_ie = youtube_ie
 709
 710         @staticmethod
 711         def suitable(url):
 712                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 713
 714         def report_disclaimer(self):
 715                 """Report disclaimer retrieval."""
 716                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 717
 718         def report_age_confirmation(self):
 719                 """Report attempt to confirm age."""
 720                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 721
 722         def report_download_webpage(self, video_id):
 723                 """Report webpage download."""
 724                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 725
 726         def report_extraction(self, video_id):
 727                 """Report information extraction."""
 728                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 729
 730         def _real_initialize(self):
 731                 # Retrieve disclaimer
 732                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 733                 try:
 734                         self.report_disclaimer()
 735                         disclaimer = urllib2.urlopen(request).read()
 736                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 737                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 738                         return
 739
 740                 # Confirm age
 741                 disclaimer_form = {
 742                         'filters': '0',
 743                         'submit': "Continue - I'm over 18",
 744                         }
 745                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 746                 try:
 747                         self.report_age_confirmation()
 748                         disclaimer = urllib2.urlopen(request).read()
 749                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 750                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 751                         return
 752
 753         def _real_extract(self, url):
 754                 # Extract id and simplified title from URL
 755                 mobj = re.match(self._VALID_URL, url)
 756                 if mobj is None:
 757                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 758                         return
 759
 760                 video_id = mobj.group(1)
 761
 762                 # Check if video comes from YouTube
 763                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 764                 if mobj2 is not None:
 765                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 766                         return
 767
 768                 simple_title = mobj.group(2).decode('utf-8')
 769                 video_extension = 'flv'
 770
 771                 # Retrieve video webpage to extract further information
 772                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 773                 try:
 774                         self.report_download_webpage(video_id)
 775                         webpage = urllib2.urlopen(request).read()
 776                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 777                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 778                         return
 779
 780                 # Extract URL, uploader and title from webpage
 781                 self.report_extraction(video_id)
 782                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
 783                 if mobj is None:
 784                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 785                         return
 786                 mediaURL = urllib.unquote(mobj.group(1))
 787
 788                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 789                 if mobj is None:
 790                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
 791                         return
 792                 gdaKey = mobj.group(1)
 793
 794                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 795
 796                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 797                 if mobj is None:
 798                         self._downloader.trouble(u'ERROR: unable to extract title')
 799                         return
 800                 video_title = mobj.group(1).decode('utf-8')
 801
 802                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
 803                 if mobj is None:
 804                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 805                         return
 806                 video_uploader = mobj.group(1)
 807
 808                 try:
 809                         # Process video information
 810                         self._downloader.process_info({
 811                                 'id':           video_id.decode('utf-8'),
 812                                 'url':          video_url.decode('utf-8'),
 813                                 'uploader':     video_uploader.decode('utf-8'),
 814                                 'title':        video_title,
 815                                 'stitle':       simple_title,
 816                                 'ext':          video_extension.decode('utf-8'),
 817                         })
 818                 except UnavailableFormatError:
 819                         self._downloader.trouble(u'ERROR: format not available for video')
 820
 821
 822 class YoutubeSearchIE(InfoExtractor):
 823         """Information Extractor for YouTube search queries."""
 824         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 825         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 826         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 827         _MORE_PAGES_INDICATOR = r'>Next</a>'
 828         _youtube_ie = None
 829         _max_youtube_results = 1000
 830
 831         def __init__(self, youtube_ie, downloader=None):
 832                 InfoExtractor.__init__(self, downloader)
 833                 self._youtube_ie = youtube_ie
 834
 835         @staticmethod
 836         def suitable(url):
 837                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 838
 839         def report_download_page(self, query, pagenum):
 840                 """Report attempt to download playlist page with given number."""
 841                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 842
 843         def _real_initialize(self):
 844                 self._youtube_ie.initialize()
 845
 846         def _real_extract(self, query):
 847                 mobj = re.match(self._VALID_QUERY, query)
 848                 if mobj is None:
 849                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 850                         return
 851
 852                 prefix, query = query.split(':')
 853                 prefix = prefix[8:]
 854                 if prefix == '':
 855                         self._download_n_results(query, 1)
 856                         return
 857                 elif prefix == 'all':
 858                         self._download_n_results(query, self._max_youtube_results)
 859                         return
 860                 else:
 861                         try:
 862                                 n = int(prefix)
 863                                 if n <= 0:
 864                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 865                                         return
 866                                 elif n > self._max_youtube_results:
 867                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
 868                                         n = self._max_youtube_results
 869                                 self._download_n_results(query, n)
 870                                 return
 871                         except ValueError: # parsing prefix as int fails
 872                                 self._download_n_results(query, 1)
 873                                 return
 874
 875         def _download_n_results(self, query, n):
 876                 """Downloads a specified number of results for a query"""
 877
 878                 video_ids = []
 879                 already_seen = set()
 880                 pagenum = 1
 881
 882                 while True:
 883                         self.report_download_page(query, pagenum)
 884                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 885                         request = urllib2.Request(result_url, None, std_headers)
 886                         try:
 887                                 page = urllib2.urlopen(request).read()
 888                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 889                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 890                                 return
 891
 892                         # Extract video identifiers
 893                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 894                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 895                                 if video_id not in already_seen:
 896                                         video_ids.append(video_id)
 897                                         already_seen.add(video_id)
 898                                         if len(video_ids) == n:
 899                                                 # Specified n videos reached
 900                                                 for id in video_ids:
 901                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 902                                                 return
 903
 904                         if self._MORE_PAGES_INDICATOR not in page:
 905                                 for id in video_ids:
 906                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 907                                 return
 908
 909                         pagenum = pagenum + 1
 910
 911 class YoutubePlaylistIE(InfoExtractor):
 912         """Information Extractor for YouTube playlists."""
 913
 914         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 915         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
 916         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 917         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 918         _youtube_ie = None
 919
 920         def __init__(self, youtube_ie, downloader=None):
 921                 InfoExtractor.__init__(self, downloader)
 922                 self._youtube_ie = youtube_ie
 923
 924         @staticmethod
 925         def suitable(url):
 926                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 927
 928         def report_download_page(self, playlist_id, pagenum):
 929                 """Report attempt to download playlist page with given number."""
 930                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 931
 932         def _real_initialize(self):
 933                 self._youtube_ie.initialize()
 934
 935         def _real_extract(self, url):
 936                 # Extract playlist id
 937                 mobj = re.match(self._VALID_URL, url)
 938                 if mobj is None:
 939                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
 940                         return
 941
 942                 # Download playlist pages
 943                 playlist_id = mobj.group(1)
 944                 video_ids = []
 945                 pagenum = 1
 946
 947                 while True:
 948                         self.report_download_page(playlist_id, pagenum)
 949                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 950                         try:
 951                                 page = urllib2.urlopen(request).read()
 952                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 953                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 954                                 return
 955
 956                         # Extract video identifiers
 957                         ids_in_page = []
 958                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 959                                 if mobj.group(1) not in ids_in_page:
 960                                         ids_in_page.append(mobj.group(1))
 961                         video_ids.extend(ids_in_page)
 962
 963                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 964                                 break
 965                         pagenum = pagenum + 1
 966
 967                 for id in video_ids:
 968                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 969                 return
 970
 971 class PostProcessor(object):
 972         """Post Processor class.
 973
 974         PostProcessor objects can be added to downloaders with their
 975         add_post_processor() method. When the downloader has finished a
 976         successful download, it will take its internal chain of PostProcessors
 977         and start calling the run() method on each one of them, first with
 978         an initial argument and then with the returned value of the previous
 979         PostProcessor.
 980
 981         The chain will be stopped if one of them ever returns None or the end
 982         of the chain is reached.
 983
 984         PostProcessor objects follow a "mutual registration" process similar
 985         to InfoExtractor objects.
 986         """
 987
 988         _downloader = None
 989
 990         def __init__(self, downloader=None):
 991                 self._downloader = downloader
 992
 993         def set_downloader(self, downloader):
 994                 """Sets the downloader for this PP."""
 995                 self._downloader = downloader
 996
 997         def run(self, information):
 998                 """Run the PostProcessor.
 999
1000                 The "information" argument is a dictionary like the ones
1001                 composed by InfoExtractors. The only difference is that this
1002                 one has an extra field called "filepath" that points to the
1003                 downloaded file.
1004
1005                 When this method returns None, the postprocessing chain is
1006                 stopped. However, this method may return an information
1007                 dictionary that will be passed to the next postprocessing
1008                 object in the chain. It can be the one it received after
1009                 changing some fields.
1010
1011                 In addition, this method may raise a PostProcessingError
1012                 exception that will be taken into account by the downloader
1013                 it was called from.
1014                 """
1015                 return information # by default, do nothing
1016
1017 ### MAIN PROGRAM ###
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # General configuration
                # Only one opener can be installed at a time, so both handlers
                # are given to the same one.
                opener = urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor())
                urllib2.install_opener(opener)
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                # conflict_handler='resolve' allows redefining -h and -v below
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='INTERNAL',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                # All the format options write a plain string to the same
                # destination. -f must use "store": "append" would produce a
                # list and would fail when combined with -b, -m or -d.
                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='store', dest='format', metavar='FMT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = [x.strip() for x in batchfd.readlines()]
                                finally:
                                        # Do not leave the file open
                                        batchfd.close()
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                        # Blank lines are not URLs
                        batchurls = [x for x in batchurls if len(x) > 0]
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if len(all_urls) < 1:
                        parser.error(u'you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        # Ask for the password when only the username was given
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        # A None result is treated as an invalid limit
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)

                # File downloader
                # The output template is the first available of: the -o value,
                # the simplified title (-t), the literal title (-l) and, if
                # none of them was given, just the video id.
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')