2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
# Standard HTTP headers sent with every request. A desktop-browser User-Agent
# avoids being served mobile or degraded pages; the rest mimic Firefox 3.
# NOTE(review): the dict assignment wrapper was missing around these entries
# in the reviewed revision; std_headers is referenced by every request below.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed to survive in "simplified" titles; everything else is
# collapsed to '_' (see YoutubeIE._real_extract).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor returns
	all the information to the FileDownloader and the latter downloads the
	file or does whatever it's instructed to do.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username: Username for authentication purposes.
	password: Password for authentication purposes.
	usenetrc: Use netrc for authentication instead.
	quiet: Do not print messages to stdout.
	forceurl: Force printing final URL.
	forcetitle: Force printing title.
	simulate: Do not download the video files.
	format: Video format code.
	outtmpl: Template for output names.
	ignoreerrors: Do not stop on download errors.
	ratelimit: Download speed limit, in bytes/sec.
	nooverwrites: Prevent overwriting files.
	"""

	# Process exit code: reset to 0 by __init__, set to 1 by trouble() when an
	# error is being ignored; returned by download().
	_download_retcode = None
100 def __init__(self, params):
101 """Create a FileDownloader object with the given options."""
104 self._download_retcode = 0
108 def pmkdir(filename):
109 """Create directory components in filename. Similar to Unix "mkdir -p"."""
110 components = filename.split(os.sep)
111 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
112 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
113 for dir in aggregate:
114 if not os.path.exists(dir):
118 def format_bytes(bytes):
124 exponent = long(math.log(float(bytes), 1024.0))
125 suffix = 'bkMGTPEZY'[exponent]
126 converted = float(bytes) / float(1024**exponent)
127 return '%.2f%s' % (converted, suffix)
130 def calc_percent(byte_counter, data_len):
133 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
136 def calc_eta(start, now, total, current):
140 if current == 0 or dif < 0.001: # One millisecond
142 rate = float(current) / dif
143 eta = long((float(total) - float(current)) / rate)
144 (eta_mins, eta_secs) = divmod(eta, 60)
147 return '%02d:%02d' % (eta_mins, eta_secs)
150 def calc_speed(start, now, bytes):
152 if bytes == 0 or dif < 0.001: # One millisecond
153 return '%10s' % '---b/s'
154 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
157 def best_block_size(elapsed_time, bytes):
158 new_min = max(bytes / 2.0, 1.0)
159 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
160 if elapsed_time < 0.001:
162 rate = bytes / elapsed_time
170 def parse_bytes(bytestr):
171 """Parse a string indicating a byte quantity into a long integer."""
172 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
175 number = float(matchobj.group(1))
176 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
177 return long(round(number * multiplier))
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		# NOTE(review): only the back-reference is set here -- the append onto
		# the internal extractor list appears to be missing. Confirm.
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		# NOTE(review): as above, the chain append appears to be missing.
		pp.set_downloader(self)
	def to_stdout(self, message, skip_eol=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol=True leaves the cursor on the same line, which lets
		report_progress() rewrite it with '\r'.
		"""
		if not self.params.get('quiet', False):
			# Trailing comma suppresses print's newline; the indexing picks
			# u'' when skip_eol is truthy, u'\n' otherwise.
			print u'%s%s' % (message, [u'\n', u''][skip_eol]),

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message
	def fixed_template(self):
		"""Checks if the output template is fixed.

		A template is "fixed" when it contains no %(...)s substitutions, so
		every download would land in the same file (see SameFileError).
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Error ignored: remember it so download() returns a non-zero code.
		self._download_retcode = 1
216 def slow_down(self, start_time, byte_counter):
217 """Sleep if the download speed is over the rate limit."""
218 rate_limit = self.params.get('ratelimit', None)
219 if rate_limit is None or byte_counter == 0:
222 elapsed = now - start_time
225 speed = float(byte_counter) / elapsed
226 if speed > rate_limit:
227 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		# '\r' + skip_eol rewrites the current terminal line in place.
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_finish(self):
		"""Report download finished."""
		# NOTE(review): body appears truncated in this revision -- presumably
		# it terminates the progress line. Confirm against history.
242 def process_info(self, info_dict):
243 """Process a single dictionary returned by an InfoExtractor."""
245 if self.params.get('forcetitle', False):
246 print info_dict['title']
247 if self.params.get('forceurl', False):
248 print info_dict['url']
250 # Do nothing else if in simulate mode
251 if self.params.get('simulate', False):
255 filename = self.params['outtmpl'] % info_dict
256 self.report_destination(filename)
257 except (ValueError, KeyError), err:
258 self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
259 if self.params['nooverwrites'] and os.path.exists(filename):
260 self.to_stderr('WARNING: file exists: %s; skipping' % filename)
263 self.pmkdir(filename)
264 except (OSError, IOError), err:
265 self.trouble('ERROR: unable to create directories: %s' % str(err))
268 outstream = open(filename, 'wb')
269 except (OSError, IOError), err:
270 self.trouble('ERROR: unable to open for writing: %s' % str(err))
273 self._do_download(outstream, info_dict['url'])
275 except (OSError, IOError), err:
276 self.trouble('ERROR: unable to write video data: %s' % str(err))
278 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
279 self.trouble('ERROR: unable to download video data: %s' % str(err))
282 self.post_process(filename, info_dict)
283 except (PostProcessingError), err:
284 self.trouble('ERROR: postprocessing: %s' % str(err))
289 def download(self, url_list):
290 """Download a given list of URLs."""
291 if len(url_list) > 1 and self.fixed_template():
292 raise SameFileError(self.params['outtmpl'])
295 suitable_found = False
297 # Go to next InfoExtractor if not suitable
298 if not ie.suitable(url):
301 # Suitable InfoExtractor found
302 suitable_found = True
304 # Extract information from URL
305 all_results = ie.extract(url)
306 results = [x for x in all_results if x is not None]
308 # See if there were problems extracting any information
309 if len(results) != len(all_results):
312 # Two results could go to the same file
313 if len(results) > 1 and self.fixed_template():
314 raise SameFileError(self.params['outtmpl'])
316 # Process each result
317 for result in results:
318 self.process_info(result)
320 # Suitable InfoExtractor had been found; go to next URL
323 if not suitable_found:
324 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
326 return self._download_retcode
	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		# NOTE(review): 'info' is never initialized in this revision --
		# presumably a copy of ie_info -- and the loop feeding each
		# PostProcessor.run() appears to be missing. Confirm against history.
		info['filepath'] = filename
337 def _do_download(self, stream, url):
338 request = urllib2.Request(url, None, std_headers)
339 data = urllib2.urlopen(request)
340 data_len = data.info().get('Content-length', None)
341 data_len_str = self.format_bytes(data_len)
347 percent_str = self.calc_percent(byte_counter, data_len)
348 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
349 speed_str = self.calc_speed(start, time.time(), byte_counter)
350 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
354 data_block = data.read(block_size)
356 data_block_len = len(data_block)
357 if data_block_len == 0:
359 byte_counter += data_block_len
360 stream.write(data_block)
361 block_size = self.best_block_size(after - before, data_block_len)
364 self.slow_down(start, byte_counter)
367 if data_len is not None and str(byte_counter) != data_len:
368 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. It is returned in a list of dictionaries when
	calling its extract() method. It is a list because a URL can refer to
	more than one video (think of playlists). The dictionaries must include
	the following fields:

	id: Video identifier.
	url: Final video URL.
	uploader: Nickname of the video uploader.
	title: Literal title.
	stitle: Simplified title.
	ext: Video filename extension.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		# NOTE(review): a readiness flag guarding initialize() is presumably
		# set here in the full source -- confirm against history.
		self.set_downloader(downloader)

	# NOTE(review): the 'def suitable(url):' header belonging to the following
	# docstring appears to be missing from this revision.
	"""Receives a URL and returns True if suitable for this IE."""

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		self._real_initialize()

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches bare video ids as well as /v/ and watch?v= URLs; group 2 is the
	# video id (the (?(1)...) conditional only applies when a URL prefix matched).
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forces the English interface so the scraping regexes stay stable.
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc when 'usenetrc' is set.
	_NETRC_MACHINE = 'youtube'
441 return (re.match(YoutubeIE._VALID_URL, url) is not None)
444 def htmlentity_transform(matchobj):
445 """Transforms an HTML entity to a Unicode character."""
446 entity = matchobj.group(1)
448 # Known non-numeric HTML entity
449 if entity in htmlentitydefs.name2codepoint:
450 return unichr(htmlentitydefs.name2codepoint[entity])
453 mobj = re.match(ur'(?u)#(x?\d+)', entity)
455 numstr = mobj.group(1)
456 if numstr.startswith(u'x'):
458 numstr = u'0%s' % numstr
461 return unichr(long(numstr, base))
463 # Unknown entity in name, return its literal representation
464 return (u'&%s;' % entity)
	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_webpage_download(self, video_id):
		"""Report attempt to download the watch page of a video."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_video_url(self, video_id, video_real_url):
		"""Report the extracted real (downloadable) video URL."""
		self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
490 def _real_initialize(self):
491 if self._downloader is None:
496 downloader_params = self._downloader.params
498 # Attempt to use provided username and password or .netrc data
499 if downloader_params.get('username', None) is not None:
500 username = downloader_params['username']
501 password = downloader_params['password']
502 elif downloader_params.get('usenetrc', False):
504 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
509 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
510 except (IOError, netrc.NetrcParseError), err:
511 self._downloader.trouble(u'WARNING: parsing .netrc: %s' % str(err))
515 request = urllib2.Request(self._LANG_URL, None, std_headers)
518 urllib2.urlopen(request).read()
519 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
520 self._downloader.trouble(u'WARNING: unable to set language: %s' % str(err))
523 # No authentication to be performed
529 'current_form': 'loginForm',
531 'action_login': 'Log In',
532 'username': username,
533 'password': password,
535 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
538 login_results = urllib2.urlopen(request).read()
539 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
540 self._downloader.trouble(u'WARNING: unable to log in: bad username or password')
542 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
543 self._downloader.trouble(u'WARNING: unable to log in: %s' % str(err))
549 'action_confirm': 'Confirm',
551 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
553 self.report_age_confirmation()
554 age_results = urllib2.urlopen(request).read()
555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
556 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
559 def _real_extract(self, url):
560 # Extract video id from URL
561 mobj = re.match(self._VALID_URL, url)
563 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
565 video_id = mobj.group(2)
567 # Downloader parameters
569 if self._downloader is not None:
570 params = self._downloader.params
571 format_param = params.get('format', None)
578 }.get(format_param, 'flv')
580 # Normalize URL, including format
581 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
582 if format_param is not None:
583 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
584 request = urllib2.Request(normalized_url, None, std_headers)
586 self.report_webpage_download(video_id)
587 video_webpage = urllib2.urlopen(request).read()
588 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
589 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
591 self.report_information_extraction(video_id)
594 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
596 self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
598 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
599 if format_param is not None:
600 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
601 self.report_video_url(video_id, video_real_url)
604 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
606 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
608 video_uploader = mobj.group(1)
611 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
613 self._downloader.trouble(u'ERROR: unable to extract video title')
615 video_title = mobj.group(1).decode('utf-8')
616 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
617 video_title = video_title.replace(os.sep, u'%')
620 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
621 simple_title = simple_title.strip(ur'_')
623 # Process video information
625 'id': video_id.decode('utf-8'),
626 'url': video_real_url.decode('utf-8'),
627 'uploader': video_uploader.decode('utf-8'),
628 'title': video_title,
629 'stitle': simple_title,
630 'ext': video_extension.decode('utf-8'),
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the simplified-title slug from the URL.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Family-filter page fetched first so the session can bypass the filter.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Keeps a YoutubeIE around for yt-hosted videos."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
646 return (re.match(MetacafeIE._VALID_URL, url) is not None)
	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
664 def _real_initialize(self):
665 # Retrieve disclaimer
666 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
668 self.report_disclaimer()
669 disclaimer = urllib2.urlopen(request).read()
670 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
671 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
677 'submit': "Continue - I'm over 18",
679 request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
681 self.report_age_confirmation()
682 disclaimer = urllib2.urlopen(request).read()
683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
684 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
687 def _real_extract(self, url):
688 # Extract id and simplified title from URL
689 mobj = re.match(self._VALID_URL, url)
691 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
694 video_id = mobj.group(1)
696 # Check if video comes from YouTube
697 mobj2 = re.match(r'^yt-(.*)$', video_id)
698 if mobj2 is not None:
699 return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
701 simple_title = mobj.group(2).decode('utf-8')
702 video_extension = 'flv'
704 # Retrieve video webpage to extract further information
705 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
707 self.report_download_webpage(video_id)
708 webpage = urllib2.urlopen(request).read()
709 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
710 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
713 # Extract URL, uploader and title from webpage
714 self.report_extraction(video_id)
715 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
717 self._downloader.trouble(u'ERROR: unable to extract media URL')
719 mediaURL = mobj.group(1).replace('\\', '')
721 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
723 self._downloader.trouble(u'ERROR: unable to extract gdaKey')
725 gdaKey = mobj.group(1)
727 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
729 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
731 self._downloader.trouble(u'ERROR: unable to extract title')
733 video_title = mobj.group(1).decode('utf-8')
735 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
737 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
739 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
743 'id': video_id.decode('utf-8'),
744 'url': video_url.decode('utf-8'),
745 'uploader': video_uploader.decode('utf-8'),
746 'title': video_title,
747 'stitle': simple_title,
748 'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries."""
	# Queries look like "ytsearch:foo", "ytsearch10:foo" or "ytsearchall:foo".
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	# Anchor fragment marking one result on the results page.
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	# Present in the page while there is a further results page.
	_MORE_PAGES_INDICATOR = r'>Next</a>'
	# Hard cap applied to 'ytsearchall' and oversized numeric requests.
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Delegates per-video extraction to youtube_ie."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
767 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
	def report_download_page(self, query, pagenum):
		"""Report attempt to download the search results page with given number."""
		self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		# Language/login/age handling all lives in the YouTube extractor.
		self._youtube_ie.initialize()
776 def _real_extract(self, query):
777 mobj = re.match(self._VALID_QUERY, query)
779 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
782 prefix, query = query.split(':')
785 return self._download_n_results(query, 1)
786 elif prefix == 'all':
787 return self._download_n_results(query, self._max_youtube_results)
792 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
794 elif n > self._max_youtube_results:
795 self._downloader.trouble(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
796 n = self._max_youtube_results
797 return self._download_n_results(query, n)
798 except ValueError: # parsing prefix as int fails
799 return self._download_n_results(query, 1)
801 def _download_n_results(self, query, n):
802 """Downloads a specified number of results for a query"""
809 self.report_download_page(query, pagenum)
810 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
811 request = urllib2.Request(result_url, None, std_headers)
813 page = urllib2.urlopen(request).read()
814 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
815 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
818 # Extract video identifiers
819 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
820 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
821 if video_id not in already_seen:
822 video_ids.append(video_id)
823 already_seen.add(video_id)
824 if len(video_ids) == n:
825 # Specified n videos reached
828 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
831 if self._MORE_PAGES_INDICATOR not in page:
834 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
837 pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# Group 1 is the playlist id.
	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
	_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
	# Group 1 is the video id of one playlist entry.
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	# Filled with (playlist_id, next_page); its presence means there is a
	# further playlist page to fetch.
	_MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Delegates per-video extraction to youtube_ie."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
854 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		# Language/login/age handling all lives in the YouTube extractor.
		self._youtube_ie.initialize()
863 def _real_extract(self, url):
864 # Extract playlist id
865 mobj = re.match(self._VALID_URL, url)
867 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
870 # Download playlist pages
871 playlist_id = mobj.group(1)
876 self.report_download_page(playlist_id, pagenum)
877 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
879 page = urllib2.urlopen(request).read()
880 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
881 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
884 # Extract video identifiers
886 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
887 if mobj.group(1) not in ids_in_page:
888 ids_in_page.append(mobj.group(1))
889 video_ids.extend(ids_in_page)
891 if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
893 pagenum = pagenum + 1
897 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	one.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		returned by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader.
		"""
		return information # by default, do nothing
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse
		import locale

		# General configuration
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line.
		# NOTE(review): the reviewed revision had unclosed OptionParser( and
		# FileDownloader({ calls and a dangling 'except DownloadError:'; the
		# try: wrapper, closers and final sys.exit were restored.
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			conflict_handler='resolve',
			)
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		parser.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		parser.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		parser.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		parser.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		parser.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		parser.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		parser.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		parser.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option('-f', '--format',
				dest='format', metavar='FMT', help='video format code')
		parser.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		parser.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		parser.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if len(all_urls) < 1:
			sys.exit(u'ERROR: you must provide at least one URL')
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			sys.exit(u'ERROR: account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			sys.exit(u'ERROR: using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			sys.exit(u'ERROR: using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				sys.exit(u'ERROR: invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)

		# Charset used to decode the output template given on the command line
		charset = locale.getpreferredencoding()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			})
		# Order matters: the more specific extractors must be tried first.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')