youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # License: Public domain code
   7 import htmlentitydefs
   8 import httplib
   9 import locale
  10 import math
  11 import netrc
  12 import os
  13 import os.path
  14 import re
  15 import socket
  16 import string
  17 import subprocess
  18 import sys
  19 import time
  20 import urllib
  21 import urllib2
  22
  23 # parse_qs was moved from the cgi module to the urlparse module recently.
  24 try:
  25         from urlparse import parse_qs
  26 except ImportError:
  27         from cgi import parse_qs
  28
  29 std_headers = {
  30         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
  31         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  32         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  33         'Accept-Language': 'en-us,en;q=0.5',
  34 }
  35
  36 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  37
  38 def preferredencoding():
  39         """Get preferred encoding.
  40
  41         Returns the best encoding scheme for the system, based on
  42         locale.getpreferredencoding() and some further tweaks.
  43         """
  44         def yield_preferredencoding():
  45                 try:
  46                         pref = locale.getpreferredencoding()
  47                         u'TEST'.encode(pref)
  48                 except:
  49                         pref = 'UTF-8'
  50                 while True:
  51                         yield pref
  52         return yield_preferredencoding().next()
  53
  54 def htmlentity_transform(matchobj):
  55         """Transforms an HTML entity to a Unicode character.
  56
  57         This function receives a match object and is intended to be used with
  58         the re.sub() function.
  59         """
  60         entity = matchobj.group(1)
  61
  62         # Known non-numeric HTML entity
  63         if entity in htmlentitydefs.name2codepoint:
  64                 return unichr(htmlentitydefs.name2codepoint[entity])
  65
  66         # Unicode character
  67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  68         if mobj is not None:
  69                 numstr = mobj.group(1)
  70                 if numstr.startswith(u'x'):
  71                         base = 16
  72                         numstr = u'0%s' % numstr
  73                 else:
  74                         base = 10
  75                 return unichr(long(numstr, base))
  76
  77         # Unknown entity in name, return its literal representation
  78         return (u'&%s;' % entity)
  79
  80 def sanitize_title(utitle):
  81         """Sanitizes a video title so it could be used as part of a filename.
  82
  83         This triggers different transformations based on the platform we
  84         are running.
  85         """
  86         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  87         if sys.platform == 'win32':
  88                 return re.replace(ur'<>:"\|\?\*', u'-', title)
  89         return utitle.replace(unicode(os.sep), u'%')
  90
  91 class DownloadError(Exception):
  92         """Download Error exception.
  93
  94         This exception may be thrown by FileDownloader objects if they are not
  95         configured to continue on errors. They will contain the appropriate
  96         error message.
  97         """
  98         pass
  99
 100 class SameFileError(Exception):
 101         """Same File exception.
 102
 103         This exception will be thrown by FileDownloader objects if they detect
 104         multiple files would have to be downloaded to the same file on disk.
 105         """
 106         pass
 107
 108 class PostProcessingError(Exception):
 109         """Post Processing exception.
 110
 111         This exception may be raised by PostProcessor's .run() method to
 112         indicate an error in the postprocessing task.
 113         """
 114         pass
 115
 116 class UnavailableFormatError(Exception):
 117         """Unavailable Format exception.
 118
 119         This exception will be thrown when a video is requested
 120         in a format that is not available for that video.
 121         """
 122         pass
 123
 124 class ContentTooShortError(Exception):
 125         """Content Too Short exception.
 126
 127         This exception may be raised by FileDownloader objects when a file they
 128         download is too small for what the server announced first, indicating
 129         the connection was probably interrupted.
 130         """
 131         # Both in bytes
 132         downloaded = None
 133         expected = None
 134
 135         def __init__(self, downloaded, expected):
 136                 self.downloaded = downloaded
 137                 self.expected = expected
 138
 139 class FileDownloader(object):
 140         """File Downloader class.
 141
 142         File downloader objects are the ones responsible of downloading the
 143         actual video file and writing it to disk if the user has requested
 144         it, among some other tasks. In most cases there should be one per
 145         program. As, given a video URL, the downloader doesn't know how to
 146         extract all the needed information, task that InfoExtractors do, it
 147         has to pass the URL to one of them.
 148
 149         For this, file downloader objects have a method that allows
 150         InfoExtractors to be registered in a given order. When it is passed
 151         a URL, the file downloader handles it to the first InfoExtractor it
 152         finds that reports being able to handle it. The InfoExtractor extracts
 153         all the information about the video or videos the URL refers to, and
 154         asks the FileDownloader to process the video information, possibly
 155         downloading the video.
 156
 157         File downloaders accept a lot of parameters. In order not to saturate
 158         the object constructor with arguments, it receives a dictionary of
 159         options instead. These options are available through the params
 160         attribute for the InfoExtractors to use. The FileDownloader also
 161         registers itself as the downloader in charge for the InfoExtractors
 162         that are added to it, so this is a "mutual registration".
 163
 164         Available options:
 165
 166         username:       Username for authentication purposes.
 167         password:       Password for authentication purposes.
 168         usenetrc:       Use netrc for authentication instead.
 169         quiet:          Do not print messages to stdout.
 170         forceurl:       Force printing final URL.
 171         forcetitle:     Force printing title.
 172         simulate:       Do not download the video files.
 173         format:         Video format code.
 174         outtmpl:        Template for output names.
 175         ignoreerrors:   Do not stop on download errors.
 176         ratelimit:      Download speed limit, in bytes/sec.
 177         nooverwrites:   Prevent overwriting files.
 178         continuedl:     Try to continue downloads if possible.
 179         """
 180
 181         params = None
 182         _ies = []
 183         _pps = []
 184         _download_retcode = None
 185
 186         def __init__(self, params):
 187                 """Create a FileDownloader object with the given options."""
 188                 self._ies = []
 189                 self._pps = []
 190                 self._download_retcode = 0
 191                 self.params = params
 192
 193         @staticmethod
 194         def pmkdir(filename):
 195                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 196                 components = filename.split(os.sep)
 197                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 198                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 199                 for dir in aggregate:
 200                         if not os.path.exists(dir):
 201                                 os.mkdir(dir)
 202
 203         @staticmethod
 204         def format_bytes(bytes):
 205                 if bytes is None:
 206                         return 'N/A'
 207                 if type(bytes) is str:
 208                         bytes = float(bytes)
 209                 if bytes == 0.0:
 210                         exponent = 0
 211                 else:
 212                         exponent = long(math.log(bytes, 1024.0))
 213                 suffix = 'bkMGTPEZY'[exponent]
 214                 converted = float(bytes) / float(1024**exponent)
 215                 return '%.2f%s' % (converted, suffix)
 216
 217         @staticmethod
 218         def calc_percent(byte_counter, data_len):
 219                 if data_len is None:
 220                         return '---.-%'
 221                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 222
 223         @staticmethod
 224         def calc_eta(start, now, total, current):
 225                 if total is None:
 226                         return '--:--'
 227                 dif = now - start
 228                 if current == 0 or dif < 0.001: # One millisecond
 229                         return '--:--'
 230                 rate = float(current) / dif
 231                 eta = long((float(total) - float(current)) / rate)
 232                 (eta_mins, eta_secs) = divmod(eta, 60)
 233                 if eta_mins > 99:
 234                         return '--:--'
 235                 return '%02d:%02d' % (eta_mins, eta_secs)
 236
 237         @staticmethod
 238         def calc_speed(start, now, bytes):
 239                 dif = now - start
 240                 if bytes == 0 or dif < 0.001: # One millisecond
 241                         return '%10s' % '---b/s'
 242                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 243
 244         @staticmethod
 245         def best_block_size(elapsed_time, bytes):
 246                 new_min = max(bytes / 2.0, 1.0)
 247                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 248                 if elapsed_time < 0.001:
 249                         return long(new_max)
 250                 rate = bytes / elapsed_time
 251                 if rate > new_max:
 252                         return long(new_max)
 253                 if rate < new_min:
 254                         return long(new_min)
 255                 return long(rate)
 256
 257         @staticmethod
 258         def parse_bytes(bytestr):
 259                 """Parse a string indicating a byte quantity into a long integer."""
 260                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 261                 if matchobj is None:
 262                         return None
 263                 number = float(matchobj.group(1))
 264                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 265                 return long(round(number * multiplier))
 266
 267         @staticmethod
 268         def verify_url(url):
 269                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
 270                 request = urllib2.Request(url, None, std_headers)
 271                 data = urllib2.urlopen(request)
 272                 data.read(1)
 273                 url = data.geturl()
 274                 data.close()
 275                 return url
 276
 277         def add_info_extractor(self, ie):
 278                 """Add an InfoExtractor object to the end of the list."""
 279                 self._ies.append(ie)
 280                 ie.set_downloader(self)
 281
 282         def add_post_processor(self, pp):
 283                 """Add a PostProcessor object to the end of the chain."""
 284                 self._pps.append(pp)
 285                 pp.set_downloader(self)
 286
 287         def to_stdout(self, message, skip_eol=False):
 288                 """Print message to stdout if not in quiet mode."""
 289                 if not self.params.get('quiet', False):
 290                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
 291                         sys.stdout.flush()
 292
 293         def to_stderr(self, message):
 294                 """Print message to stderr."""
 295                 print >>sys.stderr, message.encode(preferredencoding())
 296
 297         def fixed_template(self):
 298                 """Checks if the output template is fixed."""
 299                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 300
 301         def trouble(self, message=None):
 302                 """Determine action to take when a download problem appears.
 303
 304                 Depending on if the downloader has been configured to ignore
 305                 download errors or not, this method may throw an exception or
 306                 not when errors are found, after printing the message.
 307                 """
 308                 if message is not None:
 309                         self.to_stderr(message)
 310                 if not self.params.get('ignoreerrors', False):
 311                         raise DownloadError(message)
 312                 self._download_retcode = 1
 313
 314         def slow_down(self, start_time, byte_counter):
 315                 """Sleep if the download speed is over the rate limit."""
 316                 rate_limit = self.params.get('ratelimit', None)
 317                 if rate_limit is None or byte_counter == 0:
 318                         return
 319                 now = time.time()
 320                 elapsed = now - start_time
 321                 if elapsed <= 0.0:
 322                         return
 323                 speed = float(byte_counter) / elapsed
 324                 if speed > rate_limit:
 325                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 326
 327         def report_destination(self, filename):
 328                 """Report destination filename."""
 329                 self.to_stdout(u'[download] Destination: %s' % filename)
 330
 331         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 332                 """Report download progress."""
 333                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 334                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 335
 336         def report_resuming_byte(self, resume_len):
 337                 """Report attemtp to resume at given byte."""
 338                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
 339
 340         def report_file_already_downloaded(self, file_name):
 341                 """Report file has already been fully downloaded."""
 342                 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
 343
 344         def report_unable_to_resume(self):
 345                 """Report it was impossible to resume download."""
 346                 self.to_stdout(u'[download] Unable to resume')
 347
 348         def report_finish(self):
 349                 """Report download finished."""
 350                 self.to_stdout(u'')
 351
 352         def process_info(self, info_dict):
 353                 """Process a single dictionary returned by an InfoExtractor."""
 354                 # Do nothing else if in simulate mode
 355                 if self.params.get('simulate', False):
 356                         # Verify URL if it's an HTTP one
 357                         if info_dict['url'].startswith('http'):
 358                                 try:
 359                                         info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
 360                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
 361                                         raise UnavailableFormatError
 362
 363                         # Forced printings
 364                         if self.params.get('forcetitle', False):
 365                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 366                         if self.params.get('forceurl', False):
 367                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 368
 369                         return
 370
 371                 try:
 372                         template_dict = dict(info_dict)
 373                         template_dict['epoch'] = unicode(long(time.time()))
 374                         filename = self.params['outtmpl'] % template_dict
 375                 except (ValueError, KeyError), err:
 376                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 377                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 378                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
 379                         return
 380
 381                 try:
 382                         self.pmkdir(filename)
 383                 except (OSError, IOError), err:
 384                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 385                         return
 386
 387                 try:
 388                         success = self._do_download(filename, info_dict['url'].encode('utf-8'))
 389                 except (OSError, IOError), err:
 390                         raise UnavailableFormatError
 391                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 392                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 393                         return
 394                 except (ContentTooShortError, ), err:
 395                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 396                         return
 397
 398                 if success:
 399                         try:
 400                                 self.post_process(filename, info_dict)
 401                         except (PostProcessingError), err:
 402                                 self.trouble('ERROR: postprocessing: %s' % str(err))
 403                                 return
 404
 405         def download(self, url_list):
 406                 """Download a given list of URLs."""
 407                 if len(url_list) > 1 and self.fixed_template():
 408                         raise SameFileError(self.params['outtmpl'])
 409
 410                 for url in url_list:
 411                         suitable_found = False
 412                         for ie in self._ies:
 413                                 # Go to next InfoExtractor if not suitable
 414                                 if not ie.suitable(url):
 415                                         continue
 416
 417                                 # Suitable InfoExtractor found
 418                                 suitable_found = True
 419
 420                                 # Extract information from URL and process it
 421                                 ie.extract(url)
 422
 423                                 # Suitable InfoExtractor had been found; go to next URL
 424                                 break
 425
 426                         if not suitable_found:
 427                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 428
 429                 return self._download_retcode
 430
 431         def post_process(self, filename, ie_info):
 432                 """Run the postprocessing chain on the given file."""
 433                 info = dict(ie_info)
 434                 info['filepath'] = filename
 435                 for pp in self._pps:
 436                         info = pp.run(info)
 437                         if info is None:
 438                                 break
 439
 440         def _download_with_rtmpdump(self, filename, url):
 441                 self.report_destination(filename)
 442
 443                 # Check for rtmpdump first
 444                 try:
 445                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 446                 except (OSError, IOError):
 447                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 448                         return False
 449
 450                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 451                 # the connection was interrumpted and resuming appears to be
 452                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 453                 basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
 454                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 455                 while retval == 2 or retval == 1:
 456                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
 457                         time.sleep(2.0) # This seems to be needed
 458                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 459                 if retval == 0:
 460                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
 461                         return True
 462                 else:
 463                         self.trouble('ERROR: rtmpdump exited with code %d' % retval)
 464                         return False
 465
 466         def _do_download(self, filename, url):
 467                 # Attempt to download using rtmpdump
 468                 if url.startswith('rtmp'):
 469                         return self._download_with_rtmpdump(filename, url)
 470
 471                 stream = None
 472                 open_mode = 'wb'
 473                 basic_request = urllib2.Request(url, None, std_headers)
 474                 request = urllib2.Request(url, None, std_headers)
 475
 476                 # Establish possible resume length
 477                 if os.path.isfile(filename):
 478                         resume_len = os.path.getsize(filename)
 479                 else:
 480                         resume_len = 0
 481
 482                 # Request parameters in case of being able to resume
 483                 if self.params.get('continuedl', False) and resume_len != 0:
 484                         self.report_resuming_byte(resume_len)
 485                         request.add_header('Range','bytes=%d-' % resume_len)
 486                         open_mode = 'ab'
 487
 488                 # Establish connection
 489                 try:
 490                         data = urllib2.urlopen(request)
 491                 except (urllib2.HTTPError, ), err:
 492                         if err.code != 416: #  416 is 'Requested range not satisfiable'
 493                                 raise
 494                         # Unable to resume
 495                         data = urllib2.urlopen(basic_request)
 496                         content_length = data.info()['Content-Length']
 497
 498                         if content_length is not None and long(content_length) == resume_len:
 499                                 # Because the file had already been fully downloaded
 500                                 self.report_file_already_downloaded(filename)
 501                                 return True
 502                         else:
 503                                 # Because the server didn't let us
 504                                 self.report_unable_to_resume()
 505                                 open_mode = 'wb'
 506
 507                 data_len = data.info().get('Content-length', None)
 508                 data_len_str = self.format_bytes(data_len)
 509                 byte_counter = 0
 510                 block_size = 1024
 511                 start = time.time()
 512                 while True:
 513                         # Download and write
 514                         before = time.time()
 515                         data_block = data.read(block_size)
 516                         after = time.time()
 517                         data_block_len = len(data_block)
 518                         if data_block_len == 0:
 519                                 break
 520                         byte_counter += data_block_len
 521
 522                         # Open file just in time
 523                         if stream is None:
 524                                 try:
 525                                         stream = open(filename, open_mode)
 526                                         self.report_destination(filename)
 527                                 except (OSError, IOError), err:
 528                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 529                                         return False
 530                         stream.write(data_block)
 531                         block_size = self.best_block_size(after - before, data_block_len)
 532
 533                         # Progress message
 534                         percent_str = self.calc_percent(byte_counter, data_len)
 535                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 536                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 537                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 538
 539                         # Apply rate limit
 540                         self.slow_down(start, byte_counter)
 541
 542                 self.report_finish()
 543                 if data_len is not None and str(byte_counter) != data_len:
 544                         raise ContentTooShortError(byte_counter, long(data_len))
 545                 return True
 546
 547 class InfoExtractor(object):
 548         """Information Extractor class.
 549
 550         Information extractors are the classes that, given a URL, extract
 551         information from the video (or videos) the URL refers to. This
 552         information includes the real video URL, the video title and simplified
 553         title, author and others. The information is stored in a dictionary
 554         which is then passed to the FileDownloader. The FileDownloader
 555         processes this information possibly downloading the video to the file
 556         system, among other possible outcomes. The dictionaries must include
 557         the following fields:
 558
 559         id:             Video identifier.
 560         url:            Final video URL.
 561         uploader:       Nickname of the video uploader.
 562         title:          Literal title.
 563         stitle:         Simplified title.
 564         ext:            Video filename extension.
 565
 566         Subclasses of this one should re-define the _real_initialize() and
 567         _real_extract() methods, as well as the suitable() static method.
 568         Probably, they should also be instantiated and added to the main
 569         downloader.
 570         """
 571
 572         _ready = False
 573         _downloader = None
 574
 575         def __init__(self, downloader=None):
 576                 """Constructor. Receives an optional downloader."""
 577                 self._ready = False
 578                 self.set_downloader(downloader)
 579
 580         @staticmethod
 581         def suitable(url):
 582                 """Receives a URL and returns True if suitable for this IE."""
 583                 return False
 584
 585         def initialize(self):
 586                 """Initializes an instance (authentication, etc)."""
 587                 if not self._ready:
 588                         self._real_initialize()
 589                         self._ready = True
 590
 591         def extract(self, url):
 592                 """Extracts URL information and returns it in list of dicts."""
 593                 self.initialize()
 594                 return self._real_extract(url)
 595
 596         def set_downloader(self, downloader):
 597                 """Sets the downloader for this IE."""
 598                 self._downloader = downloader
 599
 600         def _real_initialize(self):
 601                 """Real initialization process. Redefine in subclasses."""
 602                 pass
 603
 604         def _real_extract(self, url):
 605                 """Real extraction process. Redefine in subclasses."""
 606                 pass
 607
 608 class YoutubeIE(InfoExtractor):
 609         """Information extractor for youtube.com."""
 610
 611         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 612         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 613         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 614         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 615         _NETRC_MACHINE = 'youtube'
 616         _available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
 617         _video_extensions = {
 618                 '13': '3gp',
 619                 '17': 'mp4',
 620                 '18': 'mp4',
 621                 '22': 'mp4',
 622                 '37': 'mp4',
 623         }
 624
 625         @staticmethod
 626         def suitable(url):
 627                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 628
 629         def report_lang(self):
 630                 """Report attempt to set language."""
 631                 self._downloader.to_stdout(u'[youtube] Setting language')
 632
 633         def report_login(self):
 634                 """Report attempt to log in."""
 635                 self._downloader.to_stdout(u'[youtube] Logging in')
 636
 637         def report_age_confirmation(self):
 638                 """Report attempt to confirm age."""
 639                 self._downloader.to_stdout(u'[youtube] Confirming age')
 640
 641         def report_video_info_webpage_download(self, video_id):
 642                 """Report attempt to download video info webpage."""
 643                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
 644
 645         def report_information_extraction(self, video_id):
 646                 """Report attempt to extract video information."""
 647                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 648
 649         def report_unavailable_format(self, video_id, format):
 650                 """Report extracted video URL."""
 651                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 652
 653         def report_rtmp_download(self):
 654                 """Indicate the download will use the RTMP protocol."""
 655                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
 656
 657         def _real_initialize(self):
 658                 if self._downloader is None:
 659                         return
 660
 661                 username = None
 662                 password = None
 663                 downloader_params = self._downloader.params
 664
 665                 # Attempt to use provided username and password or .netrc data
 666                 if downloader_params.get('username', None) is not None:
 667                         username = downloader_params['username']
 668                         password = downloader_params['password']
 669                 elif downloader_params.get('usenetrc', False):
 670                         try:
 671                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 672                                 if info is not None:
 673                                         username = info[0]
 674                                         password = info[2]
 675                                 else:
 676                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 677                         except (IOError, netrc.NetrcParseError), err:
 678                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 679                                 return
 680
 681                 # Set language
 682                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 683                 try:
 684                         self.report_lang()
 685                         urllib2.urlopen(request).read()
 686                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 687                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 688                         return
 689
 690                 # No authentication to be performed
 691                 if username is None:
 692                         return
 693
 694                 # Log in
 695                 login_form = {
 696                                 'current_form': 'loginForm',
 697                                 'next':         '/',
 698                                 'action_login': 'Log In',
 699                                 'username':     username,
 700                                 'password':     password,
 701                                 }
 702                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 703                 try:
 704                         self.report_login()
 705                         login_results = urllib2.urlopen(request).read()
 706                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 707                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 708                                 return
 709                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 710                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 711                         return
 712
 713                 # Confirm age
 714                 age_form = {
 715                                 'next_url':             '/',
 716                                 'action_confirm':       'Confirm',
 717                                 }
 718                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 719                 try:
 720                         self.report_age_confirmation()
 721                         age_results = urllib2.urlopen(request).read()
 722                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 723                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 724                         return
 725
 726         def _real_extract(self, url):
 727                 # Extract video id from URL
 728                 mobj = re.match(self._VALID_URL, url)
 729                 if mobj is None:
 730                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 731                         return
 732                 video_id = mobj.group(2)
 733
 734                 # Downloader parameters
 735                 best_quality = False
 736                 format_param = None
 737                 quality_index = 0
 738                 if self._downloader is not None:
 739                         params = self._downloader.params
 740                         format_param = params.get('format', None)
 741                         if format_param == '0':
 742                                 format_param = self._available_formats[quality_index]
 743                                 best_quality = True
 744
 745                 while True:
 746                         # Extension
 747                         video_extension = self._video_extensions.get(format_param, 'flv')
 748
 749                         # Get video info
 750                         video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
 751                         request = urllib2.Request(video_info_url, None, std_headers)
 752                         try:
 753                                 self.report_video_info_webpage_download(video_id)
 754                                 video_info_webpage = urllib2.urlopen(request).read()
 755                                 video_info = parse_qs(video_info_webpage)
 756                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 757                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 758                                 return
 759                         self.report_information_extraction(video_id)
 760
 761                         # "t" param
 762                         if 'token' not in video_info:
 763                                 # Attempt to see if YouTube has issued an error message
 764                                 if 'reason' not in video_info:
 765                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
 766                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
 767                                         stream.write(video_info_webpage)
 768                                         stream.close()
 769                                 else:
 770                                         reason = urllib.unquote_plus(video_info['reason'][0])
 771                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
 772                                 return
 773                         token = urllib.unquote_plus(video_info['token'][0])
 774                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
 775                         if format_param is not None:
 776                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 777
 778                         # Check possible RTMP download
 779                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 780                                 self.report_rtmp_download()
 781                                 video_real_url = video_info['conn'][0]
 782
 783                         # uploader
 784                         if 'author' not in video_info:
 785                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 786                                 return
 787                         video_uploader = urllib.unquote_plus(video_info['author'][0])
 788
 789                         # title
 790                         if 'title' not in video_info:
 791                                 self._downloader.trouble(u'ERROR: unable to extract video title')
 792                                 return
 793                         video_title = urllib.unquote_plus(video_info['title'][0])
 794                         video_title = video_title.decode('utf-8')
 795                         video_title = sanitize_title(video_title)
 796
 797                         # simplified title
 798                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 799                         simple_title = simple_title.strip(ur'_')
 800
 801                         try:
 802                                 # Process video information
 803                                 self._downloader.process_info({
 804                                         'id':           video_id.decode('utf-8'),
 805                                         'url':          video_real_url.decode('utf-8'),
 806                                         'uploader':     video_uploader.decode('utf-8'),
 807                                         'title':        video_title,
 808                                         'stitle':       simple_title,
 809                                         'ext':          video_extension.decode('utf-8'),
 810                                 })
 811
 812                                 return
 813
 814                         except UnavailableFormatError, err:
 815                                 if best_quality:
 816                                         if quality_index == len(self._available_formats) - 1:
 817                                                 # I don't ever expect this to happen
 818                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 819                                                 return
 820                                         else:
 821                                                 self.report_unavailable_format(video_id, format_param)
 822                                                 quality_index += 1
 823                                                 format_param = self._available_formats[quality_index]
 824                                                 continue
 825                                 else:
 826                                         self._downloader.trouble('ERROR: format not available for video')
 827                                         return
 828
 829
 830 class MetacafeIE(InfoExtractor):
 831         """Information Extractor for metacafe.com."""
 832
 833         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 834         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 835         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 836         _youtube_ie = None
 837
 838         def __init__(self, youtube_ie, downloader=None):
 839                 InfoExtractor.__init__(self, downloader)
 840                 self._youtube_ie = youtube_ie
 841
 842         @staticmethod
 843         def suitable(url):
 844                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 845
 846         def report_disclaimer(self):
 847                 """Report disclaimer retrieval."""
 848                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 849
 850         def report_age_confirmation(self):
 851                 """Report attempt to confirm age."""
 852                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 853
 854         def report_download_webpage(self, video_id):
 855                 """Report webpage download."""
 856                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 857
 858         def report_extraction(self, video_id):
 859                 """Report information extraction."""
 860                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 861
 862         def _real_initialize(self):
 863                 # Retrieve disclaimer
 864                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 865                 try:
 866                         self.report_disclaimer()
 867                         disclaimer = urllib2.urlopen(request).read()
 868                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 869                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 870                         return
 871
 872                 # Confirm age
 873                 disclaimer_form = {
 874                         'filters': '0',
 875                         'submit': "Continue - I'm over 18",
 876                         }
 877                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 878                 try:
 879                         self.report_age_confirmation()
 880                         disclaimer = urllib2.urlopen(request).read()
 881                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 882                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 883                         return
 884
 885         def _real_extract(self, url):
 886                 # Extract id and simplified title from URL
 887                 mobj = re.match(self._VALID_URL, url)
 888                 if mobj is None:
 889                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 890                         return
 891
 892                 video_id = mobj.group(1)
 893
 894                 # Check if video comes from YouTube
 895                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 896                 if mobj2 is not None:
 897                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 898                         return
 899
 900                 simple_title = mobj.group(2).decode('utf-8')
 901                 video_extension = 'flv'
 902
 903                 # Retrieve video webpage to extract further information
 904                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 905                 try:
 906                         self.report_download_webpage(video_id)
 907                         webpage = urllib2.urlopen(request).read()
 908                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 909                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 910                         return
 911
 912                 # Extract URL, uploader and title from webpage
 913                 self.report_extraction(video_id)
 914                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 915                 if mobj is None:
 916                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 917                         return
 918                 mediaURL = urllib.unquote(mobj.group(1))
 919
 920                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 921                 #if mobj is None:
 922                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
 923                 #       return
 924                 #gdaKey = mobj.group(1)
 925                 #
 926                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 927
 928                 video_url = mediaURL
 929
 930                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 931                 if mobj is None:
 932                         self._downloader.trouble(u'ERROR: unable to extract title')
 933                         return
 934                 video_title = mobj.group(1).decode('utf-8')
 935                 video_title = sanitize_title(video_title)
 936
 937                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 938                 if mobj is None:
 939                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 940                         return
 941                 video_uploader = mobj.group(1)
 942
 943                 try:
 944                         # Process video information
 945                         self._downloader.process_info({
 946                                 'id':           video_id.decode('utf-8'),
 947                                 'url':          video_url.decode('utf-8'),
 948                                 'uploader':     video_uploader.decode('utf-8'),
 949                                 'title':        video_title,
 950                                 'stitle':       simple_title,
 951                                 'ext':          video_extension.decode('utf-8'),
 952                         })
 953                 except UnavailableFormatError:
 954                         self._downloader.trouble(u'ERROR: format not available for video')
 955
 956
 957 class GoogleIE(InfoExtractor):
 958         """Information extractor for video.google.com."""
 959
 960         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 961
 962         def __init__(self, downloader=None):
 963                 InfoExtractor.__init__(self, downloader)
 964
 965         @staticmethod
 966         def suitable(url):
 967                 return (re.match(GoogleIE._VALID_URL, url) is not None)
 968
 969         def report_download_webpage(self, video_id):
 970                 """Report webpage download."""
 971                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
 972
 973         def report_extraction(self, video_id):
 974                 """Report information extraction."""
 975                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
 976
 977         def _real_initialize(self):
 978                 return
 979
 980         def _real_extract(self, url):
 981                 # Extract id from URL
 982                 mobj = re.match(self._VALID_URL, url)
 983                 if mobj is None:
 984                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 985                         return
 986
 987                 video_id = mobj.group(1)
 988
 989                 video_extension = 'mp4'
 990
 991                 # Retrieve video webpage to extract further information
 992                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 993                 try:
 994                         self.report_download_webpage(video_id)
 995                         webpage = urllib2.urlopen(request).read()
 996                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 997                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 998                         return
 999
1000                 # Extract URL, uploader, and title from webpage
1001                 self.report_extraction(video_id)
1002                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1003                 if mobj is None:
1004                         video_extension = 'flv'
1005                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1006                 if mobj is None:
1007                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1008                         return
1009                 mediaURL = urllib.unquote(mobj.group(1))
1010                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1011                 mediaURL = mediaURL.replace('\\x26', '\x26')
1012
1013                 video_url = mediaURL
1014
1015                 mobj = re.search(r'<title>(.*)</title>', webpage)
1016                 if mobj is None:
1017                         self._downloader.trouble(u'ERROR: unable to extract title')
1018                         return
1019                 video_title = mobj.group(1).decode('utf-8')
1020                 video_title = sanitize_title(video_title)
1021
1022                 # Google Video doesn't show uploader nicknames?
1023                 video_uploader = 'NA'
1024
1025                 try:
1026                         # Process video information
1027                         self._downloader.process_info({
1028                                 'id':           video_id.decode('utf-8'),
1029                                 'url':          video_url.decode('utf-8'),
1030                                 'uploader':     video_uploader.decode('utf-8'),
1031                                 'title':        video_title,
1032                                 'stitle':       video_title,
1033                                 'ext':          video_extension.decode('utf-8'),
1034                         })
1035                 except UnavailableFormatError:
1036                         self._downloader.trouble(u'ERROR: format not available for video')
1037
1038
1039 class PhotobucketIE(InfoExtractor):
1040         """Information extractor for photobucket.com."""
1041
1042         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1043
1044         def __init__(self, downloader=None):
1045                 InfoExtractor.__init__(self, downloader)
1046
1047         @staticmethod
1048         def suitable(url):
1049                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1050
1051         def report_download_webpage(self, video_id):
1052                 """Report webpage download."""
1053                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1054
1055         def report_extraction(self, video_id):
1056                 """Report information extraction."""
1057                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1058
1059         def _real_initialize(self):
1060                 return
1061
1062         def _real_extract(self, url):
1063                 # Extract id from URL
1064                 mobj = re.match(self._VALID_URL, url)
1065                 if mobj is None:
1066                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1067                         return
1068
1069                 video_id = mobj.group(1)
1070
1071                 video_extension = 'flv'
1072
1073                 # Retrieve video webpage to extract further information
1074                 request = urllib2.Request(url)
1075                 try:
1076                         self.report_download_webpage(video_id)
1077                         webpage = urllib2.urlopen(request).read()
1078                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1079                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1080                         return
1081
1082                 # Extract URL, uploader, and title from webpage
1083                 self.report_extraction(video_id)
1084                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1085                 if mobj is None:
1086                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1087                         return
1088                 mediaURL = urllib.unquote(mobj.group(1))
1089
1090                 video_url = mediaURL
1091
1092                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1093                 if mobj is None:
1094                         self._downloader.trouble(u'ERROR: unable to extract title')
1095                         return
1096                 video_title = mobj.group(1).decode('utf-8')
1097                 video_title = sanitize_title(video_title)
1098
1099                 video_uploader = mobj.group(2).decode('utf-8')
1100
1101                 try:
1102                         # Process video information
1103                         self._downloader.process_info({
1104                                 'id':           video_id.decode('utf-8'),
1105                                 'url':          video_url.decode('utf-8'),
1106                                 'uploader':     video_uploader,
1107                                 'title':        video_title,
1108                                 'stitle':       video_title,
1109                                 'ext':          video_extension.decode('utf-8'),
1110                         })
1111                 except UnavailableFormatError:
1112                         self._downloader.trouble(u'ERROR: format not available for video')
1113
1114
1115 class GenericIE(InfoExtractor):
1116         """Generic last-resort information extractor."""
1117
1118         def __init__(self, downloader=None):
1119                 InfoExtractor.__init__(self, downloader)
1120
1121         @staticmethod
1122         def suitable(url):
1123                 return True
1124
1125         def report_download_webpage(self, video_id):
1126                 """Report webpage download."""
1127                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1128                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1129
1130         def report_extraction(self, video_id):
1131                 """Report information extraction."""
1132                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1133
1134         def _real_initialize(self):
1135                 return
1136
1137         def _real_extract(self, url):
1138                 video_id = url.split('/')[-1]
1139                 request = urllib2.Request(url)
1140                 try:
1141                         self.report_download_webpage(video_id)
1142                         webpage = urllib2.urlopen(request).read()
1143                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1144                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1145                         return
1146                 except ValueError, err:
1147                         # since this is the last-resort InfoExtractor, if
1148                         # this error is thrown, it'll be thrown here
1149                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1150                         return
1151
1152                 # Start with something easy: JW Player in SWFObject
1153                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1154                 if mobj is None:
1155                         # Broaden the search a little bit
1156                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1157                 if mobj is None:
1158                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1159                         return
1160
1161                 # It's possible that one of the regexes
1162                 # matched, but returned an empty group:
1163                 if mobj.group(1) is None:
1164                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1165                         return
1166
1167                 video_url = urllib.unquote(mobj.group(1))
1168                 video_id  = os.path.basename(video_url)
1169
1170                 # here's a fun little line of code for you:
1171                 video_extension = os.path.splitext(video_id)[1][1:]
1172                 video_id        = os.path.splitext(video_id)[0]
1173
1174                 # it's tempting to parse this further, but you would
1175                 # have to take into account all the variations like
1176                 #   Video Title - Site Name
1177                 #   Site Name | Video Title
1178                 #   Video Title - Tagline | Site Name
1179                 # and so on and so forth; it's just not practical
1180                 mobj = re.search(r'<title>(.*)</title>', webpage)
1181                 if mobj is None:
1182                         self._downloader.trouble(u'ERROR: unable to extract title')
1183                         return
1184                 video_title = mobj.group(1).decode('utf-8')
1185                 video_title = sanitize_title(video_title)
1186
1187                 # video uploader is domain name
1188                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1189                 if mobj is None:
1190                         self._downloader.trouble(u'ERROR: unable to extract title')
1191                         return
1192                 video_uploader = mobj.group(1).decode('utf-8')
1193
1194                 try:
1195                         # Process video information
1196                         self._downloader.process_info({
1197                                 'id':           video_id.decode('utf-8'),
1198                                 'url':          video_url.decode('utf-8'),
1199                                 'uploader':     video_uploader,
1200                                 'title':        video_title,
1201                                 'stitle':       video_title,
1202                                 'ext':          video_extension.decode('utf-8'),
1203                         })
1204                 except UnavailableFormatError:
1205                         self._downloader.trouble(u'ERROR: format not available for video')
1206
1207
1208 class YoutubeSearchIE(InfoExtractor):
1209         """Information Extractor for YouTube search queries."""
1210         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1211         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1212         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1213         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1214         _youtube_ie = None
1215         _max_youtube_results = 1000
1216
1217         def __init__(self, youtube_ie, downloader=None):
1218                 InfoExtractor.__init__(self, downloader)
1219                 self._youtube_ie = youtube_ie
1220
1221         @staticmethod
1222         def suitable(url):
1223                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1224
1225         def report_download_page(self, query, pagenum):
1226                 """Report attempt to download playlist page with given number."""
1227                 query = query.decode(preferredencoding())
1228                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1229
1230         def _real_initialize(self):
1231                 self._youtube_ie.initialize()
1232
1233         def _real_extract(self, query):
1234                 mobj = re.match(self._VALID_QUERY, query)
1235                 if mobj is None:
1236                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1237                         return
1238
1239                 prefix, query = query.split(':')
1240                 prefix = prefix[8:]
1241                 query  = query.encode('utf-8')
1242                 if prefix == '':
1243                         self._download_n_results(query, 1)
1244                         return
1245                 elif prefix == 'all':
1246                         self._download_n_results(query, self._max_youtube_results)
1247                         return
1248                 else:
1249                         try:
1250                                 n = long(prefix)
1251                                 if n <= 0:
1252                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1253                                         return
1254                                 elif n > self._max_youtube_results:
1255                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1256                                         n = self._max_youtube_results
1257                                 self._download_n_results(query, n)
1258                                 return
1259                         except ValueError: # parsing prefix as integer fails
1260                                 self._download_n_results(query, 1)
1261                                 return
1262
1263         def _download_n_results(self, query, n):
1264                 """Downloads a specified number of results for a query"""
1265
1266                 video_ids = []
1267                 already_seen = set()
1268                 pagenum = 1
1269
1270                 while True:
1271                         self.report_download_page(query, pagenum)
1272                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1273                         request = urllib2.Request(result_url, None, std_headers)
1274                         try:
1275                                 page = urllib2.urlopen(request).read()
1276                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1277                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1278                                 return
1279
1280                         # Extract video identifiers
1281                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1282                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1283                                 if video_id not in already_seen:
1284                                         video_ids.append(video_id)
1285                                         already_seen.add(video_id)
1286                                         if len(video_ids) == n:
1287                                                 # Specified n videos reached
1288                                                 for id in video_ids:
1289                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1290                                                 return
1291
1292                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1293                                 for id in video_ids:
1294                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1295                                 return
1296
1297                         pagenum = pagenum + 1
1298
1299 class YoutubePlaylistIE(InfoExtractor):
1300         """Information Extractor for YouTube playlists."""
1301
1302         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1303         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1304         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1305         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1306         _youtube_ie = None
1307
1308         def __init__(self, youtube_ie, downloader=None):
1309                 InfoExtractor.__init__(self, downloader)
1310                 self._youtube_ie = youtube_ie
1311
1312         @staticmethod
1313         def suitable(url):
1314                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1315
1316         def report_download_page(self, playlist_id, pagenum):
1317                 """Report attempt to download playlist page with given number."""
1318                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1319
1320         def _real_initialize(self):
1321                 self._youtube_ie.initialize()
1322
1323         def _real_extract(self, url):
1324                 # Extract playlist id
1325                 mobj = re.match(self._VALID_URL, url)
1326                 if mobj is None:
1327                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1328                         return
1329
1330                 # Download playlist pages
1331                 playlist_id = mobj.group(1)
1332                 video_ids = []
1333                 pagenum = 1
1334
1335                 while True:
1336                         self.report_download_page(playlist_id, pagenum)
1337                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1338                         try:
1339                                 page = urllib2.urlopen(request).read()
1340                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1341                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1342                                 return
1343
1344                         # Extract video identifiers
1345                         ids_in_page = []
1346                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1347                                 if mobj.group(1) not in ids_in_page:
1348                                         ids_in_page.append(mobj.group(1))
1349                         video_ids.extend(ids_in_page)
1350
1351                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1352                                 break
1353                         pagenum = pagenum + 1
1354
1355                 for id in video_ids:
1356                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1357                 return
1358
1359 class YoutubeUserIE(InfoExtractor):
1360         """Information Extractor for YouTube users."""
1361
1362         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1363         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1364         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1365         _youtube_ie = None
1366
1367         def __init__(self, youtube_ie, downloader=None):
1368                 InfoExtractor.__init__(self, downloader)
1369                 self._youtube_ie = youtube_ie
1370
1371         @staticmethod
1372         def suitable(url):
1373                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1374
1375         def report_download_page(self, username):
1376                 """Report attempt to download user page."""
1377                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1378
1379         def _real_initialize(self):
1380                 self._youtube_ie.initialize()
1381
1382         def _real_extract(self, url):
1383                 # Extract username
1384                 mobj = re.match(self._VALID_URL, url)
1385                 if mobj is None:
1386                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1387                         return
1388
1389                 # Download user page
1390                 username = mobj.group(1)
1391                 video_ids = []
1392                 pagenum = 1
1393
1394                 self.report_download_page(username)
1395                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1396                 try:
1397                         page = urllib2.urlopen(request).read()
1398                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1399                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1400                         return
1401
1402                 # Extract video identifiers
1403                 ids_in_page = []
1404
1405                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1406                         if mobj.group(1) not in ids_in_page:
1407                                 ids_in_page.append(mobj.group(1))
1408                 video_ids.extend(ids_in_page)
1409
1410                 for id in video_ids:
1411                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1412                 return
1413
1414 class PostProcessor(object):
1415         """Post Processor class.
1416
1417         PostProcessor objects can be added to downloaders with their
1418         add_post_processor() method. When the downloader has finished a
1419         successful download, it will take its internal chain of PostProcessors
1420         and start calling the run() method on each one of them, first with
1421         an initial argument and then with the returned value of the previous
1422         PostProcessor.
1423
1424         The chain will be stopped if one of them ever returns None or the end
1425         of the chain is reached.
1426
1427         PostProcessor objects follow a "mutual registration" process similar
1428         to InfoExtractor objects.
1429         """
1430
1431         _downloader = None
1432
1433         def __init__(self, downloader=None):
1434                 self._downloader = downloader
1435
1436         def set_downloader(self, downloader):
1437                 """Sets the downloader for this PP."""
1438                 self._downloader = downloader
1439
1440         def run(self, information):
1441                 """Run the PostProcessor.
1442
1443                 The "information" argument is a dictionary like the ones
1444                 composed by InfoExtractors. The only difference is that this
1445                 one has an extra field called "filepath" that points to the
1446                 downloaded file.
1447
1448                 When this method returns None, the postprocessing chain is
1449                 stopped. However, this method may return an information
1450                 dictionary that will be passed to the next postprocessing
1451                 object in the chain. It can be the one it received after
1452                 changing some fields.
1453
1454                 In addition, this method may raise a PostProcessingError
1455                 exception that will be taken into account by the downloader
1456                 it was called from.
1457                 """
1458                 return information # by default, do nothing
1459
1460 ### MAIN PROGRAM ###
1461 if __name__ == '__main__':
1462         try:
1463                 # Modules needed only when running the main program
1464                 import getpass
1465                 import optparse
1466
1467                 # Function to update the program file with the latest version from bitbucket.org
1468                 def update_self(downloader, filename):
1469                         # Note: downloader only used for options
1470                         if not os.access (filename, os.W_OK):
1471                                 sys.exit('ERROR: no write permissions on %s' % filename)
1472
1473                         downloader.to_stdout('Updating to latest stable version...')
1474                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1475                         latest_version = urllib.urlopen(latest_url).read().strip()
1476                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1477                         newcontent = urllib.urlopen(prog_url).read()
1478                         stream = open(filename, 'w')
1479                         stream.write(newcontent)
1480                         stream.close()
1481                         downloader.to_stdout('Updated to version %s' % latest_version)
1482
1483                 # General configuration
1484                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1485                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1486                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1487
1488                 # Parse command line
1489                 parser = optparse.OptionParser(
1490                         usage='Usage: %prog [options] url...',
1491                         version='INTERNAL',
1492                         conflict_handler='resolve',
1493                 )
1494
1495                 parser.add_option('-h', '--help',
1496                                 action='help', help='print this help text and exit')
1497                 parser.add_option('-v', '--version',
1498                                 action='version', help='print program version and exit')
1499                 parser.add_option('-U', '--update',
1500                                 action='store_true', dest='update_self', help='update this program to latest stable version')
1501                 parser.add_option('-i', '--ignore-errors',
1502                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1503                 parser.add_option('-r', '--rate-limit',
1504                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1505
1506                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1507                 authentication.add_option('-u', '--username',
1508                                 dest='username', metavar='UN', help='account username')
1509                 authentication.add_option('-p', '--password',
1510                                 dest='password', metavar='PW', help='account password')
1511                 authentication.add_option('-n', '--netrc',
1512                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1513                 parser.add_option_group(authentication)
1514
1515                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1516                 video_format.add_option('-f', '--format',
1517                                 action='store', dest='format', metavar='FMT', help='video format code')
1518                 video_format.add_option('-b', '--best-quality',
1519                                 action='store_const', dest='format', help='download the best quality video possible', const='0')
1520                 video_format.add_option('-m', '--mobile-version',
1521                                 action='store_const', dest='format', help='alias for -f 17', const='17')
1522                 video_format.add_option('-d', '--high-def',
1523                                 action='store_const', dest='format', help='alias for -f 22', const='22')
1524                 parser.add_option_group(video_format)
1525
1526                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1527                 verbosity.add_option('-q', '--quiet',
1528                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1529                 verbosity.add_option('-s', '--simulate',
1530                                 action='store_true', dest='simulate', help='do not download video', default=False)
1531                 verbosity.add_option('-g', '--get-url',
1532                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1533                 verbosity.add_option('-e', '--get-title',
1534                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1535                 parser.add_option_group(verbosity)
1536
1537                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1538                 filesystem.add_option('-t', '--title',
1539                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
1540                 filesystem.add_option('-l', '--literal',
1541                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1542                 filesystem.add_option('-o', '--output',
1543                                 dest='outtmpl', metavar='TPL', help='output filename template')
1544                 filesystem.add_option('-a', '--batch-file',
1545                                 dest='batchfile', metavar='F', help='file containing URLs to download')
1546                 filesystem.add_option('-w', '--no-overwrites',
1547                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1548                 filesystem.add_option('-c', '--continue',
1549                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
1550                 parser.add_option_group(filesystem)
1551
1552                 (opts, args) = parser.parse_args()
1553
1554                 # Batch file verification
1555                 batchurls = []
1556                 if opts.batchfile is not None:
1557                         try:
1558                                 batchurls = open(opts.batchfile, 'r').readlines()
1559                                 batchurls = [x.strip() for x in batchurls]
1560                                 batchurls = [x for x in batchurls if len(x) > 0]
1561                         except IOError:
1562                                 sys.exit(u'ERROR: batch file could not be read')
1563                 all_urls = batchurls + args
1564
1565                 # Make sure all URLs are in our preferred encoding
1566                 for i in range(0, len(all_urls)):
1567                         all_urls[i] = unicode(all_urls[i], preferredencoding())
1568
1569                 # Conflicting, missing and erroneous options
1570                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1571                         parser.error(u'using .netrc conflicts with giving username/password')
1572                 if opts.password is not None and opts.username is None:
1573                         parser.error(u'account username missing')
1574                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1575                         parser.error(u'using output template conflicts with using title or literal title')
1576                 if opts.usetitle and opts.useliteral:
1577                         parser.error(u'using title conflicts with using literal title')
1578                 if opts.username is not None and opts.password is None:
1579                         opts.password = getpass.getpass(u'Type account password and press return:')
1580                 if opts.ratelimit is not None:
1581                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1582                         if numeric_limit is None:
1583                                 parser.error(u'invalid rate limit specified')
1584                         opts.ratelimit = numeric_limit
1585
1586                 # Information extractors
1587                 youtube_ie = YoutubeIE()
1588                 metacafe_ie = MetacafeIE(youtube_ie)
1589                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1590                 youtube_user_ie = YoutubeUserIE(youtube_ie)
1591                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1592                 google_ie = GoogleIE()
1593                 photobucket_ie = PhotobucketIE()
1594                 generic_ie = GenericIE()
1595
1596                 # File downloader
1597                 fd = FileDownloader({
1598                         'usenetrc': opts.usenetrc,
1599                         'username': opts.username,
1600                         'password': opts.password,
1601                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1602                         'forceurl': opts.geturl,
1603                         'forcetitle': opts.gettitle,
1604                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1605                         'format': opts.format,
1606                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
1607                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1608                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1609                                 or u'%(id)s.%(ext)s'),
1610                         'ignoreerrors': opts.ignoreerrors,
1611                         'ratelimit': opts.ratelimit,
1612                         'nooverwrites': opts.nooverwrites,
1613                         'continuedl': opts.continue_dl,
1614                         })
1615                 fd.add_info_extractor(youtube_search_ie)
1616                 fd.add_info_extractor(youtube_pl_ie)
1617                 fd.add_info_extractor(youtube_user_ie)
1618                 fd.add_info_extractor(metacafe_ie)
1619                 fd.add_info_extractor(youtube_ie)
1620                 fd.add_info_extractor(google_ie)
1621                 fd.add_info_extractor(photobucket_ie)
1622
1623                 # This must come last since it's the
1624                 # fallback if none of the others work
1625                 fd.add_info_extractor(generic_ie)
1626
1627                 # Update version
1628                 if opts.update_self:
1629                         update_self(fd, sys.argv[0])
1630
1631                 # Maybe do nothing
1632                 if len(all_urls) < 1:
1633                         if not opts.update_self:
1634                                 parser.error(u'you must provide at least one URL')
1635                         else:
1636                                 sys.exit()
1637                 retcode = fd.download(all_urls)
1638                 sys.exit(retcode)
1639
1640         except DownloadError:
1641                 sys.exit(1)
1642         except SameFileError:
1643                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1644         except KeyboardInterrupt:
1645                 sys.exit(u'\nERROR: Interrupted by user')