]> gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube-dl
64f62e8a13d4b1cbf0266cdd45a72ab79da66212
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request; mimics a contemporary Firefox so
# servers reply with the same pages a regular browser would get.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in a "simplified" title, as a unicode string.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Make sure the reported encoding actually works before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		# Catch Exception instead of a bare except so that
		# KeyboardInterrupt/SystemExit are not swallowed; fall back to UTF-8.
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
	"""Download Error exception.
	
	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message. Raised by FileDownloader.trouble() when the
	"ignoreerrors" option is not set.
	"""
	pass
117
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	Raised by FileDownloader.download() for a fixed output template with
	more than one URL.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task. It is caught by
	FileDownloader.process_info(), which reports it via trouble().
	"""
	pass
133
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		"""Store the actual (downloaded) and announced (expected) sizes."""
		self.downloaded = downloaded
		self.expected = expected
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:	Username for authentication purposes.
	password:	Password for authentication purposes.
	usenetrc:	Use netrc for authentication instead.
	quiet:		Do not print messages to stdout.
	forceurl:	Force printing final URL.
	forcetitle:	Force printing title.
	simulate:	Do not download the video files.
	format:		Video format code.
	outtmpl:	Template for output names.
	ignoreerrors:	Do not stop on download errors.
	ratelimit:	Download speed limit, in bytes/sec.
	nooverwrites:	Prevent overwriting files.
	continuedl:	Try to continue downloads if possible.
	noprogress:	Do not print the progress bar.
	"""

	# Class-level defaults; the real values are set per instance in __init__().
	params = None			# Options dictionary (see docstring above)
	_ies = []			# Registered InfoExtractors, in priority order
	_pps = []			# Registered PostProcessors (run as a chain)
	_download_retcode = None	# Process exit code returned by download()
	_num_downloads = None		# Download counter, used for the 'ord' template field
205
206         def __init__(self, params):
207                 """Create a FileDownloader object with the given options."""
208                 self._ies = []
209                 self._pps = []
210                 self._download_retcode = 0
211                 self._num_downloads = 0
212                 self.params = params
213         
214         @staticmethod
215         def pmkdir(filename):
216                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
217                 components = filename.split(os.sep)
218                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
219                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
220                 for dir in aggregate:
221                         if not os.path.exists(dir):
222                                 os.mkdir(dir)
223         
224         @staticmethod
225         def format_bytes(bytes):
226                 if bytes is None:
227                         return 'N/A'
228                 if type(bytes) is str:
229                         bytes = float(bytes)
230                 if bytes == 0.0:
231                         exponent = 0
232                 else:
233                         exponent = long(math.log(bytes, 1024.0))
234                 suffix = 'bkMGTPEZY'[exponent]
235                 converted = float(bytes) / float(1024**exponent)
236                 return '%.2f%s' % (converted, suffix)
237
238         @staticmethod
239         def calc_percent(byte_counter, data_len):
240                 if data_len is None:
241                         return '---.-%'
242                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
243
244         @staticmethod
245         def calc_eta(start, now, total, current):
246                 if total is None:
247                         return '--:--'
248                 dif = now - start
249                 if current == 0 or dif < 0.001: # One millisecond
250                         return '--:--'
251                 rate = float(current) / dif
252                 eta = long((float(total) - float(current)) / rate)
253                 (eta_mins, eta_secs) = divmod(eta, 60)
254                 if eta_mins > 99:
255                         return '--:--'
256                 return '%02d:%02d' % (eta_mins, eta_secs)
257
258         @staticmethod
259         def calc_speed(start, now, bytes):
260                 dif = now - start
261                 if bytes == 0 or dif < 0.001: # One millisecond
262                         return '%10s' % '---b/s'
263                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
264
265         @staticmethod
266         def best_block_size(elapsed_time, bytes):
267                 new_min = max(bytes / 2.0, 1.0)
268                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
269                 if elapsed_time < 0.001:
270                         return long(new_max)
271                 rate = bytes / elapsed_time
272                 if rate > new_max:
273                         return long(new_max)
274                 if rate < new_min:
275                         return long(new_min)
276                 return long(rate)
277
278         @staticmethod
279         def parse_bytes(bytestr):
280                 """Parse a string indicating a byte quantity into a long integer."""
281                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
282                 if matchobj is None:
283                         return None
284                 number = float(matchobj.group(1))
285                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
286                 return long(round(number * multiplier))
287
288         @staticmethod
289         def verify_url(url):
290                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
291                 request = urllib2.Request(url, None, std_headers)
292                 data = urllib2.urlopen(request)
293                 data.read(1)
294                 url = data.geturl()
295                 data.close()
296                 return url
297
298         def add_info_extractor(self, ie):
299                 """Add an InfoExtractor object to the end of the list."""
300                 self._ies.append(ie)
301                 ie.set_downloader(self)
302         
303         def add_post_processor(self, pp):
304                 """Add a PostProcessor object to the end of the chain."""
305                 self._pps.append(pp)
306                 pp.set_downloader(self)
307         
	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		The message itself carries the newline (unless skip_eol is set);
		the trailing comma on the print statement suppresses print's own
		newline, which is what lets progress lines be rewritten with \r.
		"""
		try:
			if not self.params.get('quiet', False):
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			# Swallow encoding problems only when the caller asked for it.
			if not ignore_encoding_errors:
				raise
317         
	def to_stderr(self, message):
		"""Print message to stderr, encoded for the current locale."""
		# Python 2 "print >>file" form writes to the given stream.
		print >>sys.stderr, message.encode(preferredencoding())
321         
322         def fixed_template(self):
323                 """Checks if the output template is fixed."""
324                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
325
326         def trouble(self, message=None):
327                 """Determine action to take when a download problem appears.
328
329                 Depending on if the downloader has been configured to ignore
330                 download errors or not, this method may throw an exception or
331                 not when errors are found, after printing the message.
332                 """
333                 if message is not None:
334                         self.to_stderr(message)
335                 if not self.params.get('ignoreerrors', False):
336                         raise DownloadError(message)
337                 self._download_retcode = 1
338
339         def slow_down(self, start_time, byte_counter):
340                 """Sleep if the download speed is over the rate limit."""
341                 rate_limit = self.params.get('ratelimit', None)
342                 if rate_limit is None or byte_counter == 0:
343                         return
344                 now = time.time()
345                 elapsed = now - start_time
346                 if elapsed <= 0.0:
347                         return
348                 speed = float(byte_counter) / elapsed
349                 if speed > rate_limit:
350                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
351
	def report_destination(self, filename):
		"""Report destination filename.

		Encoding errors are ignored: an unprintable filename must not
		abort the download itself.
		"""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
355         
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress, rewriting the same line via '\r'."""
		if self.params.get('noprogress', False):
			return
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
362
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
366         
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) file name.
			self.to_stdout(u'[download] The file has already been downloaded')
373         
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')
377         
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# The progress line used skip_eol; emit the final newline.
			self.to_stdout(u'')
384
385         def process_info(self, info_dict):
386                 """Process a single dictionary returned by an InfoExtractor."""
387                 # Do nothing else if in simulate mode
388                 if self.params.get('simulate', False):
389                         # Verify URL if it's an HTTP one
390                         if info_dict['url'].startswith('http'):
391                                 try:
392                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
393                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
394                                         raise UnavailableFormatError
395
396                         # Forced printings
397                         if self.params.get('forcetitle', False):
398                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
399                         if self.params.get('forceurl', False):
400                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
401                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
402                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
403                         if self.params.get('forcedescription', False) and 'description' in info_dict:
404                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
405
406                         return
407                         
408                 try:
409                         template_dict = dict(info_dict)
410                         template_dict['epoch'] = unicode(long(time.time()))
411                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
412                         filename = self.params['outtmpl'] % template_dict
413                 except (ValueError, KeyError), err:
414                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
415                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
416                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
417                         return
418
419                 try:
420                         self.pmkdir(filename)
421                 except (OSError, IOError), err:
422                         self.trouble('ERROR: unable to create directories: %s' % str(err))
423                         return
424
425                 try:
426                         success = self._do_download(filename, info_dict['url'].encode('utf-8'))
427                 except (OSError, IOError), err:
428                         raise UnavailableFormatError
429                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
430                         self.trouble('ERROR: unable to download video data: %s' % str(err))
431                         return
432                 except (ContentTooShortError, ), err:
433                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
434                         return
435
436                 if success:
437                         try:
438                                 self.post_process(filename, info_dict)
439                         except (PostProcessingError), err:
440                                 self.trouble('ERROR: postprocessing: %s' % str(err))
441                                 return
442
443         def download(self, url_list):
444                 """Download a given list of URLs."""
445                 if len(url_list) > 1 and self.fixed_template():
446                         raise SameFileError(self.params['outtmpl'])
447
448                 for url in url_list:
449                         suitable_found = False
450                         for ie in self._ies:
451                                 # Go to next InfoExtractor if not suitable
452                                 if not ie.suitable(url):
453                                         continue
454
455                                 # Suitable InfoExtractor found
456                                 suitable_found = True
457
458                                 # Extract information from URL and process it
459                                 ie.extract(url)
460
461                                 # Suitable InfoExtractor had been found; go to next URL
462                                 break
463
464                         if not suitable_found:
465                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
466
467                 return self._download_retcode
468
469         def post_process(self, filename, ie_info):
470                 """Run the postprocessing chain on the given file."""
471                 info = dict(ie_info)
472                 info['filepath'] = filename
473                 for pp in self._pps:
474                         info = pp.run(info)
475                         if info is None:
476                                 break
477         
	def _download_with_rtmpdump(self, filename, url):
		"""Download an RTMP stream by shelling out to the rtmpdump tool.

		Returns True on success, False on failure (after reporting the
		problem via trouble()).
		"""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
		# Indexing a two-element list with a boolean appends the resume
		# flags ('-e -k 1') only when the "continuedl" option is set.
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
			time.sleep(2.0) # This seems to be needed
			# Retry with resuming ('-e'); '-k 1' is added only after
			# an exit code of 1, same boolean-indexing trick as above.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
			return False
503
	def _do_download(self, filename, url):
		"""Download url to filename over HTTP (or delegate to rtmpdump).

		Supports resuming partial files via the Range header when the
		"continuedl" option is set. Returns True on success, False if
		the output file could not be opened; network errors propagate
		to the caller (process_info handles them).
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url)

		stream = None
		open_mode = 'wb'
		# basic_request is kept without the Range header as a fallback
		# for servers that reject the resume request.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		# Establish connection
		try:
			data = urllib2.urlopen(request)
		except (urllib2.HTTPError, ), err:
			if err.code != 416: #  416 is 'Requested range not satisfiable'
				raise
			# Unable to resume; retry without the Range header.
			data = urllib2.urlopen(basic_request)
			content_length = data.info()['Content-Length']

			if content_length is not None and long(content_length) == resume_len:
				# Because the file had already been fully downloaded
				self.report_file_already_downloaded(filename)
				return True
			else:
				# Because the server didn't let us
				self.report_unable_to_resume()
				open_mode = 'wb'

		# NOTE: data_len stays a *string* (or None) as taken from the
		# headers; hence the str() comparison at the end of this method.
		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time, so a failed connection does not
			# leave an empty file behind.
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
					self._num_downloads += 1
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			stream.write(data_block)
			# Adapt the next read size to the measured throughput.
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# String comparison on purpose: data_len is the raw header value.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
585
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		# Base class handles nothing; subclasses override this.
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc), only once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
655
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Accepts bare video IDs as well as /v/ and watch-page URLs.
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# URL used to force the site language to English.
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	_available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
	# File extension per format code; codes missing here presumably get
	# a default extension elsewhere — not visible in this chunk.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
	}
672
673         @staticmethod
674         def suitable(url):
675                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
676
677         def report_lang(self):
678                 """Report attempt to set language."""
679                 self._downloader.to_stdout(u'[youtube] Setting language')
680
681         def report_login(self):
682                 """Report attempt to log in."""
683                 self._downloader.to_stdout(u'[youtube] Logging in')
684         
685         def report_age_confirmation(self):
686                 """Report attempt to confirm age."""
687                 self._downloader.to_stdout(u'[youtube] Confirming age')
688         
689         def report_video_info_webpage_download(self, video_id):
690                 """Report attempt to download video info webpage."""
691                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
692         
693         def report_information_extraction(self, video_id):
694                 """Report attempt to extract video information."""
695                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
696         
697         def report_unavailable_format(self, video_id, format):
698                 """Report extracted video URL."""
699                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
700         
701         def report_rtmp_download(self):
702                 """Indicate the download will use the RTMP protocol."""
703                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
704         
705         def _real_initialize(self):
706                 if self._downloader is None:
707                         return
708
709                 username = None
710                 password = None
711                 downloader_params = self._downloader.params
712
713                 # Attempt to use provided username and password or .netrc data
714                 if downloader_params.get('username', None) is not None:
715                         username = downloader_params['username']
716                         password = downloader_params['password']
717                 elif downloader_params.get('usenetrc', False):
718                         try:
719                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
720                                 if info is not None:
721                                         username = info[0]
722                                         password = info[2]
723                                 else:
724                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
725                         except (IOError, netrc.NetrcParseError), err:
726                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
727                                 return
728
729                 # Set language
730                 request = urllib2.Request(self._LANG_URL, None, std_headers)
731                 try:
732                         self.report_lang()
733                         urllib2.urlopen(request).read()
734                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
735                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
736                         return
737
738                 # No authentication to be performed
739                 if username is None:
740                         return
741
742                 # Log in
743                 login_form = {
744                                 'current_form': 'loginForm',
745                                 'next':         '/',
746                                 'action_login': 'Log In',
747                                 'username':     username,
748                                 'password':     password,
749                                 }
750                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
751                 try:
752                         self.report_login()
753                         login_results = urllib2.urlopen(request).read()
754                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
755                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
756                                 return
757                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
758                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
759                         return
760         
761                 # Confirm age
762                 age_form = {
763                                 'next_url':             '/',
764                                 'action_confirm':       'Confirm',
765                                 }
766                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
767                 try:
768                         self.report_age_confirmation()
769                         age_results = urllib2.urlopen(request).read()
770                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
772                         return
773
774         def _real_extract(self, url):
775                 # Extract video id from URL
776                 mobj = re.match(self._VALID_URL, url)
777                 if mobj is None:
778                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
779                         return
780                 video_id = mobj.group(2)
781
782                 # Downloader parameters
783                 best_quality = False
784                 all_formats = False
785                 format_param = None
786                 quality_index = 0
787                 if self._downloader is not None:
788                         params = self._downloader.params
789                         format_param = params.get('format', None)
790                         if format_param == '0':
791                                 format_param = self._available_formats[quality_index]
792                                 best_quality = True
793                         elif format_param == '-1':
794                                 format_param = self._available_formats[quality_index]
795                                 all_formats = True
796
797                 while True:
798                         # Extension
799                         video_extension = self._video_extensions.get(format_param, 'flv')
800
801                         # Get video info
802                         self.report_video_info_webpage_download(video_id)
803                         for el_type in ['embedded', 'detailpage', 'vevo']:
804                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s&el=%s&ps=default&eurl=&gl=US&hl=en'
805                                                    % (video_id, el_type))
806                                 request = urllib2.Request(video_info_url, None, std_headers)
807                                 try:
808                                         video_info_webpage = urllib2.urlopen(request).read()
809                                         video_info = parse_qs(video_info_webpage)
810                                         if 'token' in video_info:
811                                                 break
812                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
813                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
814                                         return
815                         self.report_information_extraction(video_id)
816
817                         # "t" param
818                         if 'token' not in video_info:
819                                 # Attempt to see if YouTube has issued an error message
820                                 if 'reason' not in video_info:
821                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
822                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
823                                         stream.write(video_info_webpage)
824                                         stream.close()
825                                 else:
826                                         reason = urllib.unquote_plus(video_info['reason'][0])
827                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
828                                 return
829                         token = urllib.unquote_plus(video_info['token'][0])
830                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
831                         if format_param is not None:
832                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
833
834                         # Check possible RTMP download
835                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
836                                 self.report_rtmp_download()
837                                 video_real_url = video_info['conn'][0]
838
839                         # uploader
840                         if 'author' not in video_info:
841                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
842                                 return
843                         video_uploader = urllib.unquote_plus(video_info['author'][0])
844
845                         # title
846                         if 'title' not in video_info:
847                                 self._downloader.trouble(u'ERROR: unable to extract video title')
848                                 return
849                         video_title = urllib.unquote_plus(video_info['title'][0])
850                         video_title = video_title.decode('utf-8')
851                         video_title = sanitize_title(video_title)
852
853                         # simplified title
854                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
855                         simple_title = simple_title.strip(ur'_')
856
857                         # thumbnail image
858                         if 'thumbnail_url' not in video_info:
859                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
860                                 video_thumbnail = ''
861                         else:   # don't panic if we can't find it
862                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
863
864                         # get video description
865                         video_description = 'No description available.'    # we need something to pass to self._downloader
866                         # this requires an additional HTTP request and a little
867                         # more time, so don't do it unless absolutely necessary
868                         if self._downloader.params.get('forcedescription', False):
869                                 video_page_url = 'http://www.youtube.com/watch?v=' + video_id
870                                 request = urllib2.Request(video_page_url, None, std_headers)
871                                 try:
872                                         video_page_webpage = urllib2.urlopen(request).read()
873                                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_page_webpage)
874                                         if mobj is not None:
875                                                 video_description = mobj.group(1)
876                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
877                                         pass    # don't panic if we can't find it
878
879                         try:
880                                 # Process video information
881                                 self._downloader.process_info({
882                                         'id':           video_id.decode('utf-8'),
883                                         'url':          video_real_url.decode('utf-8'),
884                                         'uploader':     video_uploader.decode('utf-8'),
885                                         'title':        video_title,
886                                         'stitle':       simple_title,
887                                         'ext':          video_extension.decode('utf-8'),
888                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
889                                         'thumbnail':    video_thumbnail.decode('utf-8'),
890                                         'description':  video_description.decode('utf-8'),
891                                 })
892
893                                 if all_formats:
894                                         if quality_index == len(self._available_formats) - 1:
895                                                 # None left to get
896                                                 return
897                                         else:
898                                                 quality_index += 1
899                                                 format_param = self._available_formats[quality_index]
900                                                 if format_param == None:
901                                                         return
902                                                 continue
903
904                                 return
905
906                         except UnavailableFormatError, err:
907                                 if best_quality or all_formats:
908                                         if quality_index == len(self._available_formats) - 1:
909                                                 # I don't ever expect this to happen
910                                                 if not all_formats:
911                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
912                                                 return
913                                         else:
914                                                 self.report_unavailable_format(video_id, format_param)
915                                                 quality_index += 1
916                                                 format_param = self._available_formats[quality_index]
917                                                 if format_param == None:
918                                                         return
919                                                 continue
920                                 else: 
921                                         self._downloader.trouble('ERROR: format not available for video')
922                                         return
923
924
925 class MetacafeIE(InfoExtractor):
926         """Information Extractor for metacafe.com."""
927
928         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
929         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
930         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
931         _youtube_ie = None
932
933         def __init__(self, youtube_ie, downloader=None):
934                 InfoExtractor.__init__(self, downloader)
935                 self._youtube_ie = youtube_ie
936
937         @staticmethod
938         def suitable(url):
939                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
940
941         def report_disclaimer(self):
942                 """Report disclaimer retrieval."""
943                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
944
945         def report_age_confirmation(self):
946                 """Report attempt to confirm age."""
947                 self._downloader.to_stdout(u'[metacafe] Confirming age')
948         
949         def report_download_webpage(self, video_id):
950                 """Report webpage download."""
951                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
952         
953         def report_extraction(self, video_id):
954                 """Report information extraction."""
955                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
956
957         def _real_initialize(self):
958                 # Retrieve disclaimer
959                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
960                 try:
961                         self.report_disclaimer()
962                         disclaimer = urllib2.urlopen(request).read()
963                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
964                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
965                         return
966
967                 # Confirm age
968                 disclaimer_form = {
969                         'filters': '0',
970                         'submit': "Continue - I'm over 18",
971                         }
972                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
973                 try:
974                         self.report_age_confirmation()
975                         disclaimer = urllib2.urlopen(request).read()
976                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
977                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
978                         return
979         
980         def _real_extract(self, url):
981                 # Extract id and simplified title from URL
982                 mobj = re.match(self._VALID_URL, url)
983                 if mobj is None:
984                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
985                         return
986
987                 video_id = mobj.group(1)
988
989                 # Check if video comes from YouTube
990                 mobj2 = re.match(r'^yt-(.*)$', video_id)
991                 if mobj2 is not None:
992                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
993                         return
994
995                 simple_title = mobj.group(2).decode('utf-8')
996                 video_extension = 'flv'
997
998                 # Retrieve video webpage to extract further information
999                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1000                 try:
1001                         self.report_download_webpage(video_id)
1002                         webpage = urllib2.urlopen(request).read()
1003                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1005                         return
1006
1007                 # Extract URL, uploader and title from webpage
1008                 self.report_extraction(video_id)
1009                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1010                 if mobj is None:
1011                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1012                         return
1013                 mediaURL = urllib.unquote(mobj.group(1))
1014
1015                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1016                 #if mobj is None:
1017                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1018                 #       return
1019                 #gdaKey = mobj.group(1)
1020                 #
1021                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1022
1023                 video_url = mediaURL
1024
1025                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1026                 if mobj is None:
1027                         self._downloader.trouble(u'ERROR: unable to extract title')
1028                         return
1029                 video_title = mobj.group(1).decode('utf-8')
1030                 video_title = sanitize_title(video_title)
1031
1032                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1033                 if mobj is None:
1034                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1035                         return
1036                 video_uploader = mobj.group(1)
1037
1038                 try:
1039                         # Process video information
1040                         self._downloader.process_info({
1041                                 'id':           video_id.decode('utf-8'),
1042                                 'url':          video_url.decode('utf-8'),
1043                                 'uploader':     video_uploader.decode('utf-8'),
1044                                 'title':        video_title,
1045                                 'stitle':       simple_title,
1046                                 'ext':          video_extension.decode('utf-8'),
1047                                 'format':       u'NA',
1048                         })
1049                 except UnavailableFormatError:
1050                         self._downloader.trouble(u'ERROR: format not available for video')
1051
1052
1053 class GoogleIE(InfoExtractor):
1054         """Information extractor for video.google.com."""
1055
1056         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1057
1058         def __init__(self, downloader=None):
1059                 InfoExtractor.__init__(self, downloader)
1060
1061         @staticmethod
1062         def suitable(url):
1063                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1064
1065         def report_download_webpage(self, video_id):
1066                 """Report webpage download."""
1067                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1068
1069         def report_extraction(self, video_id):
1070                 """Report information extraction."""
1071                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1072
1073         def _real_initialize(self):
1074                 return
1075
1076         def _real_extract(self, url):
1077                 # Extract id from URL
1078                 mobj = re.match(self._VALID_URL, url)
1079                 if mobj is None:
1080                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1081                         return
1082
1083                 video_id = mobj.group(1)
1084
1085                 video_extension = 'mp4'
1086
1087                 # Retrieve video webpage to extract further information
1088                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1089                 try:
1090                         self.report_download_webpage(video_id)
1091                         webpage = urllib2.urlopen(request).read()
1092                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1093                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1094                         return
1095
1096                 # Extract URL, uploader, and title from webpage
1097                 self.report_extraction(video_id)
1098                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1099                 if mobj is None:
1100                         video_extension = 'flv'
1101                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1102                 if mobj is None:
1103                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1104                         return
1105                 mediaURL = urllib.unquote(mobj.group(1))
1106                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1107                 mediaURL = mediaURL.replace('\\x26', '\x26')
1108
1109                 video_url = mediaURL
1110
1111                 mobj = re.search(r'<title>(.*)</title>', webpage)
1112                 if mobj is None:
1113                         self._downloader.trouble(u'ERROR: unable to extract title')
1114                         return
1115                 video_title = mobj.group(1).decode('utf-8')
1116                 video_title = sanitize_title(video_title)
1117                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1118
1119                 # Extract video description
1120                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1121                 if mobj is None:
1122                         self._downloader.trouble(u'ERROR: unable to extract video description')
1123                         return
1124                 video_description = mobj.group(1).decode('utf-8')
1125                 if not video_description:
1126                         video_description = 'No description available.'
1127
1128                 # Extract video thumbnail
1129                 if self._downloader.params.get('forcethumbnail', False):
1130                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1131                         try:
1132                                 webpage = urllib2.urlopen(request).read()
1133                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1134                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1135                                 return
1136                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1137                         if mobj is None:
1138                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1139                                 return
1140                         video_thumbnail = mobj.group(1)
1141                 else:   # we need something to pass to process_info
1142                         video_thumbnail = ''
1143
1144
1145                 try:
1146                         # Process video information
1147                         self._downloader.process_info({
1148                                 'id':           video_id.decode('utf-8'),
1149                                 'url':          video_url.decode('utf-8'),
1150                                 'uploader':     u'NA',
1151                                 'title':        video_title,
1152                                 'stitle':       simple_title,
1153                                 'ext':          video_extension.decode('utf-8'),
1154                                 'format':       u'NA',
1155                         })
1156                 except UnavailableFormatError:
1157                         self._downloader.trouble(u'ERROR: format not available for video')
1158
1159
1160 class PhotobucketIE(InfoExtractor):
1161         """Information extractor for photobucket.com."""
1162
1163         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1164
1165         def __init__(self, downloader=None):
1166                 InfoExtractor.__init__(self, downloader)
1167
1168         @staticmethod
1169         def suitable(url):
1170                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1171
1172         def report_download_webpage(self, video_id):
1173                 """Report webpage download."""
1174                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1175
1176         def report_extraction(self, video_id):
1177                 """Report information extraction."""
1178                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1179
1180         def _real_initialize(self):
1181                 return
1182
1183         def _real_extract(self, url):
1184                 # Extract id from URL
1185                 mobj = re.match(self._VALID_URL, url)
1186                 if mobj is None:
1187                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1188                         return
1189
1190                 video_id = mobj.group(1)
1191
1192                 video_extension = 'flv'
1193
1194                 # Retrieve video webpage to extract further information
1195                 request = urllib2.Request(url)
1196                 try:
1197                         self.report_download_webpage(video_id)
1198                         webpage = urllib2.urlopen(request).read()
1199                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1200                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1201                         return
1202
1203                 # Extract URL, uploader, and title from webpage
1204                 self.report_extraction(video_id)
1205                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1206                 if mobj is None:
1207                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1208                         return
1209                 mediaURL = urllib.unquote(mobj.group(1))
1210
1211                 video_url = mediaURL
1212
1213                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1214                 if mobj is None:
1215                         self._downloader.trouble(u'ERROR: unable to extract title')
1216                         return
1217                 video_title = mobj.group(1).decode('utf-8')
1218                 video_title = sanitize_title(video_title)
1219                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1220
1221                 video_uploader = mobj.group(2).decode('utf-8')
1222
1223                 try:
1224                         # Process video information
1225                         self._downloader.process_info({
1226                                 'id':           video_id.decode('utf-8'),
1227                                 'url':          video_url.decode('utf-8'),
1228                                 'uploader':     video_uploader,
1229                                 'title':        video_title,
1230                                 'stitle':       simple_title,
1231                                 'ext':          video_extension.decode('utf-8'),
1232                                 'format':       u'NA',
1233                         })
1234                 except UnavailableFormatError:
1235                         self._downloader.trouble(u'ERROR: format not available for video')
1236
1237
1238 class YahooIE(InfoExtractor):
1239         """Information extractor for video.yahoo.com."""
1240
1241         # _VALID_URL matches all Yahoo! Video URLs
1242         # _VPAGE_URL matches only the extractable '/watch/' URLs
1243         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1244         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1245
1246         def __init__(self, downloader=None):
1247                 InfoExtractor.__init__(self, downloader)
1248
1249         @staticmethod
1250         def suitable(url):
1251                 return (re.match(YahooIE._VALID_URL, url) is not None)
1252
1253         def report_download_webpage(self, video_id):
1254                 """Report webpage download."""
1255                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1256
1257         def report_extraction(self, video_id):
1258                 """Report information extraction."""
1259                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1260
1261         def _real_initialize(self):
1262                 return
1263
1264         def _real_extract(self, url):
1265                 # Extract ID from URL
1266                 mobj = re.match(self._VALID_URL, url)
1267                 if mobj is None:
1268                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1269                         return
1270
1271                 video_id = mobj.group(2)
1272                 video_extension = 'flv'
1273
1274                 # Rewrite valid but non-extractable URLs as
1275                 # extractable English language /watch/ URLs
1276                 if re.match(self._VPAGE_URL, url) is None:
1277                         request = urllib2.Request(url)
1278                         try:
1279                                 webpage = urllib2.urlopen(request).read()
1280                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1281                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1282                                 return
1283
1284                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1285                         if mobj is None:
1286                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1287                                 return
1288                         yahoo_id = mobj.group(1)
1289
1290                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1291                         if mobj is None:
1292                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1293                                 return
1294                         yahoo_vid = mobj.group(1)
1295
1296                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1297                         return self._real_extract(url)
1298
1299                 # Retrieve video webpage to extract further information
1300                 request = urllib2.Request(url)
1301                 try:
1302                         self.report_download_webpage(video_id)
1303                         webpage = urllib2.urlopen(request).read()
1304                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1305                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1306                         return
1307
1308                 # Extract uploader and title from webpage
1309                 self.report_extraction(video_id)
1310                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1311                 if mobj is None:
1312                         self._downloader.trouble(u'ERROR: unable to extract video title')
1313                         return
1314                 video_title = mobj.group(1).decode('utf-8')
1315                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1316
1317                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1318                 if mobj is None:
1319                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1320                         return
1321                 video_uploader = mobj.group(1).decode('utf-8')
1322
1323                 # Extract video thumbnail
1324                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1325                 if mobj is None:
1326                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1327                         return
1328                 video_thumbnail = mobj.group(1).decode('utf-8')
1329
1330                 # Extract video description
1331                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1332                 if mobj is None:
1333                         self._downloader.trouble(u'ERROR: unable to extract video description')
1334                         return
1335                 video_description = mobj.group(1).decode('utf-8')
1336                 if not video_description: video_description = 'No description available.'
1337
1338                 # Extract video height and width
1339                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1340                 if mobj is None:
1341                         self._downloader.trouble(u'ERROR: unable to extract video height')
1342                         return
1343                 yv_video_height = mobj.group(1)
1344
1345                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1346                 if mobj is None:
1347                         self._downloader.trouble(u'ERROR: unable to extract video width')
1348                         return
1349                 yv_video_width = mobj.group(1)
1350
1351                 # Retrieve video playlist to extract media URL
1352                 # I'm not completely sure what all these options are, but we
1353                 # seem to need most of them, otherwise the server sends a 401.
1354                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1355                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1356                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1357                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1358                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1359                 try:
1360                         self.report_download_webpage(video_id)
1361                         webpage = urllib2.urlopen(request).read()
1362                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1363                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1364                         return
1365
1366                 # Extract media URL from playlist XML
1367                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1368                 if mobj is None:
1369                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1370                         return
1371                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1372                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1373
1374                 try:
1375                         # Process video information
1376                         self._downloader.process_info({
1377                                 'id':           video_id.decode('utf-8'),
1378                                 'url':          video_url,
1379                                 'uploader':     video_uploader,
1380                                 'title':        video_title,
1381                                 'stitle':       simple_title,
1382                                 'ext':          video_extension.decode('utf-8'),
1383                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1384                                 'description':  video_description,
1385                                 'thumbnail':    video_thumbnail,
1386                                 'description':  video_description,
1387                         })
1388                 except UnavailableFormatError:
1389                         self._downloader.trouble(u'ERROR: format not available for video')
1390
1391
1392 class GenericIE(InfoExtractor):
1393         """Generic last-resort information extractor."""
1394
1395         def __init__(self, downloader=None):
1396                 InfoExtractor.__init__(self, downloader)
1397
1398         @staticmethod
1399         def suitable(url):
1400                 return True
1401
1402         def report_download_webpage(self, video_id):
1403                 """Report webpage download."""
1404                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1405                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1406
1407         def report_extraction(self, video_id):
1408                 """Report information extraction."""
1409                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1410
1411         def _real_initialize(self):
1412                 return
1413
1414         def _real_extract(self, url):
1415                 video_id = url.split('/')[-1]
1416                 request = urllib2.Request(url)
1417                 try:
1418                         self.report_download_webpage(video_id)
1419                         webpage = urllib2.urlopen(request).read()
1420                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1421                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1422                         return
1423                 except ValueError, err:
1424                         # since this is the last-resort InfoExtractor, if
1425                         # this error is thrown, it'll be thrown here
1426                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1427                         return
1428
1429                 # Start with something easy: JW Player in SWFObject
1430                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1431                 if mobj is None:
1432                         # Broaden the search a little bit
1433                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1434                 if mobj is None:
1435                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1436                         return
1437
1438                 # It's possible that one of the regexes
1439                 # matched, but returned an empty group:
1440                 if mobj.group(1) is None:
1441                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1442                         return
1443
1444                 video_url = urllib.unquote(mobj.group(1))
1445                 video_id  = os.path.basename(video_url)
1446
1447                 # here's a fun little line of code for you:
1448                 video_extension = os.path.splitext(video_id)[1][1:]
1449                 video_id        = os.path.splitext(video_id)[0]
1450
1451                 # it's tempting to parse this further, but you would
1452                 # have to take into account all the variations like
1453                 #   Video Title - Site Name
1454                 #   Site Name | Video Title
1455                 #   Video Title - Tagline | Site Name
1456                 # and so on and so forth; it's just not practical
1457                 mobj = re.search(r'<title>(.*)</title>', webpage)
1458                 if mobj is None:
1459                         self._downloader.trouble(u'ERROR: unable to extract title')
1460                         return
1461                 video_title = mobj.group(1).decode('utf-8')
1462                 video_title = sanitize_title(video_title)
1463                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1464
1465                 # video uploader is domain name
1466                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1467                 if mobj is None:
1468                         self._downloader.trouble(u'ERROR: unable to extract title')
1469                         return
1470                 video_uploader = mobj.group(1).decode('utf-8')
1471
1472                 try:
1473                         # Process video information
1474                         self._downloader.process_info({
1475                                 'id':           video_id.decode('utf-8'),
1476                                 'url':          video_url.decode('utf-8'),
1477                                 'uploader':     video_uploader,
1478                                 'title':        video_title,
1479                                 'stitle':       simple_title,
1480                                 'ext':          video_extension.decode('utf-8'),
1481                                 'format':       u'NA',
1482                         })
1483                 except UnavailableFormatError:
1484                         self._downloader.trouble(u'ERROR: format not available for video')
1485
1486
1487 class YoutubeSearchIE(InfoExtractor):
1488         """Information Extractor for YouTube search queries."""
1489         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1490         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1491         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1492         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1493         _youtube_ie = None
1494         _max_youtube_results = 1000
1495
1496         def __init__(self, youtube_ie, downloader=None):
1497                 InfoExtractor.__init__(self, downloader)
1498                 self._youtube_ie = youtube_ie
1499         
1500         @staticmethod
1501         def suitable(url):
1502                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1503
1504         def report_download_page(self, query, pagenum):
1505                 """Report attempt to download playlist page with given number."""
1506                 query = query.decode(preferredencoding())
1507                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1508
1509         def _real_initialize(self):
1510                 self._youtube_ie.initialize()
1511         
1512         def _real_extract(self, query):
1513                 mobj = re.match(self._VALID_QUERY, query)
1514                 if mobj is None:
1515                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1516                         return
1517
1518                 prefix, query = query.split(':')
1519                 prefix = prefix[8:]
1520                 query  = query.encode('utf-8')
1521                 if prefix == '':
1522                         self._download_n_results(query, 1)
1523                         return
1524                 elif prefix == 'all':
1525                         self._download_n_results(query, self._max_youtube_results)
1526                         return
1527                 else:
1528                         try:
1529                                 n = long(prefix)
1530                                 if n <= 0:
1531                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1532                                         return
1533                                 elif n > self._max_youtube_results:
1534                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1535                                         n = self._max_youtube_results
1536                                 self._download_n_results(query, n)
1537                                 return
1538                         except ValueError: # parsing prefix as integer fails
1539                                 self._download_n_results(query, 1)
1540                                 return
1541
1542         def _download_n_results(self, query, n):
1543                 """Downloads a specified number of results for a query"""
1544
1545                 video_ids = []
1546                 already_seen = set()
1547                 pagenum = 1
1548
1549                 while True:
1550                         self.report_download_page(query, pagenum)
1551                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1552                         request = urllib2.Request(result_url, None, std_headers)
1553                         try:
1554                                 page = urllib2.urlopen(request).read()
1555                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1556                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1557                                 return
1558
1559                         # Extract video identifiers
1560                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1561                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1562                                 if video_id not in already_seen:
1563                                         video_ids.append(video_id)
1564                                         already_seen.add(video_id)
1565                                         if len(video_ids) == n:
1566                                                 # Specified n videos reached
1567                                                 for id in video_ids:
1568                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1569                                                 return
1570
1571                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1572                                 for id in video_ids:
1573                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1574                                 return
1575
1576                         pagenum = pagenum + 1
1577
1578 class GoogleSearchIE(InfoExtractor):
1579         """Information Extractor for Google Video search queries."""
1580         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1581         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1582         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1583         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1584         _google_ie = None
1585         _max_google_results = 1000
1586
1587         def __init__(self, google_ie, downloader=None):
1588                 InfoExtractor.__init__(self, downloader)
1589                 self._google_ie = google_ie
1590         
1591         @staticmethod
1592         def suitable(url):
1593                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1594
1595         def report_download_page(self, query, pagenum):
1596                 """Report attempt to download playlist page with given number."""
1597                 query = query.decode(preferredencoding())
1598                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1599
1600         def _real_initialize(self):
1601                 self._google_ie.initialize()
1602         
1603         def _real_extract(self, query):
1604                 mobj = re.match(self._VALID_QUERY, query)
1605                 if mobj is None:
1606                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1607                         return
1608
1609                 prefix, query = query.split(':')
1610                 prefix = prefix[8:]
1611                 query  = query.encode('utf-8')
1612                 if prefix == '':
1613                         self._download_n_results(query, 1)
1614                         return
1615                 elif prefix == 'all':
1616                         self._download_n_results(query, self._max_google_results)
1617                         return
1618                 else:
1619                         try:
1620                                 n = long(prefix)
1621                                 if n <= 0:
1622                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1623                                         return
1624                                 elif n > self._max_google_results:
1625                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1626                                         n = self._max_google_results
1627                                 self._download_n_results(query, n)
1628                                 return
1629                         except ValueError: # parsing prefix as integer fails
1630                                 self._download_n_results(query, 1)
1631                                 return
1632
1633         def _download_n_results(self, query, n):
1634                 """Downloads a specified number of results for a query"""
1635
1636                 video_ids = []
1637                 already_seen = set()
1638                 pagenum = 1
1639
1640                 while True:
1641                         self.report_download_page(query, pagenum)
1642                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1643                         request = urllib2.Request(result_url, None, std_headers)
1644                         try:
1645                                 page = urllib2.urlopen(request).read()
1646                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1647                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1648                                 return
1649
1650                         # Extract video identifiers
1651                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1652                                 video_id = mobj.group(1)
1653                                 if video_id not in already_seen:
1654                                         video_ids.append(video_id)
1655                                         already_seen.add(video_id)
1656                                         if len(video_ids) == n:
1657                                                 # Specified n videos reached
1658                                                 for id in video_ids:
1659                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1660                                                 return
1661
1662                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1663                                 for id in video_ids:
1664                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1665                                 return
1666
1667                         pagenum = pagenum + 1
1668
1669 class YahooSearchIE(InfoExtractor):
1670         """Information Extractor for Yahoo! Video search queries."""
1671         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1672         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1673         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1674         _MORE_PAGES_INDICATOR = r'\s*Next'
1675         _yahoo_ie = None
1676         _max_yahoo_results = 1000
1677
1678         def __init__(self, yahoo_ie, downloader=None):
1679                 InfoExtractor.__init__(self, downloader)
1680                 self._yahoo_ie = yahoo_ie
1681         
1682         @staticmethod
1683         def suitable(url):
1684                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1685
1686         def report_download_page(self, query, pagenum):
1687                 """Report attempt to download playlist page with given number."""
1688                 query = query.decode(preferredencoding())
1689                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1690
1691         def _real_initialize(self):
1692                 self._yahoo_ie.initialize()
1693         
1694         def _real_extract(self, query):
1695                 mobj = re.match(self._VALID_QUERY, query)
1696                 if mobj is None:
1697                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1698                         return
1699
1700                 prefix, query = query.split(':')
1701                 prefix = prefix[8:]
1702                 query  = query.encode('utf-8')
1703                 if prefix == '':
1704                         self._download_n_results(query, 1)
1705                         return
1706                 elif prefix == 'all':
1707                         self._download_n_results(query, self._max_yahoo_results)
1708                         return
1709                 else:
1710                         try:
1711                                 n = long(prefix)
1712                                 if n <= 0:
1713                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1714                                         return
1715                                 elif n > self._max_yahoo_results:
1716                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1717                                         n = self._max_yahoo_results
1718                                 self._download_n_results(query, n)
1719                                 return
1720                         except ValueError: # parsing prefix as integer fails
1721                                 self._download_n_results(query, 1)
1722                                 return
1723
1724         def _download_n_results(self, query, n):
1725                 """Downloads a specified number of results for a query"""
1726
1727                 video_ids = []
1728                 already_seen = set()
1729                 pagenum = 1
1730
1731                 while True:
1732                         self.report_download_page(query, pagenum)
1733                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1734                         request = urllib2.Request(result_url, None, std_headers)
1735                         try:
1736                                 page = urllib2.urlopen(request).read()
1737                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1738                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1739                                 return
1740
1741                         # Extract video identifiers
1742                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1743                                 video_id = mobj.group(1)
1744                                 if video_id not in already_seen:
1745                                         video_ids.append(video_id)
1746                                         already_seen.add(video_id)
1747                                         if len(video_ids) == n:
1748                                                 # Specified n videos reached
1749                                                 for id in video_ids:
1750                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1751                                                 return
1752
1753                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1754                                 for id in video_ids:
1755                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1756                                 return
1757
1758                         pagenum = pagenum + 1
1759
1760 class YoutubePlaylistIE(InfoExtractor):
1761         """Information Extractor for YouTube playlists."""
1762
1763         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1764         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1765         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1766         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1767         _youtube_ie = None
1768
1769         def __init__(self, youtube_ie, downloader=None):
1770                 InfoExtractor.__init__(self, downloader)
1771                 self._youtube_ie = youtube_ie
1772         
1773         @staticmethod
1774         def suitable(url):
1775                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1776
1777         def report_download_page(self, playlist_id, pagenum):
1778                 """Report attempt to download playlist page with given number."""
1779                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1780
1781         def _real_initialize(self):
1782                 self._youtube_ie.initialize()
1783         
1784         def _real_extract(self, url):
1785                 # Extract playlist id
1786                 mobj = re.match(self._VALID_URL, url)
1787                 if mobj is None:
1788                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1789                         return
1790
1791                 # Download playlist pages
1792                 playlist_id = mobj.group(1)
1793                 video_ids = []
1794                 pagenum = 1
1795
1796                 while True:
1797                         self.report_download_page(playlist_id, pagenum)
1798                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1799                         try:
1800                                 page = urllib2.urlopen(request).read()
1801                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1802                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1803                                 return
1804
1805                         # Extract video identifiers
1806                         ids_in_page = []
1807                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1808                                 if mobj.group(1) not in ids_in_page:
1809                                         ids_in_page.append(mobj.group(1))
1810                         video_ids.extend(ids_in_page)
1811
1812                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1813                                 break
1814                         pagenum = pagenum + 1
1815
1816                 for id in video_ids:
1817                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1818                 return
1819
1820 class YoutubeUserIE(InfoExtractor):
1821         """Information Extractor for YouTube users."""
1822
1823         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1824         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1825         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1826         _youtube_ie = None
1827
1828         def __init__(self, youtube_ie, downloader=None):
1829                 InfoExtractor.__init__(self, downloader)
1830                 self._youtube_ie = youtube_ie
1831         
1832         @staticmethod
1833         def suitable(url):
1834                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1835
1836         def report_download_page(self, username):
1837                 """Report attempt to download user page."""
1838                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1839
1840         def _real_initialize(self):
1841                 self._youtube_ie.initialize()
1842         
1843         def _real_extract(self, url):
1844                 # Extract username
1845                 mobj = re.match(self._VALID_URL, url)
1846                 if mobj is None:
1847                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1848                         return
1849
1850                 # Download user page
1851                 username = mobj.group(1)
1852                 video_ids = []
1853                 pagenum = 1
1854
1855                 self.report_download_page(username)
1856                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1857                 try:
1858                         page = urllib2.urlopen(request).read()
1859                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1860                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1861                         return
1862
1863                 # Extract video identifiers
1864                 ids_in_page = []
1865
1866                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1867                         if mobj.group(1) not in ids_in_page:
1868                                 ids_in_page.append(mobj.group(1))
1869                 video_ids.extend(ids_in_page)
1870
1871                 for id in video_ids:
1872                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1873                 return
1874
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, calling run() on every
	one: the first call receives an initial argument, each later call
	receives whatever the previous run() returned. The chain stops as
	soon as a run() returns None, or when its end is reached.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with the downloader.
	"""

	# Downloader this post processor is attached to (set via the
	# constructor or set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded video.

		"information" is a dictionary shaped like the ones built by
		InfoExtractors, with one extra key, "filepath", naming the
		downloaded file.

		Returning None stops the post-processing chain; returning a
		dictionary (possibly the received one, with some fields changed)
		passes it on to the next processor in the chain. This method may
		also raise a PostProcessingError, which the calling downloader
		takes into account.
		"""
		# Default implementation: pass the information through untouched.
		return information
1920         
### MAIN PROGRAM ###
if __name__ == '__main__':
	# Fatal errors raised anywhere below are converted into process exit
	# codes by the except clauses at the end of this block.
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

1928                 # Function to update the program file with the latest version from bitbucket.org
1929                 def update_self(downloader, filename):
1930                         # Note: downloader only used for options
1931                         if not os.access (filename, os.W_OK):
1932                                 sys.exit('ERROR: no write permissions on %s' % filename)
1933
1934                         downloader.to_stdout('Updating to latest stable version...')
1935                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1936                         latest_version = urllib.urlopen(latest_url).read().strip()
1937                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1938                         newcontent = urllib.urlopen(prog_url).read()
1939                         stream = open(filename, 'w')
1940                         stream.write(newcontent)
1941                         stream.close()
1942                         downloader.to_stdout('Updated to version %s' % latest_version)
1943
		# General configuration
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		# NOTE(review): this second install_opener() replaces the opener
		# installed above; build_opener() adds a ProxyHandler by default so
		# proxy support survives, but the first call looks redundant -- confirm.
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.04.04',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = open(opts.batchfile, 'r').readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop blank lines so they are not treated as URLs.
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any of the "print X only" options implies both quiet mode
			# and simulation (no actual download).
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			# and/or chain: the first truthy template wins -- an explicit
			# -o template, then format/title-specific defaults, then the
			# plain '%(id)s.%(ext)s' fallback.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		# Registration order determines matching priority: more specific
		# extractors are consulted before more general ones.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			# sys.argv[0] is the path of the running script itself.
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')