youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # License: Public domain code
   7 import htmlentitydefs
   8 import httplib
   9 import locale
  10 import math
  11 import netrc
  12 import os
  13 import os.path
  14 import re
  15 import socket
  16 import string
  17 import subprocess
  18 import sys
  19 import time
  20 import urllib
  21 import urllib2
  22
  23 # parse_qs was moved from the cgi module to the urlparse module recently.
  24 try:
  25         from urlparse import parse_qs
  26 except ImportError:
  27         from cgi import parse_qs
  28
  29 std_headers = {
  30         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
  31         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  32         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  33         'Accept-Language': 'en-us,en;q=0.5',
  34 }
  35
  36 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  37
  38 def preferredencoding():
  39         """Get preferred encoding.
  40
  41         Returns the best encoding scheme for the system, based on
  42         locale.getpreferredencoding() and some further tweaks.
  43         """
  44         def yield_preferredencoding():
  45                 try:
  46                         pref = locale.getpreferredencoding()
  47                         u'TEST'.encode(pref)
  48                 except:
  49                         pref = 'UTF-8'
  50                 while True:
  51                         yield pref
  52         return yield_preferredencoding().next()
  53
  54 def htmlentity_transform(matchobj):
  55         """Transforms an HTML entity to a Unicode character.
  56
  57         This function receives a match object and is intended to be used with
  58         the re.sub() function.
  59         """
  60         entity = matchobj.group(1)
  61
  62         # Known non-numeric HTML entity
  63         if entity in htmlentitydefs.name2codepoint:
  64                 return unichr(htmlentitydefs.name2codepoint[entity])
  65
  66         # Unicode character
  67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  68         if mobj is not None:
  69                 numstr = mobj.group(1)
  70                 if numstr.startswith(u'x'):
  71                         base = 16
  72                         numstr = u'0%s' % numstr
  73                 else:
  74                         base = 10
  75                 return unichr(long(numstr, base))
  76
  77         # Unknown entity in name, return its literal representation
  78         return (u'&%s;' % entity)
  79
  80 def sanitize_title(utitle):
  81         """Sanitizes a video title so it could be used as part of a filename."""
  82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  83         return utitle.replace(unicode(os.sep), u'%')
  84
  85 def sanitize_open(filename, open_mode):
  86         """Try to open the given filename, and slightly tweak it if this fails.
  87
  88         Attempts to open the given filename. If this fails, it tries to change
  89         the filename slightly, step by step, until it's either able to open it
  90         or it fails and raises a final exception, like the standard open()
  91         function.
  92
  93         It returns the tuple (stream, definitive_file_name).
  94         """
  95         try:
  96                 if filename == u'-':
  97                         return (sys.stdout, filename)
  98                 stream = open(filename, open_mode)
  99                 return (stream, filename)
 100         except (IOError, OSError), err:
 101                 # In case of error, try to remove win32 forbidden chars
 102                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
 103
 104                 # An exception here should be caught in the caller
 105                 stream = open(filename, open_mode)
 106                 return (stream, filename)
 107
 108
 109 class DownloadError(Exception):
 110         """Download Error exception.
 111
 112         This exception may be thrown by FileDownloader objects if they are not
 113         configured to continue on errors. They will contain the appropriate
 114         error message.
 115         """
 116         pass
 117
 118 class SameFileError(Exception):
 119         """Same File exception.
 120
 121         This exception will be thrown by FileDownloader objects if they detect
 122         multiple files would have to be downloaded to the same file on disk.
 123         """
 124         pass
 125
 126 class PostProcessingError(Exception):
 127         """Post Processing exception.
 128
 129         This exception may be raised by PostProcessor's .run() method to
 130         indicate an error in the postprocessing task.
 131         """
 132         pass
 133
 134 class UnavailableFormatError(Exception):
 135         """Unavailable Format exception.
 136
 137         This exception will be thrown when a video is requested
 138         in a format that is not available for that video.
 139         """
 140         pass
 141
 142 class ContentTooShortError(Exception):
 143         """Content Too Short exception.
 144
 145         This exception may be raised by FileDownloader objects when a file they
 146         download is too small for what the server announced first, indicating
 147         the connection was probably interrupted.
 148         """
 149         # Both in bytes
 150         downloaded = None
 151         expected = None
 152
 153         def __init__(self, downloaded, expected):
 154                 self.downloaded = downloaded
 155                 self.expected = expected
 156
 157 class FileDownloader(object):
 158         """File Downloader class.
 159
 160         File downloader objects are the ones responsible of downloading the
 161         actual video file and writing it to disk if the user has requested
 162         it, among some other tasks. In most cases there should be one per
 163         program. As, given a video URL, the downloader doesn't know how to
 164         extract all the needed information, task that InfoExtractors do, it
 165         has to pass the URL to one of them.
 166
 167         For this, file downloader objects have a method that allows
 168         InfoExtractors to be registered in a given order. When it is passed
 169         a URL, the file downloader handles it to the first InfoExtractor it
 170         finds that reports being able to handle it. The InfoExtractor extracts
 171         all the information about the video or videos the URL refers to, and
 172         asks the FileDownloader to process the video information, possibly
 173         downloading the video.
 174
 175         File downloaders accept a lot of parameters. In order not to saturate
 176         the object constructor with arguments, it receives a dictionary of
 177         options instead. These options are available through the params
 178         attribute for the InfoExtractors to use. The FileDownloader also
 179         registers itself as the downloader in charge for the InfoExtractors
 180         that are added to it, so this is a "mutual registration".
 181
 182         Available options:
 183
 184         username:       Username for authentication purposes.
 185         password:       Password for authentication purposes.
 186         usenetrc:       Use netrc for authentication instead.
 187         quiet:          Do not print messages to stdout.
 188         forceurl:       Force printing final URL.
 189         forcetitle:     Force printing title.
 190         simulate:       Do not download the video files.
 191         format:         Video format code.
 192         outtmpl:        Template for output names.
 193         ignoreerrors:   Do not stop on download errors.
 194         ratelimit:      Download speed limit, in bytes/sec.
 195         nooverwrites:   Prevent overwriting files.
 196         continuedl:     Try to continue downloads if possible.
 197         noprogress:     Do not print the progress bar.
 198         """
 199
 200         params = None
 201         _ies = []
 202         _pps = []
 203         _download_retcode = None
 204         _num_downloads = None
 205
 206         def __init__(self, params):
 207                 """Create a FileDownloader object with the given options."""
 208                 self._ies = []
 209                 self._pps = []
 210                 self._download_retcode = 0
 211                 self._num_downloads = 0
 212                 self.params = params
 213
 214         @staticmethod
 215         def pmkdir(filename):
 216                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 217                 components = filename.split(os.sep)
 218                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 219                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 220                 for dir in aggregate:
 221                         if not os.path.exists(dir):
 222                                 os.mkdir(dir)
 223
 224         @staticmethod
 225         def format_bytes(bytes):
 226                 if bytes is None:
 227                         return 'N/A'
 228                 if type(bytes) is str:
 229                         bytes = float(bytes)
 230                 if bytes == 0.0:
 231                         exponent = 0
 232                 else:
 233                         exponent = long(math.log(bytes, 1024.0))
 234                 suffix = 'bkMGTPEZY'[exponent]
 235                 converted = float(bytes) / float(1024**exponent)
 236                 return '%.2f%s' % (converted, suffix)
 237
 238         @staticmethod
 239         def calc_percent(byte_counter, data_len):
 240                 if data_len is None:
 241                         return '---.-%'
 242                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 243
 244         @staticmethod
 245         def calc_eta(start, now, total, current):
 246                 if total is None:
 247                         return '--:--'
 248                 dif = now - start
 249                 if current == 0 or dif < 0.001: # One millisecond
 250                         return '--:--'
 251                 rate = float(current) / dif
 252                 eta = long((float(total) - float(current)) / rate)
 253                 (eta_mins, eta_secs) = divmod(eta, 60)
 254                 if eta_mins > 99:
 255                         return '--:--'
 256                 return '%02d:%02d' % (eta_mins, eta_secs)
 257
 258         @staticmethod
 259         def calc_speed(start, now, bytes):
 260                 dif = now - start
 261                 if bytes == 0 or dif < 0.001: # One millisecond
 262                         return '%10s' % '---b/s'
 263                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 264
 265         @staticmethod
 266         def best_block_size(elapsed_time, bytes):
 267                 new_min = max(bytes / 2.0, 1.0)
 268                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 269                 if elapsed_time < 0.001:
 270                         return long(new_max)
 271                 rate = bytes / elapsed_time
 272                 if rate > new_max:
 273                         return long(new_max)
 274                 if rate < new_min:
 275                         return long(new_min)
 276                 return long(rate)
 277
 278         @staticmethod
 279         def parse_bytes(bytestr):
 280                 """Parse a string indicating a byte quantity into a long integer."""
 281                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 282                 if matchobj is None:
 283                         return None
 284                 number = float(matchobj.group(1))
 285                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 286                 return long(round(number * multiplier))
 287
 288         @staticmethod
 289         def verify_url(url):
 290                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
 291                 request = urllib2.Request(url, None, std_headers)
 292                 data = urllib2.urlopen(request)
 293                 data.read(1)
 294                 url = data.geturl()
 295                 data.close()
 296                 return url
 297
 298         def add_info_extractor(self, ie):
 299                 """Add an InfoExtractor object to the end of the list."""
 300                 self._ies.append(ie)
 301                 ie.set_downloader(self)
 302
 303         def add_post_processor(self, pp):
 304                 """Add a PostProcessor object to the end of the chain."""
 305                 self._pps.append(pp)
 306                 pp.set_downloader(self)
 307
 308         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
 309                 """Print message to stdout if not in quiet mode."""
 310                 try:
 311                         if not self.params.get('quiet', False):
 312                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
 313                         sys.stdout.flush()
 314                 except (UnicodeEncodeError), err:
 315                         if not ignore_encoding_errors:
 316                                 raise
 317
 318         def to_stderr(self, message):
 319                 """Print message to stderr."""
 320                 print >>sys.stderr, message.encode(preferredencoding())
 321
 322         def fixed_template(self):
 323                 """Checks if the output template is fixed."""
 324                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 325
 326         def trouble(self, message=None):
 327                 """Determine action to take when a download problem appears.
 328
 329                 Depending on if the downloader has been configured to ignore
 330                 download errors or not, this method may throw an exception or
 331                 not when errors are found, after printing the message.
 332                 """
 333                 if message is not None:
 334                         self.to_stderr(message)
 335                 if not self.params.get('ignoreerrors', False):
 336                         raise DownloadError(message)
 337                 self._download_retcode = 1
 338
 339         def slow_down(self, start_time, byte_counter):
 340                 """Sleep if the download speed is over the rate limit."""
 341                 rate_limit = self.params.get('ratelimit', None)
 342                 if rate_limit is None or byte_counter == 0:
 343                         return
 344                 now = time.time()
 345                 elapsed = now - start_time
 346                 if elapsed <= 0.0:
 347                         return
 348                 speed = float(byte_counter) / elapsed
 349                 if speed > rate_limit:
 350                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 351
 352         def report_destination(self, filename):
 353                 """Report destination filename."""
 354                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 355
 356         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 357                 """Report download progress."""
 358                 if self.params.get('noprogress', False):
 359                         return
 360                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 361                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 362
 363         def report_resuming_byte(self, resume_len):
 364                 """Report attemtp to resume at given byte."""
 365                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
 366
 367         def report_file_already_downloaded(self, file_name):
 368                 """Report file has already been fully downloaded."""
 369                 try:
 370                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
 371                 except (UnicodeEncodeError), err:
 372                         self.to_stdout(u'[download] The file has already been downloaded')
 373
 374         def report_unable_to_resume(self):
 375                 """Report it was impossible to resume download."""
 376                 self.to_stdout(u'[download] Unable to resume')
 377
 378         def report_finish(self):
 379                 """Report download finished."""
 380                 if self.params.get('noprogress', False):
 381                         self.to_stdout(u'[download] Download completed')
 382                 else:
 383                         self.to_stdout(u'')
 384
 385         def process_info(self, info_dict):
 386                 """Process a single dictionary returned by an InfoExtractor."""
 387                 # Do nothing else if in simulate mode
 388                 if self.params.get('simulate', False):
 389                         # Verify URL if it's an HTTP one
 390                         if info_dict['url'].startswith('http'):
 391                                 try:
 392                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
 393                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
 394                                         raise UnavailableFormatError
 395
 396                         # Forced printings
 397                         if self.params.get('forcetitle', False):
 398                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 399                         if self.params.get('forceurl', False):
 400                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 401                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 402                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 403                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 404                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 405
 406                         return
 407
 408                 try:
 409                         template_dict = dict(info_dict)
 410                         template_dict['epoch'] = unicode(long(time.time()))
 411                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
 412                         filename = self.params['outtmpl'] % template_dict
 413                 except (ValueError, KeyError), err:
 414                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 415                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 416                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
 417                         return
 418
 419                 try:
 420                         self.pmkdir(filename)
 421                 except (OSError, IOError), err:
 422                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 423                         return
 424
 425                 try:
 426                         success = self._do_download(filename, info_dict['url'].encode('utf-8'))
 427                 except (OSError, IOError), err:
 428                         raise UnavailableFormatError
 429                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 430                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 431                         return
 432                 except (ContentTooShortError, ), err:
 433                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 434                         return
 435
 436                 if success:
 437                         try:
 438                                 self.post_process(filename, info_dict)
 439                         except (PostProcessingError), err:
 440                                 self.trouble('ERROR: postprocessing: %s' % str(err))
 441                                 return
 442
 443         def download(self, url_list):
 444                 """Download a given list of URLs."""
 445                 if len(url_list) > 1 and self.fixed_template():
 446                         raise SameFileError(self.params['outtmpl'])
 447
 448                 for url in url_list:
 449                         suitable_found = False
 450                         for ie in self._ies:
 451                                 # Go to next InfoExtractor if not suitable
 452                                 if not ie.suitable(url):
 453                                         continue
 454
 455                                 # Suitable InfoExtractor found
 456                                 suitable_found = True
 457
 458                                 # Extract information from URL and process it
 459                                 ie.extract(url)
 460
 461                                 # Suitable InfoExtractor had been found; go to next URL
 462                                 break
 463
 464                         if not suitable_found:
 465                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 466
 467                 return self._download_retcode
 468
 469         def post_process(self, filename, ie_info):
 470                 """Run the postprocessing chain on the given file."""
 471                 info = dict(ie_info)
 472                 info['filepath'] = filename
 473                 for pp in self._pps:
 474                         info = pp.run(info)
 475                         if info is None:
 476                                 break
 477
 478         def _download_with_rtmpdump(self, filename, url):
 479                 self.report_destination(filename)
 480
 481                 # Check for rtmpdump first
 482                 try:
 483                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 484                 except (OSError, IOError):
 485                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 486                         return False
 487
 488                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 489                 # the connection was interrumpted and resuming appears to be
 490                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 491                 basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
 492                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 493                 while retval == 2 or retval == 1:
 494                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
 495                         time.sleep(2.0) # This seems to be needed
 496                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 497                 if retval == 0:
 498                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
 499                         return True
 500                 else:
 501                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
 502                         return False
 503
 504         def _do_download(self, filename, url):
 505                 # Attempt to download using rtmpdump
 506                 if url.startswith('rtmp'):
 507                         return self._download_with_rtmpdump(filename, url)
 508
 509                 stream = None
 510                 open_mode = 'wb'
 511                 basic_request = urllib2.Request(url, None, std_headers)
 512                 request = urllib2.Request(url, None, std_headers)
 513
 514                 # Establish possible resume length
 515                 if os.path.isfile(filename):
 516                         resume_len = os.path.getsize(filename)
 517                 else:
 518                         resume_len = 0
 519
 520                 # Request parameters in case of being able to resume
 521                 if self.params.get('continuedl', False) and resume_len != 0:
 522                         self.report_resuming_byte(resume_len)
 523                         request.add_header('Range','bytes=%d-' % resume_len)
 524                         open_mode = 'ab'
 525
 526                 # Establish connection
 527                 try:
 528                         data = urllib2.urlopen(request)
 529                 except (urllib2.HTTPError, ), err:
 530                         if err.code != 416: #  416 is 'Requested range not satisfiable'
 531                                 raise
 532                         # Unable to resume
 533                         data = urllib2.urlopen(basic_request)
 534                         content_length = data.info()['Content-Length']
 535
 536                         if content_length is not None and long(content_length) == resume_len:
 537                                 # Because the file had already been fully downloaded
 538                                 self.report_file_already_downloaded(filename)
 539                                 self._num_downloads += 1
 540                                 return True
 541                         else:
 542                                 # Because the server didn't let us
 543                                 self.report_unable_to_resume()
 544                                 open_mode = 'wb'
 545
 546                 data_len = data.info().get('Content-length', None)
 547                 data_len_str = self.format_bytes(data_len)
 548                 byte_counter = 0
 549                 block_size = 1024
 550                 start = time.time()
 551                 while True:
 552                         # Download and write
 553                         before = time.time()
 554                         data_block = data.read(block_size)
 555                         after = time.time()
 556                         data_block_len = len(data_block)
 557                         if data_block_len == 0:
 558                                 break
 559                         byte_counter += data_block_len
 560
 561                         # Open file just in time
 562                         if stream is None:
 563                                 try:
 564                                         (stream, filename) = sanitize_open(filename, open_mode)
 565                                         self.report_destination(filename)
 566                                         self._num_downloads += 1
 567                                 except (OSError, IOError), err:
 568                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 569                                         return False
 570                         try:
 571                                 stream.write(data_block)
 572                         except (IOError, OSError), err:
 573                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
 574                         block_size = self.best_block_size(after - before, data_block_len)
 575
 576                         # Progress message
 577                         percent_str = self.calc_percent(byte_counter, data_len)
 578                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 579                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 580                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 581
 582                         # Apply rate limit
 583                         self.slow_down(start, byte_counter)
 584
 585                 self.report_finish()
 586                 if data_len is not None and str(byte_counter) != data_len:
 587                         raise ContentTooShortError(byte_counter, long(data_len))
 588                 return True
 589
 590 class InfoExtractor(object):
 591         """Information Extractor class.
 592
 593         Information extractors are the classes that, given a URL, extract
 594         information from the video (or videos) the URL refers to. This
 595         information includes the real video URL, the video title and simplified
 596         title, author and others. The information is stored in a dictionary
 597         which is then passed to the FileDownloader. The FileDownloader
 598         processes this information possibly downloading the video to the file
 599         system, among other possible outcomes. The dictionaries must include
 600         the following fields:
 601
 602         id:             Video identifier.
 603         url:            Final video URL.
 604         uploader:       Nickname of the video uploader.
 605         title:          Literal title.
 606         stitle:         Simplified title.
 607         ext:            Video filename extension.
 608         format:         Video format.
 609
 610         The following fields are optional. Their primary purpose is to allow
 611         youtube-dl to serve as the backend for a video search function, such
 612         as the one in youtube2mp3.  They are only used when their respective
 613         forced printing functions are called:
 614
 615         thumbnail:      Full URL to a video thumbnail image.
 616         description:    One-line video description.
 617
 618         Subclasses of this one should re-define the _real_initialize() and
 619         _real_extract() methods, as well as the suitable() static method.
 620         Probably, they should also be instantiated and added to the main
 621         downloader.
 622         """
 623
 624         _ready = False
 625         _downloader = None
 626
 627         def __init__(self, downloader=None):
 628                 """Constructor. Receives an optional downloader."""
 629                 self._ready = False
 630                 self.set_downloader(downloader)
 631
 632         @staticmethod
 633         def suitable(url):
 634                 """Receives a URL and returns True if suitable for this IE."""
 635                 return False
 636
 637         def initialize(self):
 638                 """Initializes an instance (authentication, etc)."""
 639                 if not self._ready:
 640                         self._real_initialize()
 641                         self._ready = True
 642
 643         def extract(self, url):
 644                 """Extracts URL information and returns it in list of dicts."""
 645                 self.initialize()
 646                 return self._real_extract(url)
 647
 648         def set_downloader(self, downloader):
 649                 """Sets the downloader for this IE."""
 650                 self._downloader = downloader
 651
 652         def _real_initialize(self):
 653                 """Real initialization process. Redefine in subclasses."""
 654                 pass
 655
 656         def _real_extract(self, url):
 657                 """Real extraction process. Redefine in subclasses."""
 658                 pass
 659
 660 class YoutubeIE(InfoExtractor):
 661         """Information extractor for youtube.com."""
 662
 663         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 664         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 665         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 666         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 667         _NETRC_MACHINE = 'youtube'
 668         _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
 669         _video_extensions = {
 670                 '13': '3gp',
 671                 '17': 'mp4',
 672                 '18': 'mp4',
 673                 '22': 'mp4',
 674                 '37': 'mp4',
 675         }
 676
 677         @staticmethod
 678         def suitable(url):
 679                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 680
 681         def report_lang(self):
 682                 """Report attempt to set language."""
 683                 self._downloader.to_stdout(u'[youtube] Setting language')
 684
 685         def report_login(self):
 686                 """Report attempt to log in."""
 687                 self._downloader.to_stdout(u'[youtube] Logging in')
 688
 689         def report_age_confirmation(self):
 690                 """Report attempt to confirm age."""
 691                 self._downloader.to_stdout(u'[youtube] Confirming age')
 692
 693         def report_video_info_webpage_download(self, video_id):
 694                 """Report attempt to download video info webpage."""
 695                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
 696
 697         def report_information_extraction(self, video_id):
 698                 """Report attempt to extract video information."""
 699                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 700
 701         def report_unavailable_format(self, video_id, format):
 702                 """Report extracted video URL."""
 703                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 704
 705         def report_rtmp_download(self):
 706                 """Indicate the download will use the RTMP protocol."""
 707                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
 708
 709         def _real_initialize(self):
 710                 if self._downloader is None:
 711                         return
 712
 713                 username = None
 714                 password = None
 715                 downloader_params = self._downloader.params
 716
 717                 # Attempt to use provided username and password or .netrc data
 718                 if downloader_params.get('username', None) is not None:
 719                         username = downloader_params['username']
 720                         password = downloader_params['password']
 721                 elif downloader_params.get('usenetrc', False):
 722                         try:
 723                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 724                                 if info is not None:
 725                                         username = info[0]
 726                                         password = info[2]
 727                                 else:
 728                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 729                         except (IOError, netrc.NetrcParseError), err:
 730                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 731                                 return
 732
 733                 # Set language
 734                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 735                 try:
 736                         self.report_lang()
 737                         urllib2.urlopen(request).read()
 738                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 739                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 740                         return
 741
 742                 # No authentication to be performed
 743                 if username is None:
 744                         return
 745
 746                 # Log in
 747                 login_form = {
 748                                 'current_form': 'loginForm',
 749                                 'next':         '/',
 750                                 'action_login': 'Log In',
 751                                 'username':     username,
 752                                 'password':     password,
 753                                 }
 754                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 755                 try:
 756                         self.report_login()
 757                         login_results = urllib2.urlopen(request).read()
 758                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 759                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 760                                 return
 761                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 762                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 763                         return
 764
 765                 # Confirm age
 766                 age_form = {
 767                                 'next_url':             '/',
 768                                 'action_confirm':       'Confirm',
 769                                 }
 770                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 771                 try:
 772                         self.report_age_confirmation()
 773                         age_results = urllib2.urlopen(request).read()
 774                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 775                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 776                         return
 777
 778         def _real_extract(self, url):
 779                 # Extract video id from URL
 780                 mobj = re.match(self._VALID_URL, url)
 781                 if mobj is None:
 782                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 783                         return
 784                 video_id = mobj.group(2)
 785
 786                 # Downloader parameters
 787                 best_quality = False
 788                 all_formats = False
 789                 format_param = None
 790                 quality_index = 0
 791                 if self._downloader is not None:
 792                         params = self._downloader.params
 793                         format_param = params.get('format', None)
 794                         if format_param == '0':
 795                                 format_param = self._available_formats[quality_index]
 796                                 best_quality = True
 797                         elif format_param == '-1':
 798                                 format_param = self._available_formats[quality_index]
 799                                 all_formats = True
 800
 801                 while True:
 802                         # Extension
 803                         video_extension = self._video_extensions.get(format_param, 'flv')
 804
 805                         # Get video info
 806                         self.report_video_info_webpage_download(video_id)
 807                         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 808                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 809                                                    % (video_id, el_type))
 810                                 request = urllib2.Request(video_info_url, None, std_headers)
 811                                 try:
 812                                         video_info_webpage = urllib2.urlopen(request).read()
 813                                         video_info = parse_qs(video_info_webpage)
 814                                         if 'token' in video_info:
 815                                                 break
 816                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 817                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 818                                         return
 819                         self.report_information_extraction(video_id)
 820
 821                         # "t" param
 822                         if 'token' not in video_info:
 823                                 # Attempt to see if YouTube has issued an error message
 824                                 if 'reason' not in video_info:
 825                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
 826                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
 827                                         stream.write(video_info_webpage)
 828                                         stream.close()
 829                                 else:
 830                                         reason = urllib.unquote_plus(video_info['reason'][0])
 831                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
 832                                 return
 833                         token = urllib.unquote_plus(video_info['token'][0])
 834                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
 835                         if format_param is not None:
 836                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 837
 838                         # Check possible RTMP download
 839                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 840                                 self.report_rtmp_download()
 841                                 video_real_url = video_info['conn'][0]
 842
 843                         # uploader
 844                         if 'author' not in video_info:
 845                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 846                                 return
 847                         video_uploader = urllib.unquote_plus(video_info['author'][0])
 848
 849                         # title
 850                         if 'title' not in video_info:
 851                                 self._downloader.trouble(u'ERROR: unable to extract video title')
 852                                 return
 853                         video_title = urllib.unquote_plus(video_info['title'][0])
 854                         video_title = video_title.decode('utf-8')
 855                         video_title = sanitize_title(video_title)
 856
 857                         # simplified title
 858                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 859                         simple_title = simple_title.strip(ur'_')
 860
 861                         # thumbnail image
 862                         if 'thumbnail_url' not in video_info:
 863                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 864                                 video_thumbnail = ''
 865                         else:   # don't panic if we can't find it
 866                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 867
 868                         # get video description
 869                         video_description = 'No description available.'    # we need something to pass to self._downloader
 870                         # this requires an additional HTTP request and a little
 871                         # more time, so don't do it unless absolutely necessary
 872                         if self._downloader.params.get('forcedescription', False):
 873                                 video_page_url = 'http://www.youtube.com/watch?v=' + video_id
 874                                 request = urllib2.Request(video_page_url, None, std_headers)
 875                                 try:
 876                                         video_page_webpage = urllib2.urlopen(request).read()
 877                                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_page_webpage)
 878                                         if mobj is not None:
 879                                                 video_description = mobj.group(1)
 880                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 881                                         pass    # don't panic if we can't find it
 882
 883                         try:
 884                                 # Process video information
 885                                 self._downloader.process_info({
 886                                         'id':           video_id.decode('utf-8'),
 887                                         'url':          video_real_url.decode('utf-8'),
 888                                         'uploader':     video_uploader.decode('utf-8'),
 889                                         'title':        video_title,
 890                                         'stitle':       simple_title,
 891                                         'ext':          video_extension.decode('utf-8'),
 892                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 893                                         'thumbnail':    video_thumbnail.decode('utf-8'),
 894                                         'description':  video_description.decode('utf-8'),
 895                                 })
 896
 897                                 if all_formats:
 898                                         if quality_index == len(self._available_formats):
 899                                                 # None left to get
 900                                                 return
 901                                         else:
 902                                                 quality_index += 1
 903                                                 format_param = self._available_formats[quality_index]
 904                                                 continue
 905                                 return
 906
 907                         except UnavailableFormatError, err:
 908                                 if best_quality or all_formats:
 909                                         if quality_index == len(self._available_formats):
 910                                                 # I don't ever expect this to happen
 911                                                 if not all_formats:
 912                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
 913                                                 return
 914                                         else:
 915                                                 self.report_unavailable_format(video_id, format_param)
 916                                                 quality_index += 1
 917                                                 format_param = self._available_formats[quality_index]
 918                                                 continue
 919                                 else:
 920                                         self._downloader.trouble('ERROR: format not available for video')
 921                                         return
 922
 923
 924 class MetacafeIE(InfoExtractor):
 925         """Information Extractor for metacafe.com."""
 926
 927         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 928         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 929         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 930         _youtube_ie = None
 931
 932         def __init__(self, youtube_ie, downloader=None):
 933                 InfoExtractor.__init__(self, downloader)
 934                 self._youtube_ie = youtube_ie
 935
 936         @staticmethod
 937         def suitable(url):
 938                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 939
 940         def report_disclaimer(self):
 941                 """Report disclaimer retrieval."""
 942                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 943
 944         def report_age_confirmation(self):
 945                 """Report attempt to confirm age."""
 946                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 947
 948         def report_download_webpage(self, video_id):
 949                 """Report webpage download."""
 950                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 951
 952         def report_extraction(self, video_id):
 953                 """Report information extraction."""
 954                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 955
 956         def _real_initialize(self):
 957                 # Retrieve disclaimer
 958                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 959                 try:
 960                         self.report_disclaimer()
 961                         disclaimer = urllib2.urlopen(request).read()
 962                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 963                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 964                         return
 965
 966                 # Confirm age
 967                 disclaimer_form = {
 968                         'filters': '0',
 969                         'submit': "Continue - I'm over 18",
 970                         }
 971                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 972                 try:
 973                         self.report_age_confirmation()
 974                         disclaimer = urllib2.urlopen(request).read()
 975                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 976                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 977                         return
 978
 979         def _real_extract(self, url):
 980                 # Extract id and simplified title from URL
 981                 mobj = re.match(self._VALID_URL, url)
 982                 if mobj is None:
 983                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 984                         return
 985
 986                 video_id = mobj.group(1)
 987
 988                 # Check if video comes from YouTube
 989                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 990                 if mobj2 is not None:
 991                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 992                         return
 993
 994                 simple_title = mobj.group(2).decode('utf-8')
 995                 video_extension = 'flv'
 996
 997                 # Retrieve video webpage to extract further information
 998                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 999                 try:
1000                         self.report_download_webpage(video_id)
1001                         webpage = urllib2.urlopen(request).read()
1002                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1003                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1004                         return
1005
1006                 # Extract URL, uploader and title from webpage
1007                 self.report_extraction(video_id)
1008                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1009                 if mobj is None:
1010                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1011                         return
1012                 mediaURL = urllib.unquote(mobj.group(1))
1013
1014                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1015                 #if mobj is None:
1016                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1017                 #       return
1018                 #gdaKey = mobj.group(1)
1019                 #
1020                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1021
1022                 video_url = mediaURL
1023
1024                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1025                 if mobj is None:
1026                         self._downloader.trouble(u'ERROR: unable to extract title')
1027                         return
1028                 video_title = mobj.group(1).decode('utf-8')
1029                 video_title = sanitize_title(video_title)
1030
1031                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1032                 if mobj is None:
1033                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1034                         return
1035                 video_uploader = mobj.group(1)
1036
1037                 try:
1038                         # Process video information
1039                         self._downloader.process_info({
1040                                 'id':           video_id.decode('utf-8'),
1041                                 'url':          video_url.decode('utf-8'),
1042                                 'uploader':     video_uploader.decode('utf-8'),
1043                                 'title':        video_title,
1044                                 'stitle':       simple_title,
1045                                 'ext':          video_extension.decode('utf-8'),
1046                                 'format':       u'NA',
1047                         })
1048                 except UnavailableFormatError:
1049                         self._downloader.trouble(u'ERROR: format not available for video')
1050
1051
1052 class GoogleIE(InfoExtractor):
1053         """Information extractor for video.google.com."""
1054
1055         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1056
1057         def __init__(self, downloader=None):
1058                 InfoExtractor.__init__(self, downloader)
1059
1060         @staticmethod
1061         def suitable(url):
1062                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1063
1064         def report_download_webpage(self, video_id):
1065                 """Report webpage download."""
1066                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1067
1068         def report_extraction(self, video_id):
1069                 """Report information extraction."""
1070                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1071
1072         def _real_initialize(self):
1073                 return
1074
1075         def _real_extract(self, url):
1076                 # Extract id from URL
1077                 mobj = re.match(self._VALID_URL, url)
1078                 if mobj is None:
1079                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1080                         return
1081
1082                 video_id = mobj.group(1)
1083
1084                 video_extension = 'mp4'
1085
1086                 # Retrieve video webpage to extract further information
1087                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1088                 try:
1089                         self.report_download_webpage(video_id)
1090                         webpage = urllib2.urlopen(request).read()
1091                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1092                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1093                         return
1094
1095                 # Extract URL, uploader, and title from webpage
1096                 self.report_extraction(video_id)
1097                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1098                 if mobj is None:
1099                         video_extension = 'flv'
1100                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1101                 if mobj is None:
1102                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1103                         return
1104                 mediaURL = urllib.unquote(mobj.group(1))
1105                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1106                 mediaURL = mediaURL.replace('\\x26', '\x26')
1107
1108                 video_url = mediaURL
1109
1110                 mobj = re.search(r'<title>(.*)</title>', webpage)
1111                 if mobj is None:
1112                         self._downloader.trouble(u'ERROR: unable to extract title')
1113                         return
1114                 video_title = mobj.group(1).decode('utf-8')
1115                 video_title = sanitize_title(video_title)
1116                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1117
1118                 # Extract video description
1119                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1120                 if mobj is None:
1121                         self._downloader.trouble(u'ERROR: unable to extract video description')
1122                         return
1123                 video_description = mobj.group(1).decode('utf-8')
1124                 if not video_description:
1125                         video_description = 'No description available.'
1126
1127                 # Extract video thumbnail
1128                 if self._downloader.params.get('forcethumbnail', False):
1129                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1130                         try:
1131                                 webpage = urllib2.urlopen(request).read()
1132                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1133                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1134                                 return
1135                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1136                         if mobj is None:
1137                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1138                                 return
1139                         video_thumbnail = mobj.group(1)
1140                 else:   # we need something to pass to process_info
1141                         video_thumbnail = ''
1142
1143
1144                 try:
1145                         # Process video information
1146                         self._downloader.process_info({
1147                                 'id':           video_id.decode('utf-8'),
1148                                 'url':          video_url.decode('utf-8'),
1149                                 'uploader':     u'NA',
1150                                 'title':        video_title,
1151                                 'stitle':       simple_title,
1152                                 'ext':          video_extension.decode('utf-8'),
1153                                 'format':       u'NA',
1154                         })
1155                 except UnavailableFormatError:
1156                         self._downloader.trouble(u'ERROR: format not available for video')
1157
1158
1159 class PhotobucketIE(InfoExtractor):
1160         """Information extractor for photobucket.com."""
1161
1162         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1163
1164         def __init__(self, downloader=None):
1165                 InfoExtractor.__init__(self, downloader)
1166
1167         @staticmethod
1168         def suitable(url):
1169                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1170
1171         def report_download_webpage(self, video_id):
1172                 """Report webpage download."""
1173                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1174
1175         def report_extraction(self, video_id):
1176                 """Report information extraction."""
1177                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1178
1179         def _real_initialize(self):
1180                 return
1181
1182         def _real_extract(self, url):
1183                 # Extract id from URL
1184                 mobj = re.match(self._VALID_URL, url)
1185                 if mobj is None:
1186                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1187                         return
1188
1189                 video_id = mobj.group(1)
1190
1191                 video_extension = 'flv'
1192
1193                 # Retrieve video webpage to extract further information
1194                 request = urllib2.Request(url)
1195                 try:
1196                         self.report_download_webpage(video_id)
1197                         webpage = urllib2.urlopen(request).read()
1198                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1199                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1200                         return
1201
1202                 # Extract URL, uploader, and title from webpage
1203                 self.report_extraction(video_id)
1204                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1205                 if mobj is None:
1206                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1207                         return
1208                 mediaURL = urllib.unquote(mobj.group(1))
1209
1210                 video_url = mediaURL
1211
1212                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1213                 if mobj is None:
1214                         self._downloader.trouble(u'ERROR: unable to extract title')
1215                         return
1216                 video_title = mobj.group(1).decode('utf-8')
1217                 video_title = sanitize_title(video_title)
1218                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1219
1220                 video_uploader = mobj.group(2).decode('utf-8')
1221
1222                 try:
1223                         # Process video information
1224                         self._downloader.process_info({
1225                                 'id':           video_id.decode('utf-8'),
1226                                 'url':          video_url.decode('utf-8'),
1227                                 'uploader':     video_uploader,
1228                                 'title':        video_title,
1229                                 'stitle':       simple_title,
1230                                 'ext':          video_extension.decode('utf-8'),
1231                                 'format':       u'NA',
1232                         })
1233                 except UnavailableFormatError:
1234                         self._downloader.trouble(u'ERROR: format not available for video')
1235
1236
1237 class YahooIE(InfoExtractor):
1238         """Information extractor for video.yahoo.com."""
1239
1240         # _VALID_URL matches all Yahoo! Video URLs
1241         # _VPAGE_URL matches only the extractable '/watch/' URLs
1242         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1243         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1244
1245         def __init__(self, downloader=None):
1246                 InfoExtractor.__init__(self, downloader)
1247
1248         @staticmethod
1249         def suitable(url):
1250                 return (re.match(YahooIE._VALID_URL, url) is not None)
1251
1252         def report_download_webpage(self, video_id):
1253                 """Report webpage download."""
1254                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1255
1256         def report_extraction(self, video_id):
1257                 """Report information extraction."""
1258                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1259
1260         def _real_initialize(self):
1261                 return
1262
1263         def _real_extract(self, url):
1264                 # Extract ID from URL
1265                 mobj = re.match(self._VALID_URL, url)
1266                 if mobj is None:
1267                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1268                         return
1269
1270                 video_id = mobj.group(2)
1271                 video_extension = 'flv'
1272
1273                 # Rewrite valid but non-extractable URLs as
1274                 # extractable English language /watch/ URLs
1275                 if re.match(self._VPAGE_URL, url) is None:
1276                         request = urllib2.Request(url)
1277                         try:
1278                                 webpage = urllib2.urlopen(request).read()
1279                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1280                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1281                                 return
1282
1283                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1284                         if mobj is None:
1285                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1286                                 return
1287                         yahoo_id = mobj.group(1)
1288
1289                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1290                         if mobj is None:
1291                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1292                                 return
1293                         yahoo_vid = mobj.group(1)
1294
1295                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1296                         return self._real_extract(url)
1297
1298                 # Retrieve video webpage to extract further information
1299                 request = urllib2.Request(url)
1300                 try:
1301                         self.report_download_webpage(video_id)
1302                         webpage = urllib2.urlopen(request).read()
1303                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1304                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1305                         return
1306
1307                 # Extract uploader and title from webpage
1308                 self.report_extraction(video_id)
1309                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1310                 if mobj is None:
1311                         self._downloader.trouble(u'ERROR: unable to extract video title')
1312                         return
1313                 video_title = mobj.group(1).decode('utf-8')
1314                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1315
1316                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1317                 if mobj is None:
1318                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1319                         return
1320                 video_uploader = mobj.group(1).decode('utf-8')
1321
1322                 # Extract video thumbnail
1323                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1324                 if mobj is None:
1325                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1326                         return
1327                 video_thumbnail = mobj.group(1).decode('utf-8')
1328
1329                 # Extract video description
1330                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1331                 if mobj is None:
1332                         self._downloader.trouble(u'ERROR: unable to extract video description')
1333                         return
1334                 video_description = mobj.group(1).decode('utf-8')
1335                 if not video_description: video_description = 'No description available.'
1336
1337                 # Extract video height and width
1338                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1339                 if mobj is None:
1340                         self._downloader.trouble(u'ERROR: unable to extract video height')
1341                         return
1342                 yv_video_height = mobj.group(1)
1343
1344                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1345                 if mobj is None:
1346                         self._downloader.trouble(u'ERROR: unable to extract video width')
1347                         return
1348                 yv_video_width = mobj.group(1)
1349
1350                 # Retrieve video playlist to extract media URL
1351                 # I'm not completely sure what all these options are, but we
1352                 # seem to need most of them, otherwise the server sends a 401.
1353                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1354                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1355                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1356                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1357                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1358                 try:
1359                         self.report_download_webpage(video_id)
1360                         webpage = urllib2.urlopen(request).read()
1361                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1362                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1363                         return
1364
1365                 # Extract media URL from playlist XML
1366                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1367                 if mobj is None:
1368                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1369                         return
1370                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1371                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1372
1373                 try:
1374                         # Process video information
1375                         self._downloader.process_info({
1376                                 'id':           video_id.decode('utf-8'),
1377                                 'url':          video_url,
1378                                 'uploader':     video_uploader,
1379                                 'title':        video_title,
1380                                 'stitle':       simple_title,
1381                                 'ext':          video_extension.decode('utf-8'),
1382                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1383                                 'description':  video_description,
1384                                 'thumbnail':    video_thumbnail,
1385                                 'description':  video_description,
1386                         })
1387                 except UnavailableFormatError:
1388                         self._downloader.trouble(u'ERROR: format not available for video')
1389
1390
1391 class GenericIE(InfoExtractor):
1392         """Generic last-resort information extractor."""
1393
1394         def __init__(self, downloader=None):
1395                 InfoExtractor.__init__(self, downloader)
1396
1397         @staticmethod
1398         def suitable(url):
1399                 return True
1400
1401         def report_download_webpage(self, video_id):
1402                 """Report webpage download."""
1403                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1404                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1405
1406         def report_extraction(self, video_id):
1407                 """Report information extraction."""
1408                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1409
1410         def _real_initialize(self):
1411                 return
1412
1413         def _real_extract(self, url):
1414                 video_id = url.split('/')[-1]
1415                 request = urllib2.Request(url)
1416                 try:
1417                         self.report_download_webpage(video_id)
1418                         webpage = urllib2.urlopen(request).read()
1419                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1420                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1421                         return
1422                 except ValueError, err:
1423                         # since this is the last-resort InfoExtractor, if
1424                         # this error is thrown, it'll be thrown here
1425                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1426                         return
1427
1428                 # Start with something easy: JW Player in SWFObject
1429                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1430                 if mobj is None:
1431                         # Broaden the search a little bit
1432                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1433                 if mobj is None:
1434                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1435                         return
1436
1437                 # It's possible that one of the regexes
1438                 # matched, but returned an empty group:
1439                 if mobj.group(1) is None:
1440                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1441                         return
1442
1443                 video_url = urllib.unquote(mobj.group(1))
1444                 video_id  = os.path.basename(video_url)
1445
1446                 # here's a fun little line of code for you:
1447                 video_extension = os.path.splitext(video_id)[1][1:]
1448                 video_id        = os.path.splitext(video_id)[0]
1449
1450                 # it's tempting to parse this further, but you would
1451                 # have to take into account all the variations like
1452                 #   Video Title - Site Name
1453                 #   Site Name | Video Title
1454                 #   Video Title - Tagline | Site Name
1455                 # and so on and so forth; it's just not practical
1456                 mobj = re.search(r'<title>(.*)</title>', webpage)
1457                 if mobj is None:
1458                         self._downloader.trouble(u'ERROR: unable to extract title')
1459                         return
1460                 video_title = mobj.group(1).decode('utf-8')
1461                 video_title = sanitize_title(video_title)
1462                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1463
1464                 # video uploader is domain name
1465                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1466                 if mobj is None:
1467                         self._downloader.trouble(u'ERROR: unable to extract title')
1468                         return
1469                 video_uploader = mobj.group(1).decode('utf-8')
1470
1471                 try:
1472                         # Process video information
1473                         self._downloader.process_info({
1474                                 'id':           video_id.decode('utf-8'),
1475                                 'url':          video_url.decode('utf-8'),
1476                                 'uploader':     video_uploader,
1477                                 'title':        video_title,
1478                                 'stitle':       simple_title,
1479                                 'ext':          video_extension.decode('utf-8'),
1480                                 'format':       u'NA',
1481                         })
1482                 except UnavailableFormatError:
1483                         self._downloader.trouble(u'ERROR: format not available for video')
1484
1485
1486 class YoutubeSearchIE(InfoExtractor):
1487         """Information Extractor for YouTube search queries."""
1488         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1489         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1490         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1491         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1492         _youtube_ie = None
1493         _max_youtube_results = 1000
1494
1495         def __init__(self, youtube_ie, downloader=None):
1496                 InfoExtractor.__init__(self, downloader)
1497                 self._youtube_ie = youtube_ie
1498
1499         @staticmethod
1500         def suitable(url):
1501                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1502
1503         def report_download_page(self, query, pagenum):
1504                 """Report attempt to download playlist page with given number."""
1505                 query = query.decode(preferredencoding())
1506                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1507
1508         def _real_initialize(self):
1509                 self._youtube_ie.initialize()
1510
1511         def _real_extract(self, query):
1512                 mobj = re.match(self._VALID_QUERY, query)
1513                 if mobj is None:
1514                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1515                         return
1516
1517                 prefix, query = query.split(':')
1518                 prefix = prefix[8:]
1519                 query  = query.encode('utf-8')
1520                 if prefix == '':
1521                         self._download_n_results(query, 1)
1522                         return
1523                 elif prefix == 'all':
1524                         self._download_n_results(query, self._max_youtube_results)
1525                         return
1526                 else:
1527                         try:
1528                                 n = long(prefix)
1529                                 if n <= 0:
1530                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1531                                         return
1532                                 elif n > self._max_youtube_results:
1533                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1534                                         n = self._max_youtube_results
1535                                 self._download_n_results(query, n)
1536                                 return
1537                         except ValueError: # parsing prefix as integer fails
1538                                 self._download_n_results(query, 1)
1539                                 return
1540
1541         def _download_n_results(self, query, n):
1542                 """Downloads a specified number of results for a query"""
1543
1544                 video_ids = []
1545                 already_seen = set()
1546                 pagenum = 1
1547
1548                 while True:
1549                         self.report_download_page(query, pagenum)
1550                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1551                         request = urllib2.Request(result_url, None, std_headers)
1552                         try:
1553                                 page = urllib2.urlopen(request).read()
1554                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1555                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1556                                 return
1557
1558                         # Extract video identifiers
1559                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1560                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1561                                 if video_id not in already_seen:
1562                                         video_ids.append(video_id)
1563                                         already_seen.add(video_id)
1564                                         if len(video_ids) == n:
1565                                                 # Specified n videos reached
1566                                                 for id in video_ids:
1567                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1568                                                 return
1569
1570                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1571                                 for id in video_ids:
1572                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1573                                 return
1574
1575                         pagenum = pagenum + 1
1576
1577 class GoogleSearchIE(InfoExtractor):
1578         """Information Extractor for Google Video search queries."""
1579         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1580         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1581         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1582         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1583         _google_ie = None
1584         _max_google_results = 1000
1585
1586         def __init__(self, google_ie, downloader=None):
1587                 InfoExtractor.__init__(self, downloader)
1588                 self._google_ie = google_ie
1589
1590         @staticmethod
1591         def suitable(url):
1592                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1593
1594         def report_download_page(self, query, pagenum):
1595                 """Report attempt to download playlist page with given number."""
1596                 query = query.decode(preferredencoding())
1597                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1598
1599         def _real_initialize(self):
1600                 self._google_ie.initialize()
1601
1602         def _real_extract(self, query):
1603                 mobj = re.match(self._VALID_QUERY, query)
1604                 if mobj is None:
1605                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1606                         return
1607
1608                 prefix, query = query.split(':')
1609                 prefix = prefix[8:]
1610                 query  = query.encode('utf-8')
1611                 if prefix == '':
1612                         self._download_n_results(query, 1)
1613                         return
1614                 elif prefix == 'all':
1615                         self._download_n_results(query, self._max_google_results)
1616                         return
1617                 else:
1618                         try:
1619                                 n = long(prefix)
1620                                 if n <= 0:
1621                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1622                                         return
1623                                 elif n > self._max_google_results:
1624                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1625                                         n = self._max_google_results
1626                                 self._download_n_results(query, n)
1627                                 return
1628                         except ValueError: # parsing prefix as integer fails
1629                                 self._download_n_results(query, 1)
1630                                 return
1631
1632         def _download_n_results(self, query, n):
1633                 """Downloads a specified number of results for a query"""
1634
1635                 video_ids = []
1636                 already_seen = set()
1637                 pagenum = 1
1638
1639                 while True:
1640                         self.report_download_page(query, pagenum)
1641                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1642                         request = urllib2.Request(result_url, None, std_headers)
1643                         try:
1644                                 page = urllib2.urlopen(request).read()
1645                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1646                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1647                                 return
1648
1649                         # Extract video identifiers
1650                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1651                                 video_id = mobj.group(1)
1652                                 if video_id not in already_seen:
1653                                         video_ids.append(video_id)
1654                                         already_seen.add(video_id)
1655                                         if len(video_ids) == n:
1656                                                 # Specified n videos reached
1657                                                 for id in video_ids:
1658                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1659                                                 return
1660
1661                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1662                                 for id in video_ids:
1663                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1664                                 return
1665
1666                         pagenum = pagenum + 1
1667
1668 class YahooSearchIE(InfoExtractor):
1669         """Information Extractor for Yahoo! Video search queries."""
1670         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1671         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1672         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1673         _MORE_PAGES_INDICATOR = r'\s*Next'
1674         _yahoo_ie = None
1675         _max_yahoo_results = 1000
1676
1677         def __init__(self, yahoo_ie, downloader=None):
1678                 InfoExtractor.__init__(self, downloader)
1679                 self._yahoo_ie = yahoo_ie
1680
1681         @staticmethod
1682         def suitable(url):
1683                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1684
1685         def report_download_page(self, query, pagenum):
1686                 """Report attempt to download playlist page with given number."""
1687                 query = query.decode(preferredencoding())
1688                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1689
1690         def _real_initialize(self):
1691                 self._yahoo_ie.initialize()
1692
1693         def _real_extract(self, query):
1694                 mobj = re.match(self._VALID_QUERY, query)
1695                 if mobj is None:
1696                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1697                         return
1698
1699                 prefix, query = query.split(':')
1700                 prefix = prefix[8:]
1701                 query  = query.encode('utf-8')
1702                 if prefix == '':
1703                         self._download_n_results(query, 1)
1704                         return
1705                 elif prefix == 'all':
1706                         self._download_n_results(query, self._max_yahoo_results)
1707                         return
1708                 else:
1709                         try:
1710                                 n = long(prefix)
1711                                 if n <= 0:
1712                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1713                                         return
1714                                 elif n > self._max_yahoo_results:
1715                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1716                                         n = self._max_yahoo_results
1717                                 self._download_n_results(query, n)
1718                                 return
1719                         except ValueError: # parsing prefix as integer fails
1720                                 self._download_n_results(query, 1)
1721                                 return
1722
1723         def _download_n_results(self, query, n):
1724                 """Downloads a specified number of results for a query"""
1725
1726                 video_ids = []
1727                 already_seen = set()
1728                 pagenum = 1
1729
1730                 while True:
1731                         self.report_download_page(query, pagenum)
1732                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1733                         request = urllib2.Request(result_url, None, std_headers)
1734                         try:
1735                                 page = urllib2.urlopen(request).read()
1736                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1737                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1738                                 return
1739
1740                         # Extract video identifiers
1741                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1742                                 video_id = mobj.group(1)
1743                                 if video_id not in already_seen:
1744                                         video_ids.append(video_id)
1745                                         already_seen.add(video_id)
1746                                         if len(video_ids) == n:
1747                                                 # Specified n videos reached
1748                                                 for id in video_ids:
1749                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1750                                                 return
1751
1752                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1753                                 for id in video_ids:
1754                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1755                                 return
1756
1757                         pagenum = pagenum + 1
1758
1759 class YoutubePlaylistIE(InfoExtractor):
1760         """Information Extractor for YouTube playlists."""
1761
1762         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1763         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1764         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1765         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1766         _youtube_ie = None
1767
1768         def __init__(self, youtube_ie, downloader=None):
1769                 InfoExtractor.__init__(self, downloader)
1770                 self._youtube_ie = youtube_ie
1771
1772         @staticmethod
1773         def suitable(url):
1774                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1775
1776         def report_download_page(self, playlist_id, pagenum):
1777                 """Report attempt to download playlist page with given number."""
1778                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1779
1780         def _real_initialize(self):
1781                 self._youtube_ie.initialize()
1782
1783         def _real_extract(self, url):
1784                 # Extract playlist id
1785                 mobj = re.match(self._VALID_URL, url)
1786                 if mobj is None:
1787                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1788                         return
1789
1790                 # Download playlist pages
1791                 playlist_id = mobj.group(1)
1792                 video_ids = []
1793                 pagenum = 1
1794
1795                 while True:
1796                         self.report_download_page(playlist_id, pagenum)
1797                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1798                         try:
1799                                 page = urllib2.urlopen(request).read()
1800                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1801                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1802                                 return
1803
1804                         # Extract video identifiers
1805                         ids_in_page = []
1806                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1807                                 if mobj.group(1) not in ids_in_page:
1808                                         ids_in_page.append(mobj.group(1))
1809                         video_ids.extend(ids_in_page)
1810
1811                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1812                                 break
1813                         pagenum = pagenum + 1
1814
1815                 for id in video_ids:
1816                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1817                 return
1818
1819 class YoutubeUserIE(InfoExtractor):
1820         """Information Extractor for YouTube users."""
1821
1822         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1823         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1824         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1825         _youtube_ie = None
1826
1827         def __init__(self, youtube_ie, downloader=None):
1828                 InfoExtractor.__init__(self, downloader)
1829                 self._youtube_ie = youtube_ie
1830
1831         @staticmethod
1832         def suitable(url):
1833                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1834
1835         def report_download_page(self, username):
1836                 """Report attempt to download user page."""
1837                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1838
1839         def _real_initialize(self):
1840                 self._youtube_ie.initialize()
1841
1842         def _real_extract(self, url):
1843                 # Extract username
1844                 mobj = re.match(self._VALID_URL, url)
1845                 if mobj is None:
1846                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1847                         return
1848
1849                 # Download user page
1850                 username = mobj.group(1)
1851                 video_ids = []
1852                 pagenum = 1
1853
1854                 self.report_download_page(username)
1855                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1856                 try:
1857                         page = urllib2.urlopen(request).read()
1858                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1859                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1860                         return
1861
1862                 # Extract video identifiers
1863                 ids_in_page = []
1864
1865                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1866                         if mobj.group(1) not in ids_in_page:
1867                                 ids_in_page.append(mobj.group(1))
1868                 video_ids.extend(ids_in_page)
1869
1870                 for id in video_ids:
1871                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1872                 return
1873
1874 class PostProcessor(object):
1875         """Post Processor class.
1876
1877         PostProcessor objects can be added to downloaders with their
1878         add_post_processor() method. When the downloader has finished a
1879         successful download, it will take its internal chain of PostProcessors
1880         and start calling the run() method on each one of them, first with
1881         an initial argument and then with the returned value of the previous
1882         PostProcessor.
1883
1884         The chain will be stopped if one of them ever returns None or the end
1885         of the chain is reached.
1886
1887         PostProcessor objects follow a "mutual registration" process similar
1888         to InfoExtractor objects.
1889         """
1890
1891         _downloader = None
1892
1893         def __init__(self, downloader=None):
1894                 self._downloader = downloader
1895
1896         def set_downloader(self, downloader):
1897                 """Sets the downloader for this PP."""
1898                 self._downloader = downloader
1899
1900         def run(self, information):
1901                 """Run the PostProcessor.
1902
1903                 The "information" argument is a dictionary like the ones
1904                 composed by InfoExtractors. The only difference is that this
1905                 one has an extra field called "filepath" that points to the
1906                 downloaded file.
1907
1908                 When this method returns None, the postprocessing chain is
1909                 stopped. However, this method may return an information
1910                 dictionary that will be passed to the next postprocessing
1911                 object in the chain. It can be the one it received after
1912                 changing some fields.
1913
1914                 In addition, this method may raise a PostProcessingError
1915                 exception that will be taken into account by the downloader
1916                 it was called from.
1917                 """
1918                 return information # by default, do nothing
1919
1920 ### MAIN PROGRAM ###
1921 if __name__ == '__main__':
1922         try:
1923                 # Modules needed only when running the main program
1924                 import getpass
1925                 import optparse
1926
1927                 # Function to update the program file with the latest version from bitbucket.org
1928                 def update_self(downloader, filename):
1929                         # Note: downloader only used for options
1930                         if not os.access (filename, os.W_OK):
1931                                 sys.exit('ERROR: no write permissions on %s' % filename)
1932
1933                         downloader.to_stdout('Updating to latest stable version...')
1934                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1935                         latest_version = urllib.urlopen(latest_url).read().strip()
1936                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1937                         newcontent = urllib.urlopen(prog_url).read()
1938                         stream = open(filename, 'w')
1939                         stream.write(newcontent)
1940                         stream.close()
1941                         downloader.to_stdout('Updated to version %s' % latest_version)
1942
1943                 # General configuration
1944                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1945                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1946                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1947
1948                 # Parse command line
1949                 parser = optparse.OptionParser(
1950                         usage='Usage: %prog [options] url...',
1951                         version='2010.04.04',
1952                         conflict_handler='resolve',
1953                 )
1954
1955                 parser.add_option('-h', '--help',
1956                                 action='help', help='print this help text and exit')
1957                 parser.add_option('-v', '--version',
1958                                 action='version', help='print program version and exit')
1959                 parser.add_option('-U', '--update',
1960                                 action='store_true', dest='update_self', help='update this program to latest stable version')
1961                 parser.add_option('-i', '--ignore-errors',
1962                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1963                 parser.add_option('-r', '--rate-limit',
1964                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1965
1966                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1967                 authentication.add_option('-u', '--username',
1968                                 dest='username', metavar='UN', help='account username')
1969                 authentication.add_option('-p', '--password',
1970                                 dest='password', metavar='PW', help='account password')
1971                 authentication.add_option('-n', '--netrc',
1972                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1973                 parser.add_option_group(authentication)
1974
1975                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1976                 video_format.add_option('-f', '--format',
1977                                 action='store', dest='format', metavar='FMT', help='video format code')
1978                 video_format.add_option('-b', '--best-quality',
1979                                 action='store_const', dest='format', help='download the best quality video possible', const='0')
1980                 video_format.add_option('-m', '--mobile-version',
1981                                 action='store_const', dest='format', help='alias for -f 17', const='17')
1982                 video_format.add_option('-d', '--high-def',
1983                                 action='store_const', dest='format', help='alias for -f 22', const='22')
1984                 video_format.add_option('--all-formats',
1985                                 action='store_const', dest='format', help='download all available video formats', const='-1')
1986                 parser.add_option_group(video_format)
1987
1988                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1989                 verbosity.add_option('-q', '--quiet',
1990                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1991                 verbosity.add_option('-s', '--simulate',
1992                                 action='store_true', dest='simulate', help='do not download video', default=False)
1993                 verbosity.add_option('-g', '--get-url',
1994                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1995                 verbosity.add_option('-e', '--get-title',
1996                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1997                 verbosity.add_option('--get-thumbnail',
1998                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
1999                 verbosity.add_option('--get-description',
2000                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2001                 verbosity.add_option('--no-progress',
2002                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2003                 parser.add_option_group(verbosity)
2004
2005                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2006                 filesystem.add_option('-t', '--title',
2007                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2008                 filesystem.add_option('-l', '--literal',
2009                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2010                 filesystem.add_option('-o', '--output',
2011                                 dest='outtmpl', metavar='TPL', help='output filename template')
2012                 filesystem.add_option('-a', '--batch-file',
2013                                 dest='batchfile', metavar='F', help='file containing URLs to download')
2014                 filesystem.add_option('-w', '--no-overwrites',
2015                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2016                 filesystem.add_option('-c', '--continue',
2017                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2018                 parser.add_option_group(filesystem)
2019
2020                 (opts, args) = parser.parse_args()
2021
2022                 # Batch file verification
2023                 batchurls = []
2024                 if opts.batchfile is not None:
2025                         try:
2026                                 batchurls = open(opts.batchfile, 'r').readlines()
2027                                 batchurls = [x.strip() for x in batchurls]
2028                                 batchurls = [x for x in batchurls if len(x) > 0]
2029                         except IOError:
2030                                 sys.exit(u'ERROR: batch file could not be read')
2031                 all_urls = batchurls + args
2032
2033                 # Conflicting, missing and erroneous options
2034                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2035                         parser.error(u'using .netrc conflicts with giving username/password')
2036                 if opts.password is not None and opts.username is None:
2037                         parser.error(u'account username missing')
2038                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2039                         parser.error(u'using output template conflicts with using title or literal title')
2040                 if opts.usetitle and opts.useliteral:
2041                         parser.error(u'using title conflicts with using literal title')
2042                 if opts.username is not None and opts.password is None:
2043                         opts.password = getpass.getpass(u'Type account password and press return:')
2044                 if opts.ratelimit is not None:
2045                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2046                         if numeric_limit is None:
2047                                 parser.error(u'invalid rate limit specified')
2048                         opts.ratelimit = numeric_limit
2049
2050                 # Information extractors
2051                 youtube_ie = YoutubeIE()
2052                 metacafe_ie = MetacafeIE(youtube_ie)
2053                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2054                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2055                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2056                 google_ie = GoogleIE()
2057                 google_search_ie = GoogleSearchIE(google_ie)
2058                 photobucket_ie = PhotobucketIE()
2059                 yahoo_ie = YahooIE()
2060                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2061                 generic_ie = GenericIE()
2062
2063                 # File downloader
2064                 fd = FileDownloader({
2065                         'usenetrc': opts.usenetrc,
2066                         'username': opts.username,
2067                         'password': opts.password,
2068                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2069                         'forceurl': opts.geturl,
2070                         'forcetitle': opts.gettitle,
2071                         'forcethumbnail': opts.getthumbnail,
2072                         'forcedescription': opts.getdescription,
2073                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2074                         'format': opts.format,
2075                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2076                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2077                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2078                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2079                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2080                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2081                                 or u'%(id)s.%(ext)s'),
2082                         'ignoreerrors': opts.ignoreerrors,
2083                         'ratelimit': opts.ratelimit,
2084                         'nooverwrites': opts.nooverwrites,
2085                         'continuedl': opts.continue_dl,
2086                         'noprogress': opts.noprogress,
2087                         })
2088                 fd.add_info_extractor(youtube_search_ie)
2089                 fd.add_info_extractor(youtube_pl_ie)
2090                 fd.add_info_extractor(youtube_user_ie)
2091                 fd.add_info_extractor(metacafe_ie)
2092                 fd.add_info_extractor(youtube_ie)
2093                 fd.add_info_extractor(google_ie)
2094                 fd.add_info_extractor(google_search_ie)
2095                 fd.add_info_extractor(photobucket_ie)
2096                 fd.add_info_extractor(yahoo_ie)
2097                 fd.add_info_extractor(yahoo_search_ie)
2098
2099                 # This must come last since it's the
2100                 # fallback if none of the others work
2101                 fd.add_info_extractor(generic_ie)
2102
2103                 # Update version
2104                 if opts.update_self:
2105                         update_self(fd, sys.argv[0])
2106
2107                 # Maybe do nothing
2108                 if len(all_urls) < 1:
2109                         if not opts.update_self:
2110                                 parser.error(u'you must provide at least one URL')
2111                         else:
2112                                 sys.exit()
2113                 retcode = fd.download(all_urls)
2114                 sys.exit(retcode)
2115
2116         except DownloadError:
2117                 sys.exit(1)
2118         except SameFileError:
2119                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2120         except KeyboardInterrupt:
2121                 sys.exit(u'\nERROR: Interrupted by user')