youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import collections
   7 import errno
   8 import io
   9 import json
  10 import os
  11 import platform
  12 import re
  13 import shutil
  14 import subprocess
  15 import socket
  16 import sys
  17 import time
  18 import traceback
  19
  20 if os.name == 'nt':
  21     import ctypes
  22
  23 from .utils import (
  24     compat_cookiejar,
  25     compat_http_client,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_request,
  29     ContentTooShortError,
  30     date_from_str,
  31     DateRange,
  32     determine_ext,
  33     DownloadError,
  34     encodeFilename,
  35     ExtractorError,
  36     format_bytes,
  37     formatSeconds,
  38     get_term_width,
  39     locked_file,
  40     make_HTTPS_handler,
  41     MaxDownloadsReached,
  42     PostProcessingError,
  43     platform_name,
  44     preferredencoding,
  45     SameFileError,
  46     sanitize_filename,
  47     subtitles_filename,
  48     takewhile_inclusive,
  49     UnavailableVideoError,
  50     write_json_file,
  51     write_string,
  52     YoutubeDLHandler,
  53 )
  54 from .extractor import get_info_extractor, gen_extractors
  55 from .FileDownloader import FileDownloader
  56 from .version import __version__
  57
  58
  59 class YoutubeDL(object):
  60     """YoutubeDL class.
  61
  62     YoutubeDL objects are the ones responsible of downloading the
  63     actual video file and writing it to disk if the user has requested
  64     it, among some other tasks. In most cases there should be one per
  65     program. As, given a video URL, the downloader doesn't know how to
  66     extract all the needed information, task that InfoExtractors do, it
  67     has to pass the URL to one of them.
  68
  69     For this, YoutubeDL objects have a method that allows
  70     InfoExtractors to be registered in a given order. When it is passed
  71     a URL, the YoutubeDL object handles it to the first InfoExtractor it
  72     finds that reports being able to handle it. The InfoExtractor extracts
  73     all the information about the video or videos the URL refers to, and
  74     YoutubeDL process the extracted information, possibly using a File
  75     Downloader to download the video.
  76
  77     YoutubeDL objects accept a lot of parameters. In order not to saturate
  78     the object constructor with arguments, it receives a dictionary of
  79     options instead. These options are available through the params
  80     attribute for the InfoExtractors to use. The YoutubeDL also
  81     registers itself as the downloader in charge for the InfoExtractors
  82     that are added to it, so this is a "mutual registration".
  83
  84     Available options:
  85
  86     username:          Username for authentication purposes.
  87     password:          Password for authentication purposes.
  88     videopassword:     Password for acces a video.
  89     usenetrc:          Use netrc for authentication instead.
  90     verbose:           Print additional info to stdout.
  91     quiet:             Do not print messages to stdout.
  92     forceurl:          Force printing final URL.
  93     forcetitle:        Force printing title.
  94     forceid:           Force printing ID.
  95     forcethumbnail:    Force printing thumbnail URL.
  96     forcedescription:  Force printing description.
  97     forcefilename:     Force printing final filename.
  98     forceduration:     Force printing duration.
  99     forcejson:         Force printing info_dict as JSON.
 100     simulate:          Do not download the video files.
 101     format:            Video format code.
 102     format_limit:      Highest quality format to try.
 103     outtmpl:           Template for output names.
 104     restrictfilenames: Do not allow "&" and spaces in file names
 105     ignoreerrors:      Do not stop on download errors.
 106     nooverwrites:      Prevent overwriting files.
 107     playliststart:     Playlist item to start at.
 108     playlistend:       Playlist item to end at.
 109     matchtitle:        Download only matching titles.
 110     rejecttitle:       Reject downloads for matching titles.
 111     logger:            Log messages to a logging.Logger instance.
 112     logtostderr:       Log messages to stderr instead of stdout.
 113     writedescription:  Write the video description to a .description file
 114     writeinfojson:     Write the video description to a .info.json file
 115     writeannotations:  Write the video annotations to a .annotations.xml file
 116     writethumbnail:    Write the thumbnail image to a file
 117     writesubtitles:    Write the video subtitles to a file
 118     writeautomaticsub: Write the automatic subtitles to a file
 119     allsubtitles:      Downloads all the subtitles of the video
 120                        (requires writesubtitles or writeautomaticsub)
 121     listsubtitles:     Lists all available subtitles for the video
 122     subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
 123     subtitleslangs:    List of languages of the subtitles to download
 124     keepvideo:         Keep the video file after post-processing
 125     daterange:         A DateRange object, download only if the upload_date is in the range.
 126     skip_download:     Skip the actual download of the video file
 127     cachedir:          Location of the cache files in the filesystem.
 128                        None to disable filesystem cache.
 129     noplaylist:        Download single video instead of a playlist if in doubt.
 130     age_limit:         An integer representing the user's age in years.
 131                        Unsuitable videos for the given age are skipped.
 132     min_views:         An integer representing the minimum view count the video
 133                        must have in order to not be skipped.
 134                        Videos without view count information are always
 135                        downloaded. None for no limit.
 136     max_views:         An integer representing the maximum view count.
 137                        Videos that are more popular than that are not
 138                        downloaded.
 139                        Videos without view count information are always
 140                        downloaded. None for no limit.
 141     download_archive:  File name of a file where all downloads are recorded.
 142                        Videos already present in the file are not downloaded
 143                        again.
 144     cookiefile:        File name where cookies should be read from and dumped to.
 145     nocheckcertificate:Do not verify SSL certificates
 146     proxy:             URL of the proxy server to use
 147     socket_timeout:    Time to wait for unresponsive hosts, in seconds
 148     bidi_workaround:   Work around buggy terminals without bidirectional text
 149                        support, using fridibi
 150
 151     The following parameters are not used by YoutubeDL itself, they are used by
 152     the FileDownloader:
 153     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
 154     noresizebuffer, retries, continuedl, noprogress, consoletitle
 155     """
 156
 157     params = None
 158     _ies = []
 159     _pps = []
 160     _download_retcode = None
 161     _num_downloads = None
 162     _screen_file = None
 163
 164     def __init__(self, params=None):
 165         """Create a FileDownloader object with the given options."""
 166         self._ies = []
 167         self._ies_instances = {}
 168         self._pps = []
 169         self._progress_hooks = []
 170         self._download_retcode = 0
 171         self._num_downloads = 0
 172         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 173         self._err_file = sys.stderr
 174         self.params = {} if params is None else params
 175
 176         if params.get('bidi_workaround', False):
 177             try:
 178                 import pty
 179                 master, slave = pty.openpty()
 180                 width = get_term_width()
 181                 if width is None:
 182                     width_args = []
 183                 else:
 184                     width_args = ['-w', str(width)]
 185                 self._fribidi = subprocess.Popen(
 186                     ['fribidi', '-c', 'UTF-8'] + width_args,
 187                     stdin=subprocess.PIPE,
 188                     stdout=slave,
 189                     stderr=self._err_file)
 190                 self._fribidi_channel = os.fdopen(master, 'rb')
 191             except OSError as ose:
 192                 if ose.errno == 2:
 193                     self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
 194                 else:
 195                     raise
 196
 197         if (sys.version_info >= (3,) and sys.platform != 'win32' and
 198                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
 199                 and not params['restrictfilenames']):
 200             # On Python 3, the Unicode filesystem API will throw errors (#1474)
 201             self.report_warning(
 202                 u'Assuming --restrict-filenames since file system encoding '
 203                 u'cannot encode all charactes. '
 204                 u'Set the LC_ALL environment variable to fix this.')
 205             self.params['restrictfilenames'] = True
 206
 207         self.fd = FileDownloader(self, self.params)
 208
 209         if '%(stitle)s' in self.params.get('outtmpl', ''):
 210             self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
 211
 212         self._setup_opener()
 213
 214     def add_info_extractor(self, ie):
 215         """Add an InfoExtractor object to the end of the list."""
 216         self._ies.append(ie)
 217         self._ies_instances[ie.ie_key()] = ie
 218         ie.set_downloader(self)
 219
 220     def get_info_extractor(self, ie_key):
 221         """
 222         Get an instance of an IE with name ie_key, it will try to get one from
 223         the _ies list, if there's no instance it will create a new one and add
 224         it to the extractor list.
 225         """
 226         ie = self._ies_instances.get(ie_key)
 227         if ie is None:
 228             ie = get_info_extractor(ie_key)()
 229             self.add_info_extractor(ie)
 230         return ie
 231
 232     def add_default_info_extractors(self):
 233         """
 234         Add the InfoExtractors returned by gen_extractors to the end of the list
 235         """
 236         for ie in gen_extractors():
 237             self.add_info_extractor(ie)
 238
 239     def add_post_processor(self, pp):
 240         """Add a PostProcessor object to the end of the chain."""
 241         self._pps.append(pp)
 242         pp.set_downloader(self)
 243
 244     def _bidi_workaround(self, message):
 245         if not hasattr(self, '_fribidi_channel'):
 246             return message
 247
 248         assert type(message) == type(u'')
 249         line_count = message.count(u'\n') + 1
 250         self._fribidi.stdin.write((message + u'\n').encode('utf-8'))
 251         self._fribidi.stdin.flush()
 252         res = u''.join(self._fribidi_channel.readline().decode('utf-8')
 253                        for _ in range(line_count))
 254         return res[:-len(u'\n')]
 255
 256     def to_screen(self, message, skip_eol=False):
 257         """Print message to stdout if not in quiet mode."""
 258         return self.to_stdout(message, skip_eol, check_quiet=True)
 259
 260     def to_stdout(self, message, skip_eol=False, check_quiet=False):
 261         """Print message to stdout if not in quiet mode."""
 262         if self.params.get('logger'):
 263             self.params['logger'].debug(message)
 264         elif not check_quiet or not self.params.get('quiet', False):
 265             message = self._bidi_workaround(message)
 266             terminator = [u'\n', u''][skip_eol]
 267             output = message + terminator
 268
 269             write_string(output, self._screen_file)
 270
 271     def to_stderr(self, message):
 272         """Print message to stderr."""
 273         assert type(message) == type(u'')
 274         if self.params.get('logger'):
 275             self.params['logger'].error(message)
 276         else:
 277             message = self._bidi_workaround(message)
 278             output = message + u'\n'
 279             write_string(output, self._err_file)
 280
 281     def to_console_title(self, message):
 282         if not self.params.get('consoletitle', False):
 283             return
 284         if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 285             # c_wchar_p() might not be necessary if `message` is
 286             # already of type unicode()
 287             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 288         elif 'TERM' in os.environ:
 289             write_string(u'\033]0;%s\007' % message, self._screen_file)
 290
 291     def save_console_title(self):
 292         if not self.params.get('consoletitle', False):
 293             return
 294         if 'TERM' in os.environ:
 295             # Save the title on stack
 296             write_string(u'\033[22;0t', self._screen_file)
 297
 298     def restore_console_title(self):
 299         if not self.params.get('consoletitle', False):
 300             return
 301         if 'TERM' in os.environ:
 302             # Restore the title from stack
 303             write_string(u'\033[23;0t', self._screen_file)
 304
 305     def __enter__(self):
 306         self.save_console_title()
 307         return self
 308
 309     def __exit__(self, *args):
 310         self.restore_console_title()
 311
 312         if self.params.get('cookiefile') is not None:
 313             self.cookiejar.save()
 314
 315     def trouble(self, message=None, tb=None):
 316         """Determine action to take when a download problem appears.
 317
 318         Depending on if the downloader has been configured to ignore
 319         download errors or not, this method may throw an exception or
 320         not when errors are found, after printing the message.
 321
 322         tb, if given, is additional traceback information.
 323         """
 324         if message is not None:
 325             self.to_stderr(message)
 326         if self.params.get('verbose'):
 327             if tb is None:
 328                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 329                     tb = u''
 330                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 331                         tb += u''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 332                     tb += compat_str(traceback.format_exc())
 333                 else:
 334                     tb_data = traceback.format_list(traceback.extract_stack())
 335                     tb = u''.join(tb_data)
 336             self.to_stderr(tb)
 337         if not self.params.get('ignoreerrors', False):
 338             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 339                 exc_info = sys.exc_info()[1].exc_info
 340             else:
 341                 exc_info = sys.exc_info()
 342             raise DownloadError(message, exc_info)
 343         self._download_retcode = 1
 344
 345     def report_warning(self, message):
 346         '''
 347         Print the message to stderr, it will be prefixed with 'WARNING:'
 348         If stderr is a tty file the 'WARNING:' will be colored
 349         '''
 350         if self._err_file.isatty() and os.name != 'nt':
 351             _msg_header = u'\033[0;33mWARNING:\033[0m'
 352         else:
 353             _msg_header = u'WARNING:'
 354         warning_message = u'%s %s' % (_msg_header, message)
 355         self.to_stderr(warning_message)
 356
 357     def report_error(self, message, tb=None):
 358         '''
 359         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 360         in red if stderr is a tty file.
 361         '''
 362         if self._err_file.isatty() and os.name != 'nt':
 363             _msg_header = u'\033[0;31mERROR:\033[0m'
 364         else:
 365             _msg_header = u'ERROR:'
 366         error_message = u'%s %s' % (_msg_header, message)
 367         self.trouble(error_message, tb)
 368
 369     def report_file_already_downloaded(self, file_name):
 370         """Report file has already been fully downloaded."""
 371         try:
 372             self.to_screen(u'[download] %s has already been downloaded' % file_name)
 373         except UnicodeEncodeError:
 374             self.to_screen(u'[download] The file has already been downloaded')
 375
 376     def increment_downloads(self):
 377         """Increment the ordinal that assigns a number to each file."""
 378         self._num_downloads += 1
 379
 380     def prepare_filename(self, info_dict):
 381         """Generate the output filename."""
 382         try:
 383             template_dict = dict(info_dict)
 384
 385             template_dict['epoch'] = int(time.time())
 386             autonumber_size = self.params.get('autonumber_size')
 387             if autonumber_size is None:
 388                 autonumber_size = 5
 389             autonumber_templ = u'%0' + str(autonumber_size) + u'd'
 390             template_dict['autonumber'] = autonumber_templ % self._num_downloads
 391             if template_dict.get('playlist_index') is not None:
 392                 template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
 393
 394             sanitize = lambda k, v: sanitize_filename(
 395                 compat_str(v),
 396                 restricted=self.params.get('restrictfilenames'),
 397                 is_id=(k == u'id'))
 398             template_dict = dict((k, sanitize(k, v))
 399                                  for k, v in template_dict.items()
 400                                  if v is not None)
 401             template_dict = collections.defaultdict(lambda: u'NA', template_dict)
 402
 403             tmpl = os.path.expanduser(self.params['outtmpl'])
 404             filename = tmpl % template_dict
 405             return filename
 406         except ValueError as err:
 407             self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
 408             return None
 409
 410     def _match_entry(self, info_dict):
 411         """ Returns None iff the file should be downloaded """
 412
 413         video_title = info_dict.get('title', info_dict.get('id', u'video'))
 414         if 'title' in info_dict:
 415             # This can happen when we're just evaluating the playlist
 416             title = info_dict['title']
 417             matchtitle = self.params.get('matchtitle', False)
 418             if matchtitle:
 419                 if not re.search(matchtitle, title, re.IGNORECASE):
 420                     return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
 421             rejecttitle = self.params.get('rejecttitle', False)
 422             if rejecttitle:
 423                 if re.search(rejecttitle, title, re.IGNORECASE):
 424                     return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 425         date = info_dict.get('upload_date', None)
 426         if date is not None:
 427             dateRange = self.params.get('daterange', DateRange())
 428             if date not in dateRange:
 429                 return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 430         view_count = info_dict.get('view_count', None)
 431         if view_count is not None:
 432             min_views = self.params.get('min_views')
 433             if min_views is not None and view_count < min_views:
 434                 return u'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
 435             max_views = self.params.get('max_views')
 436             if max_views is not None and view_count > max_views:
 437                 return u'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
 438         age_limit = self.params.get('age_limit')
 439         if age_limit is not None:
 440             if age_limit < info_dict.get('age_limit', 0):
 441                 return u'Skipping "' + title + '" because it is age restricted'
 442         if self.in_download_archive(info_dict):
 443             return u'%s has already been recorded in archive' % video_title
 444         return None
 445
 446     @staticmethod
 447     def add_extra_info(info_dict, extra_info):
 448         '''Set the keys from extra_info in info dict if they are missing'''
 449         for key, value in extra_info.items():
 450             info_dict.setdefault(key, value)
 451
 452     def extract_info(self, url, download=True, ie_key=None, extra_info={},
 453                      process=True):
 454         '''
 455         Returns a list with a dictionary for each video we find.
 456         If 'download', also downloads the videos.
 457         extra_info is a dict containing the extra values to add to each result
 458          '''
 459
 460         if ie_key:
 461             ies = [self.get_info_extractor(ie_key)]
 462         else:
 463             ies = self._ies
 464
 465         for ie in ies:
 466             if not ie.suitable(url):
 467                 continue
 468
 469             if not ie.working():
 470                 self.report_warning(u'The program functionality for this site has been marked as broken, '
 471                                     u'and will probably not work.')
 472
 473             try:
 474                 ie_result = ie.extract(url)
 475                 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
 476                     break
 477                 if isinstance(ie_result, list):
 478                     # Backwards compatibility: old IE result format
 479                     ie_result = {
 480                         '_type': 'compat_list',
 481                         'entries': ie_result,
 482                     }
 483                 self.add_extra_info(ie_result,
 484                     {
 485                         'extractor': ie.IE_NAME,
 486                         'webpage_url': url,
 487                         'extractor_key': ie.ie_key(),
 488                     })
 489                 if process:
 490                     return self.process_ie_result(ie_result, download, extra_info)
 491                 else:
 492                     return ie_result
 493             except ExtractorError as de: # An error we somewhat expected
 494                 self.report_error(compat_str(de), de.format_traceback())
 495                 break
 496             except Exception as e:
 497                 if self.params.get('ignoreerrors', False):
 498                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
 499                     break
 500                 else:
 501                     raise
 502         else:
 503             self.report_error(u'no suitable InfoExtractor: %s' % url)
 504
 505     def process_ie_result(self, ie_result, download=True, extra_info={}):
 506         """
 507         Take the result of the ie(may be modified) and resolve all unresolved
 508         references (URLs, playlist items).
 509
 510         It will also download the videos if 'download'.
 511         Returns the resolved ie_result.
 512         """
 513
 514         result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
 515         if result_type == 'video':
 516             self.add_extra_info(ie_result, extra_info)
 517             return self.process_video_result(ie_result, download=download)
 518         elif result_type == 'url':
 519             # We have to add extra_info to the results because it may be
 520             # contained in a playlist
 521             return self.extract_info(ie_result['url'],
 522                                      download,
 523                                      ie_key=ie_result.get('ie_key'),
 524                                      extra_info=extra_info)
 525         elif result_type == 'url_transparent':
 526             # Use the information from the embedding page
 527             info = self.extract_info(
 528                 ie_result['url'], ie_key=ie_result.get('ie_key'),
 529                 extra_info=extra_info, download=False, process=False)
 530
 531             def make_result(embedded_info):
 532                 new_result = ie_result.copy()
 533                 for f in ('_type', 'url', 'ext', 'player_url', 'formats',
 534                           'entries', 'urlhandle', 'ie_key', 'duration',
 535                           'subtitles', 'annotations', 'format',
 536                           'thumbnail', 'thumbnails'):
 537                     if f in new_result:
 538                         del new_result[f]
 539                     if f in embedded_info:
 540                         new_result[f] = embedded_info[f]
 541                 return new_result
 542             new_result = make_result(info)
 543
 544             assert new_result.get('_type') != 'url_transparent'
 545             if new_result.get('_type') == 'compat_list':
 546                 new_result['entries'] = [
 547                     make_result(e) for e in new_result['entries']]
 548
 549             return self.process_ie_result(
 550                 new_result, download=download, extra_info=extra_info)
 551         elif result_type == 'playlist':
 552             # We process each entry in the playlist
 553             playlist = ie_result.get('title', None) or ie_result.get('id', None)
 554             self.to_screen(u'[download] Downloading playlist: %s' % playlist)
 555
 556             playlist_results = []
 557
 558             n_all_entries = len(ie_result['entries'])
 559             playliststart = self.params.get('playliststart', 1) - 1
 560             playlistend = self.params.get('playlistend', None)
 561             # For backwards compatibility, interpret -1 as whole list
 562             if playlistend == -1:
 563                 playlistend = None
 564
 565             entries = ie_result['entries'][playliststart:playlistend]
 566             n_entries = len(entries)
 567
 568             self.to_screen(
 569                 u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
 570                 (ie_result['extractor'], playlist, n_all_entries, n_entries))
 571
 572             for i, entry in enumerate(entries, 1):
 573                 self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
 574                 extra = {
 575                     'playlist': playlist,
 576                     'playlist_index': i + playliststart,
 577                     'extractor': ie_result['extractor'],
 578                     'webpage_url': ie_result['webpage_url'],
 579                     'extractor_key': ie_result['extractor_key'],
 580                 }
 581
 582                 reason = self._match_entry(entry)
 583                 if reason is not None:
 584                     self.to_screen(u'[download] ' + reason)
 585                     continue
 586
 587                 entry_result = self.process_ie_result(entry,
 588                                                       download=download,
 589                                                       extra_info=extra)
 590                 playlist_results.append(entry_result)
 591             ie_result['entries'] = playlist_results
 592             return ie_result
 593         elif result_type == 'compat_list':
 594             def _fixup(r):
 595                 self.add_extra_info(r,
 596                     {
 597                         'extractor': ie_result['extractor'],
 598                         'webpage_url': ie_result['webpage_url'],
 599                         'extractor_key': ie_result['extractor_key'],
 600                     })
 601                 return r
 602             ie_result['entries'] = [
 603                 self.process_ie_result(_fixup(r), download, extra_info)
 604                 for r in ie_result['entries']
 605             ]
 606             return ie_result
 607         else:
 608             raise Exception('Invalid result type: %s' % result_type)
 609
 610     def select_format(self, format_spec, available_formats):
 611         if format_spec == 'best' or format_spec is None:
 612             return available_formats[-1]
 613         elif format_spec == 'worst':
 614             return available_formats[0]
 615         else:
 616             extensions = [u'mp4', u'flv', u'webm', u'3gp']
 617             if format_spec in extensions:
 618                 filter_f = lambda f: f['ext'] == format_spec
 619             else:
 620                 filter_f = lambda f: f['format_id'] == format_spec
 621             matches = list(filter(filter_f, available_formats))
 622             if matches:
 623                 return matches[-1]
 624         return None
 625
 626     def process_video_result(self, info_dict, download=True):
 627         assert info_dict.get('_type', 'video') == 'video'
 628
 629         if 'playlist' not in info_dict:
 630             # It isn't part of a playlist
 631             info_dict['playlist'] = None
 632             info_dict['playlist_index'] = None
 633
 634         # This extractors handle format selection themselves
 635         if info_dict['extractor'] in [u'youtube', u'Youku']:
 636             if download:
 637                 self.process_info(info_dict)
 638             return info_dict
 639
 640         # We now pick which formats have to be downloaded
 641         if info_dict.get('formats') is None:
 642             # There's only one format available
 643             formats = [info_dict]
 644         else:
 645             formats = info_dict['formats']
 646
 647         # We check that all the formats have the format and format_id fields
 648         for (i, format) in enumerate(formats):
 649             if format.get('format_id') is None:
 650                 format['format_id'] = compat_str(i)
 651             if format.get('format') is None:
 652                 format['format'] = u'{id} - {res}{note}'.format(
 653                     id=format['format_id'],
 654                     res=self.format_resolution(format),
 655                     note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
 656                 )
 657             # Automatically determine file extension if missing
 658             if 'ext' not in format:
 659                 format['ext'] = determine_ext(format['url'])
 660
 661         if self.params.get('listformats', None):
 662             self.list_formats(info_dict)
 663             return
 664
 665         format_limit = self.params.get('format_limit', None)
 666         if format_limit:
 667             formats = list(takewhile_inclusive(
 668                 lambda f: f['format_id'] != format_limit, formats
 669             ))
 670         if self.params.get('prefer_free_formats'):
 671             def _free_formats_key(f):
 672                 try:
 673                     ext_ord = [u'flv', u'mp4', u'webm'].index(f['ext'])
 674                 except ValueError:
 675                     ext_ord = -1
 676                 # We only compare the extension if they have the same height and width
 677                 return (f.get('height'), f.get('width'), ext_ord)
 678             formats = sorted(formats, key=_free_formats_key)
 679
 680         req_format = self.params.get('format', 'best')
 681         if req_format is None:
 682             req_format = 'best'
 683         formats_to_download = []
 684         # The -1 is for supporting YoutubeIE
 685         if req_format in ('-1', 'all'):
 686             formats_to_download = formats
 687         else:
 688             # We can accept formats requestd in the format: 34/5/best, we pick
 689             # the first that is available, starting from left
 690             req_formats = req_format.split('/')
 691             for rf in req_formats:
 692                 selected_format = self.select_format(rf, formats)
 693                 if selected_format is not None:
 694                     formats_to_download = [selected_format]
 695                     break
 696         if not formats_to_download:
 697             raise ExtractorError(u'requested format not available',
 698                                  expected=True)
 699
 700         if download:
 701             if len(formats_to_download) > 1:
 702                 self.to_screen(u'[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
 703             for format in formats_to_download:
 704                 new_info = dict(info_dict)
 705                 new_info.update(format)
 706                 self.process_info(new_info)
 707         # We update the info dict with the best quality format (backwards compatibility)
 708         info_dict.update(formats_to_download[-1])
 709         return info_dict
 710
 711     def process_info(self, info_dict):
 712         """Process a single resolved IE result."""
 713
 714         assert info_dict.get('_type', 'video') == 'video'
 715         #We increment the download the download count here to match the previous behaviour.
 716         self.increment_downloads()
 717
 718         info_dict['fulltitle'] = info_dict['title']
 719         if len(info_dict['title']) > 200:
 720             info_dict['title'] = info_dict['title'][:197] + u'...'
 721
 722         # Keep for backwards compatibility
 723         info_dict['stitle'] = info_dict['title']
 724
 725         if not 'format' in info_dict:
 726             info_dict['format'] = info_dict['ext']
 727
 728         reason = self._match_entry(info_dict)
 729         if reason is not None:
 730             self.to_screen(u'[download] ' + reason)
 731             return
 732
 733         max_downloads = self.params.get('max_downloads')
 734         if max_downloads is not None:
 735             if self._num_downloads > int(max_downloads):
 736                 raise MaxDownloadsReached()
 737
 738         filename = self.prepare_filename(info_dict)
 739
 740         # Forced printings
 741         if self.params.get('forcetitle', False):
 742             self.to_stdout(info_dict['fulltitle'])
 743         if self.params.get('forceid', False):
 744             self.to_stdout(info_dict['id'])
 745         if self.params.get('forceurl', False):
 746             # For RTMP URLs, also include the playpath
 747             self.to_stdout(info_dict['url'] + info_dict.get('play_path', u''))
 748         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
 749             self.to_stdout(info_dict['thumbnail'])
 750         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
 751             self.to_stdout(info_dict['description'])
 752         if self.params.get('forcefilename', False) and filename is not None:
 753             self.to_stdout(filename)
 754         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
 755             self.to_stdout(formatSeconds(info_dict['duration']))
 756         if self.params.get('forceformat', False):
 757             self.to_stdout(info_dict['format'])
 758         if self.params.get('forcejson', False):
 759             info_dict['_filename'] = filename
 760             self.to_stdout(json.dumps(info_dict))
 761
 762         # Do nothing else if in simulate mode
 763         if self.params.get('simulate', False):
 764             return
 765
 766         if filename is None:
 767             return
 768
 769         try:
 770             dn = os.path.dirname(encodeFilename(filename))
 771             if dn != '' and not os.path.exists(dn):
 772                 os.makedirs(dn)
 773         except (OSError, IOError) as err:
 774             self.report_error(u'unable to create directory ' + compat_str(err))
 775             return
 776
 777         if self.params.get('writedescription', False):
 778             descfn = filename + u'.description'
 779             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
 780                 self.to_screen(u'[info] Video description is already present')
 781             else:
 782                 try:
 783                     self.to_screen(u'[info] Writing video description to: ' + descfn)
 784                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
 785                         descfile.write(info_dict['description'])
 786                 except (KeyError, TypeError):
 787                     self.report_warning(u'There\'s no description to write.')
 788                 except (OSError, IOError):
 789                     self.report_error(u'Cannot write description file ' + descfn)
 790                     return
 791
 792         if self.params.get('writeannotations', False):
 793             annofn = filename + u'.annotations.xml'
 794             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
 795                 self.to_screen(u'[info] Video annotations are already present')
 796             else:
 797                 try:
 798                     self.to_screen(u'[info] Writing video annotations to: ' + annofn)
 799                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
 800                         annofile.write(info_dict['annotations'])
 801                 except (KeyError, TypeError):
 802                     self.report_warning(u'There are no annotations to write.')
 803                 except (OSError, IOError):
 804                     self.report_error(u'Cannot write annotations file: ' + annofn)
 805                     return
 806
 807         subtitles_are_requested = any([self.params.get('writesubtitles', False),
 808                                        self.params.get('writeautomaticsub')])
 809
 810         if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
 811             # subtitles download errors are already managed as troubles in relevant IE
 812             # that way it will silently go on when used with unsupporting IE
 813             subtitles = info_dict['subtitles']
 814             sub_format = self.params.get('subtitlesformat', 'srt')
 815             for sub_lang in subtitles.keys():
 816                 sub = subtitles[sub_lang]
 817                 if sub is None:
 818                     continue
 819                 try:
 820                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
 821                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
 822                         self.to_screen(u'[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
 823                     else:
 824                         self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
 825                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
 826                                 subfile.write(sub)
 827                 except (OSError, IOError):
 828                     self.report_error(u'Cannot write subtitles file ' + descfn)
 829                     return
 830
 831         if self.params.get('writeinfojson', False):
 832             infofn = os.path.splitext(filename)[0] + u'.info.json'
 833             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
 834                 self.to_screen(u'[info] Video description metadata is already present')
 835             else:
 836                 self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn)
 837                 try:
 838                     json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
 839                     write_json_file(json_info_dict, encodeFilename(infofn))
 840                 except (OSError, IOError):
 841                     self.report_error(u'Cannot write metadata to JSON file ' + infofn)
 842                     return
 843
 844         if self.params.get('writethumbnail', False):
 845             if info_dict.get('thumbnail') is not None:
 846                 thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
 847                 thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format
 848                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
 849                     self.to_screen(u'[%s] %s: Thumbnail is already present' %
 850                                    (info_dict['extractor'], info_dict['id']))
 851                 else:
 852                     self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
 853                                    (info_dict['extractor'], info_dict['id']))
 854                     try:
 855                         uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
 856                         with open(thumb_filename, 'wb') as thumbf:
 857                             shutil.copyfileobj(uf, thumbf)
 858                         self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
 859                             (info_dict['extractor'], info_dict['id'], thumb_filename))
 860                     except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 861                         self.report_warning(u'Unable to download thumbnail "%s": %s' %
 862                             (info_dict['thumbnail'], compat_str(err)))
 863
 864         if not self.params.get('skip_download', False):
 865             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
 866                 success = True
 867             else:
 868                 try:
 869                     success = self.fd._do_download(filename, info_dict)
 870                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 871                     self.report_error(u'unable to download video data: %s' % str(err))
 872                     return
 873                 except (OSError, IOError) as err:
 874                     raise UnavailableVideoError(err)
 875                 except (ContentTooShortError, ) as err:
 876                     self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 877                     return
 878
 879             if success:
 880                 try:
 881                     self.post_process(filename, info_dict)
 882                 except (PostProcessingError) as err:
 883                     self.report_error(u'postprocessing: %s' % str(err))
 884                     return
 885
 886         self.record_download_archive(info_dict)
 887
 888     def download(self, url_list):
 889         """Download a given list of URLs."""
 890         if (len(url_list) > 1 and
 891                 '%' not in self.params['outtmpl']
 892                 and self.params.get('max_downloads') != 1):
 893             raise SameFileError(self.params['outtmpl'])
 894
 895         for url in url_list:
 896             try:
 897                 #It also downloads the videos
 898                 self.extract_info(url)
 899             except UnavailableVideoError:
 900                 self.report_error(u'unable to download video')
 901             except MaxDownloadsReached:
 902                 self.to_screen(u'[info] Maximum number of downloaded files reached.')
 903                 raise
 904
 905         return self._download_retcode
 906
 907     def download_with_info_file(self, info_filename):
 908         with io.open(info_filename, 'r', encoding='utf-8') as f:
 909             info = json.load(f)
 910         try:
 911             self.process_ie_result(info, download=True)
 912         except DownloadError:
 913             webpage_url = info.get('webpage_url')
 914             if webpage_url is not None:
 915                 self.report_warning(u'The info failed to download, trying with "%s"' % webpage_url)
 916                 return self.download([webpage_url])
 917             else:
 918                 raise
 919         return self._download_retcode
 920
 921     def post_process(self, filename, ie_info):
 922         """Run all the postprocessors on the given file."""
 923         info = dict(ie_info)
 924         info['filepath'] = filename
 925         keep_video = None
 926         for pp in self._pps:
 927             try:
 928                 keep_video_wish, new_info = pp.run(info)
 929                 if keep_video_wish is not None:
 930                     if keep_video_wish:
 931                         keep_video = keep_video_wish
 932                     elif keep_video is None:
 933                         # No clear decision yet, let IE decide
 934                         keep_video = keep_video_wish
 935             except PostProcessingError as e:
 936                 self.report_error(e.msg)
 937         if keep_video is False and not self.params.get('keepvideo', False):
 938             try:
 939                 self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
 940                 os.remove(encodeFilename(filename))
 941             except (IOError, OSError):
 942                 self.report_warning(u'Unable to remove downloaded video file')
 943
 944     def _make_archive_id(self, info_dict):
 945         # Future-proof against any change in case
 946         # and backwards compatibility with prior versions
 947         extractor = info_dict.get('extractor_key')
 948         if extractor is None:
 949             if 'id' in info_dict:
 950                 extractor = info_dict.get('ie_key')  # key in a playlist
 951         if extractor is None:
 952             return None  # Incomplete video information
 953         return extractor.lower() + u' ' + info_dict['id']
 954
 955     def in_download_archive(self, info_dict):
 956         fn = self.params.get('download_archive')
 957         if fn is None:
 958             return False
 959
 960         vid_id = self._make_archive_id(info_dict)
 961         if vid_id is None:
 962             return False  # Incomplete video information
 963
 964         try:
 965             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
 966                 for line in archive_file:
 967                     if line.strip() == vid_id:
 968                         return True
 969         except IOError as ioe:
 970             if ioe.errno != errno.ENOENT:
 971                 raise
 972         return False
 973
 974     def record_download_archive(self, info_dict):
 975         fn = self.params.get('download_archive')
 976         if fn is None:
 977             return
 978         vid_id = self._make_archive_id(info_dict)
 979         assert vid_id
 980         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
 981             archive_file.write(vid_id + u'\n')
 982
 983     @staticmethod
 984     def format_resolution(format, default='unknown'):
 985         if format.get('vcodec') == 'none':
 986             return 'audio only'
 987         if format.get('_resolution') is not None:
 988             return format['_resolution']
 989         if format.get('height') is not None:
 990             if format.get('width') is not None:
 991                 res = u'%sx%s' % (format['width'], format['height'])
 992             else:
 993                 res = u'%sp' % format['height']
 994         else:
 995             res = default
 996         return res
 997
 998     def list_formats(self, info_dict):
 999         def format_note(fdict):
1000             res = u''
1001             if fdict.get('format_note') is not None:
1002                 res += fdict['format_note'] + u' '
1003             if (fdict.get('vcodec') is not None and
1004                     fdict.get('vcodec') != 'none'):
1005                 res += u'%-5s' % fdict['vcodec']
1006             elif fdict.get('vbr') is not None:
1007                 res += u'video'
1008             if fdict.get('vbr') is not None:
1009                 res += u'@%4dk' % fdict['vbr']
1010             if fdict.get('acodec') is not None:
1011                 if res:
1012                     res += u', '
1013                 res += u'%-5s' % fdict['acodec']
1014             elif fdict.get('abr') is not None:
1015                 if res:
1016                     res += u', '
1017                 res += 'audio'
1018             if fdict.get('abr') is not None:
1019                 res += u'@%3dk' % fdict['abr']
1020             if fdict.get('filesize') is not None:
1021                 if res:
1022                     res += u', '
1023                 res += format_bytes(fdict['filesize'])
1024             return res
1025
1026         def line(format, idlen=20):
1027             return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
1028                 format['format_id'],
1029                 format['ext'],
1030                 self.format_resolution(format),
1031                 format_note(format),
1032             ))
1033
1034         formats = info_dict.get('formats', [info_dict])
1035         idlen = max(len(u'format code'),
1036                     max(len(f['format_id']) for f in formats))
1037         formats_s = [line(f, idlen) for f in formats]
1038         if len(formats) > 1:
1039             formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
1040             formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
1041
1042         header_line = line({
1043             'format_id': u'format code', 'ext': u'extension',
1044             '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
1045         self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
1046                        (info_dict['id'], header_line, u"\n".join(formats_s)))
1047
1048     def urlopen(self, req):
1049         """ Start an HTTP download """
1050         return self._opener.open(req)
1051
1052     def print_debug_header(self):
1053         if not self.params.get('verbose'):
1054             return
1055         write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
1056         try:
1057             sp = subprocess.Popen(
1058                 ['git', 'rev-parse', '--short', 'HEAD'],
1059                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1060                 cwd=os.path.dirname(os.path.abspath(__file__)))
1061             out, err = sp.communicate()
1062             out = out.decode().strip()
1063             if re.match('[0-9a-f]+', out):
1064                 write_string(u'[debug] Git HEAD: ' + out + u'\n')
1065         except:
1066             try:
1067                 sys.exc_clear()
1068             except:
1069                 pass
1070         write_string(u'[debug] Python version %s - %s' %
1071                      (platform.python_version(), platform_name()) + u'\n')
1072
1073         proxy_map = {}
1074         for handler in self._opener.handlers:
1075             if hasattr(handler, 'proxies'):
1076                 proxy_map.update(handler.proxies)
1077         write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
1078
1079     def _setup_opener(self):
1080         timeout_val = self.params.get('socket_timeout')
1081         timeout = 600 if timeout_val is None else float(timeout_val)
1082
1083         opts_cookiefile = self.params.get('cookiefile')
1084         opts_proxy = self.params.get('proxy')
1085
1086         if opts_cookiefile is None:
1087             self.cookiejar = compat_cookiejar.CookieJar()
1088         else:
1089             self.cookiejar = compat_cookiejar.MozillaCookieJar(
1090                 opts_cookiefile)
1091             if os.access(opts_cookiefile, os.R_OK):
1092                 self.cookiejar.load()
1093
1094         cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1095             self.cookiejar)
1096         if opts_proxy is not None:
1097             if opts_proxy == '':
1098                 proxies = {}
1099             else:
1100                 proxies = {'http': opts_proxy, 'https': opts_proxy}
1101         else:
1102             proxies = compat_urllib_request.getproxies()
1103             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1104             if 'http' in proxies and 'https' not in proxies:
1105                 proxies['https'] = proxies['http']
1106         proxy_handler = compat_urllib_request.ProxyHandler(proxies)
1107         https_handler = make_HTTPS_handler(
1108             self.params.get('nocheckcertificate', False))
1109         opener = compat_urllib_request.build_opener(
1110             https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
1111         # Delete the default user-agent header, which would otherwise apply in
1112         # cases where our custom HTTP handler doesn't come into play
1113         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1114         opener.addheaders = []
1115         self._opener = opener
1116
1117         # TODO remove this global modification
1118         compat_urllib_request.install_opener(opener)
1119         socket.setdefaulttimeout(timeout)