youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import absolute_import, unicode_literals
   5
   6 import collections
   7 import contextlib
   8 import copy
   9 import datetime
  10 import errno
  11 import fileinput
  12 import io
  13 import itertools
  14 import json
  15 import locale
  16 import operator
  17 import os
  18 import platform
  19 import re
  20 import shutil
  21 import subprocess
  22 import socket
  23 import sys
  24 import time
  25 import tokenize
  26 import traceback
  27 import random
  28
  29 from .compat import (
  30     compat_basestring,
  31     compat_cookiejar,
  32     compat_get_terminal_size,
  33     compat_http_client,
  34     compat_kwargs,
  35     compat_numeric_types,
  36     compat_os_name,
  37     compat_str,
  38     compat_tokenize_tokenize,
  39     compat_urllib_error,
  40     compat_urllib_request,
  41     compat_urllib_request_DataHandler,
  42 )
  43 from .utils import (
  44     age_restricted,
  45     args_to_str,
  46     ContentTooShortError,
  47     date_from_str,
  48     DateRange,
  49     DEFAULT_OUTTMPL,
  50     determine_ext,
  51     determine_protocol,
  52     DownloadError,
  53     encode_compat_str,
  54     encodeFilename,
  55     error_to_compat_str,
  56     expand_path,
  57     ExtractorError,
  58     format_bytes,
  59     formatSeconds,
  60     GeoRestrictedError,
  61     int_or_none,
  62     ISO3166Utils,
  63     locked_file,
  64     make_HTTPS_handler,
  65     MaxDownloadsReached,
  66     PagedList,
  67     parse_filesize,
  68     PerRequestProxyHandler,
  69     platform_name,
  70     PostProcessingError,
  71     preferredencoding,
  72     prepend_extension,
  73     register_socks_protocols,
  74     render_table,
  75     replace_extension,
  76     SameFileError,
  77     sanitize_filename,
  78     sanitize_path,
  79     sanitize_url,
  80     sanitized_Request,
  81     std_headers,
  82     subtitles_filename,
  83     UnavailableVideoError,
  84     url_basename,
  85     version_tuple,
  86     write_json_file,
  87     write_string,
  88     YoutubeDLCookieProcessor,
  89     YoutubeDLHandler,
  90 )
  91 from .cache import Cache
  92 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
  93 from .downloader import get_suitable_downloader
  94 from .downloader.rtmp import rtmpdump_version
  95 from .postprocessor import (
  96     FFmpegFixupM3u8PP,
  97     FFmpegFixupM4aPP,
  98     FFmpegFixupStretchedPP,
  99     FFmpegMergerPP,
 100     FFmpegPostProcessor,
 101     get_postprocessor,
 102 )
 103 from .version import __version__
 104
 105 if compat_os_name == 'nt':
 106     import ctypes
 107
 108
 109 class YoutubeDL(object):
 110     """YoutubeDL class.
 111
 112     YoutubeDL objects are the ones responsible of downloading the
 113     actual video file and writing it to disk if the user has requested
 114     it, among some other tasks. In most cases there should be one per
 115     program. As, given a video URL, the downloader doesn't know how to
 116     extract all the needed information, task that InfoExtractors do, it
 117     has to pass the URL to one of them.
 118
 119     For this, YoutubeDL objects have a method that allows
 120     InfoExtractors to be registered in a given order. When it is passed
 121     a URL, the YoutubeDL object handles it to the first InfoExtractor it
 122     finds that reports being able to handle it. The InfoExtractor extracts
 123     all the information about the video or videos the URL refers to, and
 124     YoutubeDL process the extracted information, possibly using a File
 125     Downloader to download the video.
 126
 127     YoutubeDL objects accept a lot of parameters. In order not to saturate
 128     the object constructor with arguments, it receives a dictionary of
 129     options instead. These options are available through the params
 130     attribute for the InfoExtractors to use. The YoutubeDL also
 131     registers itself as the downloader in charge for the InfoExtractors
 132     that are added to it, so this is a "mutual registration".
 133
 134     Available options:
 135
 136     username:          Username for authentication purposes.
 137     password:          Password for authentication purposes.
 138     videopassword:     Password for accessing a video.
 139     ap_mso:            Adobe Pass multiple-system operator identifier.
 140     ap_username:       Multiple-system operator account username.
 141     ap_password:       Multiple-system operator account password.
 142     usenetrc:          Use netrc for authentication instead.
 143     verbose:           Print additional info to stdout.
 144     quiet:             Do not print messages to stdout.
 145     no_warnings:       Do not print out anything for warnings.
 146     forceurl:          Force printing final URL.
 147     forcetitle:        Force printing title.
 148     forceid:           Force printing ID.
 149     forcethumbnail:    Force printing thumbnail URL.
 150     forcedescription:  Force printing description.
 151     forcefilename:     Force printing final filename.
 152     forceduration:     Force printing duration.
 153     forcejson:         Force printing info_dict as JSON.
 154     dump_single_json:  Force printing the info_dict of the whole playlist
 155                        (or video) as a single JSON line.
 156     simulate:          Do not download the video files.
 157     format:            Video format code. See options.py for more information.
 158     outtmpl:           Template for output names.
 159     restrictfilenames: Do not allow "&" and spaces in file names
 160     ignoreerrors:      Do not stop on download errors.
 161     force_generic_extractor: Force downloader to use the generic extractor
 162     nooverwrites:      Prevent overwriting files.
 163     playliststart:     Playlist item to start at.
 164     playlistend:       Playlist item to end at.
 165     playlist_items:    Specific indices of playlist to download.
 166     playlistreverse:   Download playlist items in reverse order.
 167     playlistrandom:    Download playlist items in random order.
 168     matchtitle:        Download only matching titles.
 169     rejecttitle:       Reject downloads for matching titles.
 170     logger:            Log messages to a logging.Logger instance.
 171     logtostderr:       Log messages to stderr instead of stdout.
 172     writedescription:  Write the video description to a .description file
 173     writeinfojson:     Write the video description to a .info.json file
 174     writeannotations:  Write the video annotations to a .annotations.xml file
 175     writethumbnail:    Write the thumbnail image to a file
 176     write_all_thumbnails:  Write all thumbnail formats to files
 177     writesubtitles:    Write the video subtitles to a file
 178     writeautomaticsub: Write the automatically generated subtitles to a file
 179     allsubtitles:      Downloads all the subtitles of the video
 180                        (requires writesubtitles or writeautomaticsub)
 181     listsubtitles:     Lists all available subtitles for the video
 182     subtitlesformat:   The format code for subtitles
 183     subtitleslangs:    List of languages of the subtitles to download
 184     keepvideo:         Keep the video file after post-processing
 185     daterange:         A DateRange object, download only if the upload_date is in the range.
 186     skip_download:     Skip the actual download of the video file
 187     cachedir:          Location of the cache files in the filesystem.
 188                        False to disable filesystem cache.
 189     noplaylist:        Download single video instead of a playlist if in doubt.
 190     age_limit:         An integer representing the user's age in years.
 191                        Unsuitable videos for the given age are skipped.
 192     min_views:         An integer representing the minimum view count the video
 193                        must have in order to not be skipped.
 194                        Videos without view count information are always
 195                        downloaded. None for no limit.
 196     max_views:         An integer representing the maximum view count.
 197                        Videos that are more popular than that are not
 198                        downloaded.
 199                        Videos without view count information are always
 200                        downloaded. None for no limit.
 201     download_archive:  File name of a file where all downloads are recorded.
 202                        Videos already present in the file are not downloaded
 203                        again.
 204     cookiefile:        File name where cookies should be read from and dumped to.
 205     nocheckcertificate:Do not verify SSL certificates
 206     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
 207                        At the moment, this is only supported by YouTube.
 208     proxy:             URL of the proxy server to use
 209     geo_verification_proxy:  URL of the proxy to use for IP address verification
 210                        on geo-restricted sites. (Experimental)
 211     socket_timeout:    Time to wait for unresponsive hosts, in seconds
 212     bidi_workaround:   Work around buggy terminals without bidirectional text
 213                        support, using fridibi
 214     debug_printtraffic:Print out sent and received HTTP traffic
 215     include_ads:       Download ads as well
 216     default_search:    Prepend this string if an input url is not valid.
 217                        'auto' for elaborate guessing
 218     encoding:          Use this encoding instead of the system-specified.
 219     extract_flat:      Do not resolve URLs, return the immediate result.
 220                        Pass in 'in_playlist' to only show this behavior for
 221                        playlist items.
 222     postprocessors:    A list of dictionaries, each with an entry
 223                        * key:  The name of the postprocessor. See
 224                                youtube_dl/postprocessor/__init__.py for a list.
 225                        as well as any further keyword arguments for the
 226                        postprocessor.
 227     progress_hooks:    A list of functions that get called on download
 228                        progress, with a dictionary with the entries
 229                        * status: One of "downloading", "error", or "finished".
 230                                  Check this first and ignore unknown values.
 231
 232                        If status is one of "downloading", or "finished", the
 233                        following properties may also be present:
 234                        * filename: The final filename (always present)
 235                        * tmpfilename: The filename we're currently writing to
 236                        * downloaded_bytes: Bytes on disk
 237                        * total_bytes: Size of the whole file, None if unknown
 238                        * total_bytes_estimate: Guess of the eventual file size,
 239                                                None if unavailable.
 240                        * elapsed: The number of seconds since download started.
 241                        * eta: The estimated time in seconds, None if unknown
 242                        * speed: The download speed in bytes/second, None if
 243                                 unknown
 244                        * fragment_index: The counter of the currently
 245                                          downloaded video fragment.
 246                        * fragment_count: The number of fragments (= individual
 247                                          files that will be merged)
 248
 249                        Progress hooks are guaranteed to be called at least once
 250                        (with status "finished") if the download is successful.
 251     merge_output_format: Extension to use when merging formats.
 252     fixup:             Automatically correct known faults of the file.
 253                        One of:
 254                        - "never": do nothing
 255                        - "warn": only emit a warning
 256                        - "detect_or_warn": check whether we can do anything
 257                                            about it, warn otherwise (default)
 258     source_address:    (Experimental) Client-side IP address to bind to.
 259     call_home:         Boolean, true iff we are allowed to contact the
 260                        youtube-dl servers for debugging.
 261     sleep_interval:    Number of seconds to sleep before each download when
 262                        used alone or a lower bound of a range for randomized
 263                        sleep before each download (minimum possible number
 264                        of seconds to sleep) when used along with
 265                        max_sleep_interval.
 266     max_sleep_interval:Upper bound of a range for randomized sleep before each
 267                        download (maximum possible number of seconds to sleep).
 268                        Must only be used along with sleep_interval.
 269                        Actual sleep time will be a random float from range
 270                        [sleep_interval; max_sleep_interval].
 271     listformats:       Print an overview of available video formats and exit.
 272     list_thumbnails:   Print a table of all thumbnails and exit.
 273     match_filter:      A function that gets called with the info_dict of
 274                        every video.
 275                        If it returns a message, the video is ignored.
 276                        If it returns None, the video is downloaded.
 277                        match_filter_func in utils.py is one example for this.
 278     no_color:          Do not emit color codes in output.
 279     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
 280                        HTTP header (experimental)
 281     geo_bypass_country:
 282                        Two-letter ISO 3166-2 country code that will be used for
 283                        explicit geographic restriction bypassing via faking
 284                        X-Forwarded-For HTTP header (experimental)
 285
 286     The following options determine which downloader is picked:
 287     external_downloader: Executable of the external downloader to call.
 288                        None or unset for standard (built-in) downloader.
 289     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
 290                        if True, otherwise use ffmpeg/avconv if False, otherwise
 291                        use downloader suggested by extractor if None.
 292
 293     The following parameters are not used by YoutubeDL itself, they are used by
 294     the downloader (see youtube_dl/downloader/common.py):
 295     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
 296     noresizebuffer, retries, continuedl, noprogress, consoletitle,
 297     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
 298
 299     The following options are used by the post processors:
 300     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
 301                        otherwise prefer avconv.
 302     postprocessor_args: A list of additional command-line arguments for the
 303                         postprocessor.
 304     """
 305
 306     _NUMERIC_FIELDS = set((
 307         'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
 308         'timestamp', 'upload_year', 'upload_month', 'upload_day',
 309         'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
 310         'average_rating', 'comment_count', 'age_limit',
 311         'start_time', 'end_time',
 312         'chapter_number', 'season_number', 'episode_number',
 313         'track_number', 'disc_number', 'release_year',
 314         'playlist_index',
 315     ))
 316
 317     params = None
 318     _ies = []
 319     _pps = []
 320     _download_retcode = None
 321     _num_downloads = None
 322     _screen_file = None
 323
 324     def __init__(self, params=None, auto_init=True):
 325         """Create a FileDownloader object with the given options."""
 326         if params is None:
 327             params = {}
 328         self._ies = []
 329         self._ies_instances = {}
 330         self._pps = []
 331         self._progress_hooks = []
 332         self._download_retcode = 0
 333         self._num_downloads = 0
 334         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 335         self._err_file = sys.stderr
 336         self.params = {
 337             # Default parameters
 338             'nocheckcertificate': False,
 339         }
 340         self.params.update(params)
 341         self.cache = Cache(self)
 342
 343         def check_deprecated(param, option, suggestion):
 344             if self.params.get(param) is not None:
 345                 self.report_warning(
 346                     '%s is deprecated. Use %s instead.' % (option, suggestion))
 347                 return True
 348             return False
 349
 350         if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
 351             if self.params.get('geo_verification_proxy') is None:
 352                 self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
 353
 354         check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
 355         check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
 356         check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
 357
 358         if params.get('bidi_workaround', False):
 359             try:
 360                 import pty
 361                 master, slave = pty.openpty()
 362                 width = compat_get_terminal_size().columns
 363                 if width is None:
 364                     width_args = []
 365                 else:
 366                     width_args = ['-w', str(width)]
 367                 sp_kwargs = dict(
 368                     stdin=subprocess.PIPE,
 369                     stdout=slave,
 370                     stderr=self._err_file)
 371                 try:
 372                     self._output_process = subprocess.Popen(
 373                         ['bidiv'] + width_args, **sp_kwargs
 374                     )
 375                 except OSError:
 376                     self._output_process = subprocess.Popen(
 377                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
 378                 self._output_channel = os.fdopen(master, 'rb')
 379             except OSError as ose:
 380                 if ose.errno == errno.ENOENT:
 381                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
 382                 else:
 383                     raise
 384
 385         if (sys.platform != 'win32' and
 386                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
 387                 not params.get('restrictfilenames', False)):
 388             # Unicode filesystem API will throw errors (#1474, #13027)
 389             self.report_warning(
 390                 'Assuming --restrict-filenames since file system encoding '
 391                 'cannot encode all characters. '
 392                 'Set the LC_ALL environment variable to fix this.')
 393             self.params['restrictfilenames'] = True
 394
 395         if isinstance(params.get('outtmpl'), bytes):
 396             self.report_warning(
 397                 'Parameter outtmpl is bytes, but should be a unicode string. '
 398                 'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
 399
 400         self._setup_opener()
 401
 402         if auto_init:
 403             self.print_debug_header()
 404             self.add_default_info_extractors()
 405
 406         for pp_def_raw in self.params.get('postprocessors', []):
 407             pp_class = get_postprocessor(pp_def_raw['key'])
 408             pp_def = dict(pp_def_raw)
 409             del pp_def['key']
 410             pp = pp_class(self, **compat_kwargs(pp_def))
 411             self.add_post_processor(pp)
 412
 413         for ph in self.params.get('progress_hooks', []):
 414             self.add_progress_hook(ph)
 415
 416         register_socks_protocols()
 417
 418     def warn_if_short_id(self, argv):
 419         # short YouTube ID starting with dash?
 420         idxs = [
 421             i for i, a in enumerate(argv)
 422             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
 423         if idxs:
 424             correct_argv = (
 425                 ['youtube-dl'] +
 426                 [a for i, a in enumerate(argv) if i not in idxs] +
 427                 ['--'] + [argv[i] for i in idxs]
 428             )
 429             self.report_warning(
 430                 'Long argument string detected. '
 431                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
 432                 args_to_str(correct_argv))
 433
 434     def add_info_extractor(self, ie):
 435         """Add an InfoExtractor object to the end of the list."""
 436         self._ies.append(ie)
 437         if not isinstance(ie, type):
 438             self._ies_instances[ie.ie_key()] = ie
 439             ie.set_downloader(self)
 440
 441     def get_info_extractor(self, ie_key):
 442         """
 443         Get an instance of an IE with name ie_key, it will try to get one from
 444         the _ies list, if there's no instance it will create a new one and add
 445         it to the extractor list.
 446         """
 447         ie = self._ies_instances.get(ie_key)
 448         if ie is None:
 449             ie = get_info_extractor(ie_key)()
 450             self.add_info_extractor(ie)
 451         return ie
 452
 453     def add_default_info_extractors(self):
 454         """
 455         Add the InfoExtractors returned by gen_extractors to the end of the list
 456         """
 457         for ie in gen_extractor_classes():
 458             self.add_info_extractor(ie)
 459
 460     def add_post_processor(self, pp):
 461         """Add a PostProcessor object to the end of the chain."""
 462         self._pps.append(pp)
 463         pp.set_downloader(self)
 464
 465     def add_progress_hook(self, ph):
 466         """Add the progress hook (currently only for the file downloader)"""
 467         self._progress_hooks.append(ph)
 468
 469     def _bidi_workaround(self, message):
 470         if not hasattr(self, '_output_channel'):
 471             return message
 472
 473         assert hasattr(self, '_output_process')
 474         assert isinstance(message, compat_str)
 475         line_count = message.count('\n') + 1
 476         self._output_process.stdin.write((message + '\n').encode('utf-8'))
 477         self._output_process.stdin.flush()
 478         res = ''.join(self._output_channel.readline().decode('utf-8')
 479                       for _ in range(line_count))
 480         return res[:-len('\n')]
 481
 482     def to_screen(self, message, skip_eol=False):
 483         """Print message to stdout if not in quiet mode."""
 484         return self.to_stdout(message, skip_eol, check_quiet=True)
 485
 486     def _write_string(self, s, out=None):
 487         write_string(s, out=out, encoding=self.params.get('encoding'))
 488
 489     def to_stdout(self, message, skip_eol=False, check_quiet=False):
 490         """Print message to stdout if not in quiet mode."""
 491         if self.params.get('logger'):
 492             self.params['logger'].debug(message)
 493         elif not check_quiet or not self.params.get('quiet', False):
 494             message = self._bidi_workaround(message)
 495             terminator = ['\n', ''][skip_eol]
 496             output = message + terminator
 497
 498             self._write_string(output, self._screen_file)
 499
 500     def to_stderr(self, message):
 501         """Print message to stderr."""
 502         assert isinstance(message, compat_str)
 503         if self.params.get('logger'):
 504             self.params['logger'].error(message)
 505         else:
 506             message = self._bidi_workaround(message)
 507             output = message + '\n'
 508             self._write_string(output, self._err_file)
 509
 510     def to_console_title(self, message):
 511         if not self.params.get('consoletitle', False):
 512             return
 513         if compat_os_name == 'nt':
 514             if ctypes.windll.kernel32.GetConsoleWindow():
 515                 # c_wchar_p() might not be necessary if `message` is
 516                 # already of type unicode()
 517                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 518         elif 'TERM' in os.environ:
 519             self._write_string('\033]0;%s\007' % message, self._screen_file)
 520
 521     def save_console_title(self):
 522         if not self.params.get('consoletitle', False):
 523             return
 524         if compat_os_name != 'nt' and 'TERM' in os.environ:
 525             # Save the title on stack
 526             self._write_string('\033[22;0t', self._screen_file)
 527
 528     def restore_console_title(self):
 529         if not self.params.get('consoletitle', False):
 530             return
 531         if compat_os_name != 'nt' and 'TERM' in os.environ:
 532             # Restore the title from stack
 533             self._write_string('\033[23;0t', self._screen_file)
 534
 535     def __enter__(self):
 536         self.save_console_title()
 537         return self
 538
 539     def __exit__(self, *args):
 540         self.restore_console_title()
 541
 542         if self.params.get('cookiefile') is not None:
 543             self.cookiejar.save()
 544
 545     def trouble(self, message=None, tb=None):
 546         """Determine action to take when a download problem appears.
 547
 548         Depending on if the downloader has been configured to ignore
 549         download errors or not, this method may throw an exception or
 550         not when errors are found, after printing the message.
 551
 552         tb, if given, is additional traceback information.
 553         """
 554         if message is not None:
 555             self.to_stderr(message)
 556         if self.params.get('verbose'):
 557             if tb is None:
 558                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 559                     tb = ''
 560                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 561                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 562                     tb += encode_compat_str(traceback.format_exc())
 563                 else:
 564                     tb_data = traceback.format_list(traceback.extract_stack())
 565                     tb = ''.join(tb_data)
 566             self.to_stderr(tb)
 567         if not self.params.get('ignoreerrors', False):
 568             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 569                 exc_info = sys.exc_info()[1].exc_info
 570             else:
 571                 exc_info = sys.exc_info()
 572             raise DownloadError(message, exc_info)
 573         self._download_retcode = 1
 574
 575     def report_warning(self, message):
 576         '''
 577         Print the message to stderr, it will be prefixed with 'WARNING:'
 578         If stderr is a tty file the 'WARNING:' will be colored
 579         '''
 580         if self.params.get('logger') is not None:
 581             self.params['logger'].warning(message)
 582         else:
 583             if self.params.get('no_warnings'):
 584                 return
 585             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
 586                 _msg_header = '\033[0;33mWARNING:\033[0m'
 587             else:
 588                 _msg_header = 'WARNING:'
 589             warning_message = '%s %s' % (_msg_header, message)
 590             self.to_stderr(warning_message)
 591
 592     def report_error(self, message, tb=None):
 593         '''
 594         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 595         in red if stderr is a tty file.
 596         '''
 597         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
 598             _msg_header = '\033[0;31mERROR:\033[0m'
 599         else:
 600             _msg_header = 'ERROR:'
 601         error_message = '%s %s' % (_msg_header, message)
 602         self.trouble(error_message, tb)
 603
 604     def report_file_already_downloaded(self, file_name):
 605         """Report file has already been fully downloaded."""
 606         try:
 607             self.to_screen('[download] %s has already been downloaded' % file_name)
 608         except UnicodeEncodeError:
 609             self.to_screen('[download] The file has already been downloaded')
 610
 611     def prepare_filename(self, info_dict):
 612         """Generate the output filename."""
 613         try:
 614             template_dict = dict(info_dict)
 615
 616             template_dict['epoch'] = int(time.time())
 617             autonumber_size = self.params.get('autonumber_size')
 618             if autonumber_size is None:
 619                 autonumber_size = 5
 620             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
 621             if template_dict.get('resolution') is None:
 622                 if template_dict.get('width') and template_dict.get('height'):
 623                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
 624                 elif template_dict.get('height'):
 625                     template_dict['resolution'] = '%sp' % template_dict['height']
 626                 elif template_dict.get('width'):
 627                     template_dict['resolution'] = '%dx?' % template_dict['width']
 628
 629             sanitize = lambda k, v: sanitize_filename(
 630                 compat_str(v),
 631                 restricted=self.params.get('restrictfilenames'),
 632                 is_id=(k == 'id' or k.endswith('_id')))
 633             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
 634                                  for k, v in template_dict.items()
 635                                  if v is not None and not isinstance(v, (list, tuple, dict)))
 636             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 637
 638             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
 639
 640             # For fields playlist_index and autonumber convert all occurrences
 641             # of %(field)s to %(field)0Nd for backward compatibility
 642             field_size_compat_map = {
 643                 'playlist_index': len(str(template_dict['n_entries'])),
 644                 'autonumber': autonumber_size,
 645             }
 646             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
 647             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
 648             if mobj:
 649                 outtmpl = re.sub(
 650                     FIELD_SIZE_COMPAT_RE,
 651                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
 652                     outtmpl)
 653
 654             # Missing numeric fields used together with integer presentation types
 655             # in format specification will break the argument substitution since
 656             # string 'NA' is returned for missing fields. We will patch output
 657             # template for missing fields to meet string presentation type.
 658             for numeric_field in self._NUMERIC_FIELDS:
 659                 if numeric_field not in template_dict:
 660                     # As of [1] format syntax is:
 661                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
 662                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
 663                     FORMAT_RE = r'''(?x)
 664                         (?<!%)
 665                         %
 666                         \({0}\)  # mapping key
 667                         (?:[#0\-+ ]+)?  # conversion flags (optional)
 668                         (?:\d+)?  # minimum field width (optional)
 669                         (?:\.\d+)?  # precision (optional)
 670                         [hlL]?  # length modifier (optional)
 671                         [diouxXeEfFgGcrs%]  # conversion type
 672                     '''
 673                     outtmpl = re.sub(
 674                         FORMAT_RE.format(numeric_field),
 675                         r'%({0})s'.format(numeric_field), outtmpl)
 676
 677             filename = expand_path(outtmpl % template_dict)
 678             # Temporary fix for #4787
 679             # 'Treat' all problem characters by passing filename through preferredencoding
 680             # to workaround encoding issues with subprocess on python2 @ Windows
 681             if sys.version_info < (3, 0) and sys.platform == 'win32':
 682                 filename = encodeFilename(filename, True).decode(preferredencoding())
 683             return sanitize_path(filename)
 684         except ValueError as err:
 685             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
 686             return None
 687
 688     def _match_entry(self, info_dict, incomplete):
 689         """ Returns None iff the file should be downloaded """
 690
 691         video_title = info_dict.get('title', info_dict.get('id', 'video'))
 692         if 'title' in info_dict:
 693             # This can happen when we're just evaluating the playlist
 694             title = info_dict['title']
 695             matchtitle = self.params.get('matchtitle', False)
 696             if matchtitle:
 697                 if not re.search(matchtitle, title, re.IGNORECASE):
 698                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
 699             rejecttitle = self.params.get('rejecttitle', False)
 700             if rejecttitle:
 701                 if re.search(rejecttitle, title, re.IGNORECASE):
 702                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 703         date = info_dict.get('upload_date')
 704         if date is not None:
 705             dateRange = self.params.get('daterange', DateRange())
 706             if date not in dateRange:
 707                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 708         view_count = info_dict.get('view_count')
 709         if view_count is not None:
 710             min_views = self.params.get('min_views')
 711             if min_views is not None and view_count < min_views:
 712                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
 713             max_views = self.params.get('max_views')
 714             if max_views is not None and view_count > max_views:
 715                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
 716         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
 717             return 'Skipping "%s" because it is age restricted' % video_title
 718         if self.in_download_archive(info_dict):
 719             return '%s has already been recorded in archive' % video_title
 720
 721         if not incomplete:
 722             match_filter = self.params.get('match_filter')
 723             if match_filter is not None:
 724                 ret = match_filter(info_dict)
 725                 if ret is not None:
 726                     return ret
 727
 728         return None
 729
 730     @staticmethod
 731     def add_extra_info(info_dict, extra_info):
 732         '''Set the keys from extra_info in info dict if they are missing'''
 733         for key, value in extra_info.items():
 734             info_dict.setdefault(key, value)
 735
 736     def extract_info(self, url, download=True, ie_key=None, extra_info={},
 737                      process=True, force_generic_extractor=False):
 738         '''
 739         Returns a list with a dictionary for each video we find.
 740         If 'download', also downloads the videos.
 741         extra_info is a dict containing the extra values to add to each result
 742         '''
 743
 744         if not ie_key and force_generic_extractor:
 745             ie_key = 'Generic'
 746
 747         if ie_key:
 748             ies = [self.get_info_extractor(ie_key)]
 749         else:
 750             ies = self._ies
 751
 752         for ie in ies:
 753             if not ie.suitable(url):
 754                 continue
 755
 756             ie = self.get_info_extractor(ie.ie_key())
 757             if not ie.working():
 758                 self.report_warning('The program functionality for this site has been marked as broken, '
 759                                     'and will probably not work.')
 760
 761             try:
 762                 ie_result = ie.extract(url)
 763                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
 764                     break
 765                 if isinstance(ie_result, list):
 766                     # Backwards compatibility: old IE result format
 767                     ie_result = {
 768                         '_type': 'compat_list',
 769                         'entries': ie_result,
 770                     }
 771                 self.add_default_extra_info(ie_result, ie, url)
 772                 if process:
 773                     return self.process_ie_result(ie_result, download, extra_info)
 774                 else:
 775                     return ie_result
 776             except GeoRestrictedError as e:
 777                 msg = e.msg
 778                 if e.countries:
 779                     msg += '\nThis video is available in %s.' % ', '.join(
 780                         map(ISO3166Utils.short2full, e.countries))
 781                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
 782                 self.report_error(msg)
 783                 break
 784             except ExtractorError as e:  # An error we somewhat expected
 785                 self.report_error(compat_str(e), e.format_traceback())
 786                 break
 787             except MaxDownloadsReached:
 788                 raise
 789             except Exception as e:
 790                 if self.params.get('ignoreerrors', False):
 791                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
 792                     break
 793                 else:
 794                     raise
 795         else:
 796             self.report_error('no suitable InfoExtractor for URL %s' % url)
 797
 798     def add_default_extra_info(self, ie_result, ie, url):
 799         self.add_extra_info(ie_result, {
 800             'extractor': ie.IE_NAME,
 801             'webpage_url': url,
 802             'webpage_url_basename': url_basename(url),
 803             'extractor_key': ie.ie_key(),
 804         })
 805
 806     def process_ie_result(self, ie_result, download=True, extra_info={}):
 807         """
 808         Take the result of the ie(may be modified) and resolve all unresolved
 809         references (URLs, playlist items).
 810
 811         It will also download the videos if 'download'.
 812         Returns the resolved ie_result.
 813         """
 814         result_type = ie_result.get('_type', 'video')
 815
 816         if result_type in ('url', 'url_transparent'):
 817             ie_result['url'] = sanitize_url(ie_result['url'])
 818             extract_flat = self.params.get('extract_flat', False)
 819             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
 820                     extract_flat is True):
 821                 if self.params.get('forcejson', False):
 822                     self.to_stdout(json.dumps(ie_result))
 823                 return ie_result
 824
 825         if result_type == 'video':
 826             self.add_extra_info(ie_result, extra_info)
 827             return self.process_video_result(ie_result, download=download)
 828         elif result_type == 'url':
 829             # We have to add extra_info to the results because it may be
 830             # contained in a playlist
 831             return self.extract_info(ie_result['url'],
 832                                      download,
 833                                      ie_key=ie_result.get('ie_key'),
 834                                      extra_info=extra_info)
 835         elif result_type == 'url_transparent':
 836             # Use the information from the embedding page
 837             info = self.extract_info(
 838                 ie_result['url'], ie_key=ie_result.get('ie_key'),
 839                 extra_info=extra_info, download=False, process=False)
 840
 841             # extract_info may return None when ignoreerrors is enabled and
 842             # extraction failed with an error, don't crash and return early
 843             # in this case
 844             if not info:
 845                 return info
 846
 847             force_properties = dict(
 848                 (k, v) for k, v in ie_result.items() if v is not None)
 849             for f in ('_type', 'url', 'ie_key'):
 850                 if f in force_properties:
 851                     del force_properties[f]
 852             new_result = info.copy()
 853             new_result.update(force_properties)
 854
 855             # Extracted info may not be a video result (i.e.
 856             # info.get('_type', 'video') != video) but rather an url or
 857             # url_transparent. In such cases outer metadata (from ie_result)
 858             # should be propagated to inner one (info). For this to happen
 859             # _type of info should be overridden with url_transparent. This
 860             # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
 861             if new_result.get('_type') == 'url':
 862                 new_result['_type'] = 'url_transparent'
 863
 864             return self.process_ie_result(
 865                 new_result, download=download, extra_info=extra_info)
 866         elif result_type in ('playlist', 'multi_video'):
 867             # We process each entry in the playlist
 868             playlist = ie_result.get('title') or ie_result.get('id')
 869             self.to_screen('[download] Downloading playlist: %s' % playlist)
 870
 871             playlist_results = []
 872
 873             playliststart = self.params.get('playliststart', 1) - 1
 874             playlistend = self.params.get('playlistend')
 875             # For backwards compatibility, interpret -1 as whole list
 876             if playlistend == -1:
 877                 playlistend = None
 878
 879             playlistitems_str = self.params.get('playlist_items')
 880             playlistitems = None
 881             if playlistitems_str is not None:
 882                 def iter_playlistitems(format):
 883                     for string_segment in format.split(','):
 884                         if '-' in string_segment:
 885                             start, end = string_segment.split('-')
 886                             for item in range(int(start), int(end) + 1):
 887                                 yield int(item)
 888                         else:
 889                             yield int(string_segment)
 890                 playlistitems = iter_playlistitems(playlistitems_str)
 891
 892             ie_entries = ie_result['entries']
 893             if isinstance(ie_entries, list):
 894                 n_all_entries = len(ie_entries)
 895                 if playlistitems:
 896                     entries = [
 897                         ie_entries[i - 1] for i in playlistitems
 898                         if -n_all_entries <= i - 1 < n_all_entries]
 899                 else:
 900                     entries = ie_entries[playliststart:playlistend]
 901                 n_entries = len(entries)
 902                 self.to_screen(
 903                     '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
 904                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
 905             elif isinstance(ie_entries, PagedList):
 906                 if playlistitems:
 907                     entries = []
 908                     for item in playlistitems:
 909                         entries.extend(ie_entries.getslice(
 910                             item - 1, item
 911                         ))
 912                 else:
 913                     entries = ie_entries.getslice(
 914                         playliststart, playlistend)
 915                 n_entries = len(entries)
 916                 self.to_screen(
 917                     '[%s] playlist %s: Downloading %d videos' %
 918                     (ie_result['extractor'], playlist, n_entries))
 919             else:  # iterable
 920                 if playlistitems:
 921                     entry_list = list(ie_entries)
 922                     entries = [entry_list[i - 1] for i in playlistitems]
 923                 else:
 924                     entries = list(itertools.islice(
 925                         ie_entries, playliststart, playlistend))
 926                 n_entries = len(entries)
 927                 self.to_screen(
 928                     '[%s] playlist %s: Downloading %d videos' %
 929                     (ie_result['extractor'], playlist, n_entries))
 930
 931             if self.params.get('playlistreverse', False):
 932                 entries = entries[::-1]
 933
 934             if self.params.get('playlistrandom', False):
 935                 random.shuffle(entries)
 936
 937             x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
 938
 939             for i, entry in enumerate(entries, 1):
 940                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
 941                 # This __x_forwarded_for_ip thing is a bit ugly but requires
 942                 # minimal changes
 943                 if x_forwarded_for:
 944                     entry['__x_forwarded_for_ip'] = x_forwarded_for
 945                 extra = {
 946                     'n_entries': n_entries,
 947                     'playlist': playlist,
 948                     'playlist_id': ie_result.get('id'),
 949                     'playlist_title': ie_result.get('title'),
 950                     'playlist_index': i + playliststart,
 951                     'extractor': ie_result['extractor'],
 952                     'webpage_url': ie_result['webpage_url'],
 953                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
 954                     'extractor_key': ie_result['extractor_key'],
 955                 }
 956
 957                 reason = self._match_entry(entry, incomplete=True)
 958                 if reason is not None:
 959                     self.to_screen('[download] ' + reason)
 960                     continue
 961
 962                 entry_result = self.process_ie_result(entry,
 963                                                       download=download,
 964                                                       extra_info=extra)
 965                 playlist_results.append(entry_result)
 966             ie_result['entries'] = playlist_results
 967             self.to_screen('[download] Finished downloading playlist: %s' % playlist)
 968             return ie_result
 969         elif result_type == 'compat_list':
 970             self.report_warning(
 971                 'Extractor %s returned a compat_list result. '
 972                 'It needs to be updated.' % ie_result.get('extractor'))
 973
 974             def _fixup(r):
 975                 self.add_extra_info(
 976                     r,
 977                     {
 978                         'extractor': ie_result['extractor'],
 979                         'webpage_url': ie_result['webpage_url'],
 980                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
 981                         'extractor_key': ie_result['extractor_key'],
 982                     }
 983                 )
 984                 return r
 985             ie_result['entries'] = [
 986                 self.process_ie_result(_fixup(r), download, extra_info)
 987                 for r in ie_result['entries']
 988             ]
 989             return ie_result
 990         else:
 991             raise Exception('Invalid result type: %s' % result_type)
 992
 993     def _build_format_filter(self, filter_spec):
 994         " Returns a function to filter the formats according to the filter_spec "
 995
 996         OPERATORS = {
 997             '<': operator.lt,
 998             '<=': operator.le,
 999             '>': operator.gt,
1000             '>=': operator.ge,
1001             '=': operator.eq,
1002             '!=': operator.ne,
1003         }
1004         operator_rex = re.compile(r'''(?x)\s*
1005             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1006             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1007             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1008             $
1009             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1010         m = operator_rex.search(filter_spec)
1011         if m:
1012             try:
1013                 comparison_value = int(m.group('value'))
1014             except ValueError:
1015                 comparison_value = parse_filesize(m.group('value'))
1016                 if comparison_value is None:
1017                     comparison_value = parse_filesize(m.group('value') + 'B')
1018                 if comparison_value is None:
1019                     raise ValueError(
1020                         'Invalid value %r in format specification %r' % (
1021                             m.group('value'), filter_spec))
1022             op = OPERATORS[m.group('op')]
1023
1024         if not m:
1025             STR_OPERATORS = {
1026                 '=': operator.eq,
1027                 '!=': operator.ne,
1028                 '^=': lambda attr, value: attr.startswith(value),
1029                 '$=': lambda attr, value: attr.endswith(value),
1030                 '*=': lambda attr, value: value in attr,
1031             }
1032             str_operator_rex = re.compile(r'''(?x)
1033                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1034                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1035                 \s*(?P<value>[a-zA-Z0-9._-]+)
1036                 \s*$
1037                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1038             m = str_operator_rex.search(filter_spec)
1039             if m:
1040                 comparison_value = m.group('value')
1041                 op = STR_OPERATORS[m.group('op')]
1042
1043         if not m:
1044             raise ValueError('Invalid filter specification %r' % filter_spec)
1045
1046         def _filter(f):
1047             actual_value = f.get(m.group('key'))
1048             if actual_value is None:
1049                 return m.group('none_inclusive')
1050             return op(actual_value, comparison_value)
1051         return _filter
1052
1053     def build_format_selector(self, format_spec):
1054         def syntax_error(note, start):
1055             message = (
1056                 'Invalid format specification: '
1057                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1058             return SyntaxError(message)
1059
1060         PICKFIRST = 'PICKFIRST'
1061         MERGE = 'MERGE'
1062         SINGLE = 'SINGLE'
1063         GROUP = 'GROUP'
1064         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1065
1066         def _parse_filter(tokens):
1067             filter_parts = []
1068             for type, string, start, _, _ in tokens:
1069                 if type == tokenize.OP and string == ']':
1070                     return ''.join(filter_parts)
1071                 else:
1072                     filter_parts.append(string)
1073
1074         def _remove_unused_ops(tokens):
1075             # Remove operators that we don't use and join them with the surrounding strings
1076             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1077             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1078             last_string, last_start, last_end, last_line = None, None, None, None
1079             for type, string, start, end, line in tokens:
1080                 if type == tokenize.OP and string == '[':
1081                     if last_string:
1082                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1083                         last_string = None
1084                     yield type, string, start, end, line
1085                     # everything inside brackets will be handled by _parse_filter
1086                     for type, string, start, end, line in tokens:
1087                         yield type, string, start, end, line
1088                         if type == tokenize.OP and string == ']':
1089                             break
1090                 elif type == tokenize.OP and string in ALLOWED_OPS:
1091                     if last_string:
1092                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1093                         last_string = None
1094                     yield type, string, start, end, line
1095                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1096                     if not last_string:
1097                         last_string = string
1098                         last_start = start
1099                         last_end = end
1100                     else:
1101                         last_string += string
1102             if last_string:
1103                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1104
1105         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1106             selectors = []
1107             current_selector = None
1108             for type, string, start, _, _ in tokens:
1109                 # ENCODING is only defined in python 3.x
1110                 if type == getattr(tokenize, 'ENCODING', None):
1111                     continue
1112                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1113                     current_selector = FormatSelector(SINGLE, string, [])
1114                 elif type == tokenize.OP:
1115                     if string == ')':
1116                         if not inside_group:
1117                             # ')' will be handled by the parentheses group
1118                             tokens.restore_last_token()
1119                         break
1120                     elif inside_merge and string in ['/', ',']:
1121                         tokens.restore_last_token()
1122                         break
1123                     elif inside_choice and string == ',':
1124                         tokens.restore_last_token()
1125                         break
1126                     elif string == ',':
1127                         if not current_selector:
1128                             raise syntax_error('"," must follow a format selector', start)
1129                         selectors.append(current_selector)
1130                         current_selector = None
1131                     elif string == '/':
1132                         if not current_selector:
1133                             raise syntax_error('"/" must follow a format selector', start)
1134                         first_choice = current_selector
1135                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1136                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1137                     elif string == '[':
1138                         if not current_selector:
1139                             current_selector = FormatSelector(SINGLE, 'best', [])
1140                         format_filter = _parse_filter(tokens)
1141                         current_selector.filters.append(format_filter)
1142                     elif string == '(':
1143                         if current_selector:
1144                             raise syntax_error('Unexpected "("', start)
1145                         group = _parse_format_selection(tokens, inside_group=True)
1146                         current_selector = FormatSelector(GROUP, group, [])
1147                     elif string == '+':
1148                         video_selector = current_selector
1149                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1150                         if not video_selector or not audio_selector:
1151                             raise syntax_error('"+" must be between two format selectors', start)
1152                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1153                     else:
1154                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1155                 elif type == tokenize.ENDMARKER:
1156                     break
1157             if current_selector:
1158                 selectors.append(current_selector)
1159             return selectors
1160
1161         def _build_selector_function(selector):
1162             if isinstance(selector, list):
1163                 fs = [_build_selector_function(s) for s in selector]
1164
1165                 def selector_function(ctx):
1166                     for f in fs:
1167                         for format in f(ctx):
1168                             yield format
1169                 return selector_function
1170             elif selector.type == GROUP:
1171                 selector_function = _build_selector_function(selector.selector)
1172             elif selector.type == PICKFIRST:
1173                 fs = [_build_selector_function(s) for s in selector.selector]
1174
1175                 def selector_function(ctx):
1176                     for f in fs:
1177                         picked_formats = list(f(ctx))
1178                         if picked_formats:
1179                             return picked_formats
1180                     return []
1181             elif selector.type == SINGLE:
1182                 format_spec = selector.selector
1183
1184                 def selector_function(ctx):
1185                     formats = list(ctx['formats'])
1186                     if not formats:
1187                         return
1188                     if format_spec == 'all':
1189                         for f in formats:
1190                             yield f
1191                     elif format_spec in ['best', 'worst', None]:
1192                         format_idx = 0 if format_spec == 'worst' else -1
1193                         audiovideo_formats = [
1194                             f for f in formats
1195                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1196                         if audiovideo_formats:
1197                             yield audiovideo_formats[format_idx]
1198                         # for extractors with incomplete formats (audio only (soundcloud)
1199                         # or video only (imgur)) we will fallback to best/worst
1200                         # {video,audio}-only format
1201                         elif ctx['incomplete_formats']:
1202                             yield formats[format_idx]
1203                     elif format_spec == 'bestaudio':
1204                         audio_formats = [
1205                             f for f in formats
1206                             if f.get('vcodec') == 'none']
1207                         if audio_formats:
1208                             yield audio_formats[-1]
1209                     elif format_spec == 'worstaudio':
1210                         audio_formats = [
1211                             f for f in formats
1212                             if f.get('vcodec') == 'none']
1213                         if audio_formats:
1214                             yield audio_formats[0]
1215                     elif format_spec == 'bestvideo':
1216                         video_formats = [
1217                             f for f in formats
1218                             if f.get('acodec') == 'none']
1219                         if video_formats:
1220                             yield video_formats[-1]
1221                     elif format_spec == 'worstvideo':
1222                         video_formats = [
1223                             f for f in formats
1224                             if f.get('acodec') == 'none']
1225                         if video_formats:
1226                             yield video_formats[0]
1227                     else:
1228                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1229                         if format_spec in extensions:
1230                             filter_f = lambda f: f['ext'] == format_spec
1231                         else:
1232                             filter_f = lambda f: f['format_id'] == format_spec
1233                         matches = list(filter(filter_f, formats))
1234                         if matches:
1235                             yield matches[-1]
1236             elif selector.type == MERGE:
1237                 def _merge(formats_info):
1238                     format_1, format_2 = [f['format_id'] for f in formats_info]
1239                     # The first format must contain the video and the
1240                     # second the audio
1241                     if formats_info[0].get('vcodec') == 'none':
1242                         self.report_error('The first format must '
1243                                           'contain the video, try using '
1244                                           '"-f %s+%s"' % (format_2, format_1))
1245                         return
1246                     # Formats must be opposite (video+audio)
1247                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1248                         self.report_error(
1249                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1250                             % (format_1, format_2))
1251                         return
1252                     output_ext = (
1253                         formats_info[0]['ext']
1254                         if self.params.get('merge_output_format') is None
1255                         else self.params['merge_output_format'])
1256                     return {
1257                         'requested_formats': formats_info,
1258                         'format': '%s+%s' % (formats_info[0].get('format'),
1259                                              formats_info[1].get('format')),
1260                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1261                                                 formats_info[1].get('format_id')),
1262                         'width': formats_info[0].get('width'),
1263                         'height': formats_info[0].get('height'),
1264                         'resolution': formats_info[0].get('resolution'),
1265                         'fps': formats_info[0].get('fps'),
1266                         'vcodec': formats_info[0].get('vcodec'),
1267                         'vbr': formats_info[0].get('vbr'),
1268                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1269                         'acodec': formats_info[1].get('acodec'),
1270                         'abr': formats_info[1].get('abr'),
1271                         'ext': output_ext,
1272                     }
1273                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1274
1275                 def selector_function(ctx):
1276                     for pair in itertools.product(
1277                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1278                         yield _merge(pair)
1279
1280             filters = [self._build_format_filter(f) for f in selector.filters]
1281
1282             def final_selector(ctx):
1283                 ctx_copy = copy.deepcopy(ctx)
1284                 for _filter in filters:
1285                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1286                 return selector_function(ctx_copy)
1287             return final_selector
1288
1289         stream = io.BytesIO(format_spec.encode('utf-8'))
1290         try:
1291             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1292         except tokenize.TokenError:
1293             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1294
1295         class TokenIterator(object):
1296             def __init__(self, tokens):
1297                 self.tokens = tokens
1298                 self.counter = 0
1299
1300             def __iter__(self):
1301                 return self
1302
1303             def __next__(self):
1304                 if self.counter >= len(self.tokens):
1305                     raise StopIteration()
1306                 value = self.tokens[self.counter]
1307                 self.counter += 1
1308                 return value
1309
1310             next = __next__
1311
1312             def restore_last_token(self):
1313                 self.counter -= 1
1314
1315         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1316         return _build_selector_function(parsed_selector)
1317
1318     def _calc_headers(self, info_dict):
1319         res = std_headers.copy()
1320
1321         add_headers = info_dict.get('http_headers')
1322         if add_headers:
1323             res.update(add_headers)
1324
1325         cookies = self._calc_cookies(info_dict)
1326         if cookies:
1327             res['Cookie'] = cookies
1328
1329         if 'X-Forwarded-For' not in res:
1330             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1331             if x_forwarded_for_ip:
1332                 res['X-Forwarded-For'] = x_forwarded_for_ip
1333
1334         return res
1335
1336     def _calc_cookies(self, info_dict):
1337         pr = sanitized_Request(info_dict['url'])
1338         self.cookiejar.add_cookie_header(pr)
1339         return pr.get_header('Cookie')
1340
1341     def process_video_result(self, info_dict, download=True):
1342         assert info_dict.get('_type', 'video') == 'video'
1343
1344         if 'id' not in info_dict:
1345             raise ExtractorError('Missing "id" field in extractor result')
1346         if 'title' not in info_dict:
1347             raise ExtractorError('Missing "title" field in extractor result')
1348
1349         def report_force_conversion(field, field_not, conversion):
1350             self.report_warning(
1351                 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
1352                 % (field, field_not, conversion))
1353
1354         def sanitize_string_field(info, string_field):
1355             field = info.get(string_field)
1356             if field is None or isinstance(field, compat_str):
1357                 return
1358             report_force_conversion(string_field, 'a string', 'string')
1359             info[string_field] = compat_str(field)
1360
1361         def sanitize_numeric_fields(info):
1362             for numeric_field in self._NUMERIC_FIELDS:
1363                 field = info.get(numeric_field)
1364                 if field is None or isinstance(field, compat_numeric_types):
1365                     continue
1366                 report_force_conversion(numeric_field, 'numeric', 'int')
1367                 info[numeric_field] = int_or_none(field)
1368
1369         sanitize_string_field(info_dict, 'id')
1370         sanitize_numeric_fields(info_dict)
1371
1372         if 'playlist' not in info_dict:
1373             # It isn't part of a playlist
1374             info_dict['playlist'] = None
1375             info_dict['playlist_index'] = None
1376
1377         thumbnails = info_dict.get('thumbnails')
1378         if thumbnails is None:
1379             thumbnail = info_dict.get('thumbnail')
1380             if thumbnail:
1381                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1382         if thumbnails:
1383             thumbnails.sort(key=lambda t: (
1384                 t.get('preference') if t.get('preference') is not None else -1,
1385                 t.get('width') if t.get('width') is not None else -1,
1386                 t.get('height') if t.get('height') is not None else -1,
1387                 t.get('id') if t.get('id') is not None else '', t.get('url')))
1388             for i, t in enumerate(thumbnails):
1389                 t['url'] = sanitize_url(t['url'])
1390                 if t.get('width') and t.get('height'):
1391                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1392                 if t.get('id') is None:
1393                     t['id'] = '%d' % i
1394
1395         if self.params.get('list_thumbnails'):
1396             self.list_thumbnails(info_dict)
1397             return
1398
1399         thumbnail = info_dict.get('thumbnail')
1400         if thumbnail:
1401             info_dict['thumbnail'] = sanitize_url(thumbnail)
1402         elif thumbnails:
1403             info_dict['thumbnail'] = thumbnails[-1]['url']
1404
1405         if 'display_id' not in info_dict and 'id' in info_dict:
1406             info_dict['display_id'] = info_dict['id']
1407
1408         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1409             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1410             # see http://bugs.python.org/issue1646728)
1411             try:
1412                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1413                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1414             except (ValueError, OverflowError, OSError):
1415                 pass
1416
1417         # Auto generate title fields corresponding to the *_number fields when missing
1418         # in order to always have clean titles. This is very common for TV series.
1419         for field in ('chapter', 'season', 'episode'):
1420             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1421                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1422
1423         subtitles = info_dict.get('subtitles')
1424         if subtitles:
1425             for _, subtitle in subtitles.items():
1426                 for subtitle_format in subtitle:
1427                     if subtitle_format.get('url'):
1428                         subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1429                     if subtitle_format.get('ext') is None:
1430                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1431
1432         if self.params.get('listsubtitles', False):
1433             if 'automatic_captions' in info_dict:
1434                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1435             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1436             return
1437         info_dict['requested_subtitles'] = self.process_subtitles(
1438             info_dict['id'], subtitles,
1439             info_dict.get('automatic_captions'))
1440
1441         # We now pick which formats have to be downloaded
1442         if info_dict.get('formats') is None:
1443             # There's only one format available
1444             formats = [info_dict]
1445         else:
1446             formats = info_dict['formats']
1447
1448         if not formats:
1449             raise ExtractorError('No video formats found!')
1450
1451         formats_dict = {}
1452
1453         # We check that all the formats have the format and format_id fields
1454         for i, format in enumerate(formats):
1455             if 'url' not in format:
1456                 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1457
1458             sanitize_string_field(format, 'format_id')
1459             sanitize_numeric_fields(format)
1460             format['url'] = sanitize_url(format['url'])
1461
1462             if format.get('format_id') is None:
1463                 format['format_id'] = compat_str(i)
1464             else:
1465                 # Sanitize format_id from characters used in format selector expression
1466                 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
1467             format_id = format['format_id']
1468             if format_id not in formats_dict:
1469                 formats_dict[format_id] = []
1470             formats_dict[format_id].append(format)
1471
1472         # Make sure all formats have unique format_id
1473         for format_id, ambiguous_formats in formats_dict.items():
1474             if len(ambiguous_formats) > 1:
1475                 for i, format in enumerate(ambiguous_formats):
1476                     format['format_id'] = '%s-%d' % (format_id, i)
1477
1478         for i, format in enumerate(formats):
1479             if format.get('format') is None:
1480                 format['format'] = '{id} - {res}{note}'.format(
1481                     id=format['format_id'],
1482                     res=self.format_resolution(format),
1483                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1484                 )
1485             # Automatically determine file extension if missing
1486             if format.get('ext') is None:
1487                 format['ext'] = determine_ext(format['url']).lower()
1488             # Automatically determine protocol if missing (useful for format
1489             # selection purposes)
1490             if format.get('protocol') is None:
1491                 format['protocol'] = determine_protocol(format)
1492             # Add HTTP headers, so that external programs can use them from the
1493             # json output
1494             full_format_info = info_dict.copy()
1495             full_format_info.update(format)
1496             format['http_headers'] = self._calc_headers(full_format_info)
1497         # Remove private housekeeping stuff
1498         if '__x_forwarded_for_ip' in info_dict:
1499             del info_dict['__x_forwarded_for_ip']
1500
1501         # TODO Central sorting goes here
1502
1503         if formats[0] is not info_dict:
1504             # only set the 'formats' fields if the original info_dict list them
1505             # otherwise we end up with a circular reference, the first (and unique)
1506             # element in the 'formats' field in info_dict is info_dict itself,
1507             # which can't be exported to json
1508             info_dict['formats'] = formats
1509         if self.params.get('listformats'):
1510             self.list_formats(info_dict)
1511             return
1512
1513         req_format = self.params.get('format')
1514         if req_format is None:
1515             req_format_list = []
1516             if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
1517                     not info_dict.get('is_live')):
1518                 merger = FFmpegMergerPP(self)
1519                 if merger.available and merger.can_merge():
1520                     req_format_list.append('bestvideo+bestaudio')
1521             req_format_list.append('best')
1522             req_format = '/'.join(req_format_list)
1523         format_selector = self.build_format_selector(req_format)
1524
1525         # While in format selection we may need to have an access to the original
1526         # format set in order to calculate some metrics or do some processing.
1527         # For now we need to be able to guess whether original formats provided
1528         # by extractor are incomplete or not (i.e. whether extractor provides only
1529         # video-only or audio-only formats) for proper formats selection for
1530         # extractors with such incomplete formats (see
1531         # https://github.com/rg3/youtube-dl/pull/5556).
1532         # Since formats may be filtered during format selection and may not match
1533         # the original formats the results may be incorrect. Thus original formats
1534         # or pre-calculated metrics should be passed to format selection routines
1535         # as well.
1536         # We will pass a context object containing all necessary additional data
1537         # instead of just formats.
1538         # This fixes incorrect format selection issue (see
1539         # https://github.com/rg3/youtube-dl/issues/10083).
1540         incomplete_formats = (
1541             # All formats are video-only or
1542             all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
1543             # all formats are audio-only
1544             all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
1545
1546         ctx = {
1547             'formats': formats,
1548             'incomplete_formats': incomplete_formats,
1549         }
1550
1551         formats_to_download = list(format_selector(ctx))
1552         if not formats_to_download:
1553             raise ExtractorError('requested format not available',
1554                                  expected=True)
1555
1556         if download:
1557             if len(formats_to_download) > 1:
1558                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1559             for format in formats_to_download:
1560                 new_info = dict(info_dict)
1561                 new_info.update(format)
1562                 self.process_info(new_info)
1563         # We update the info dict with the best quality format (backwards compatibility)
1564         info_dict.update(formats_to_download[-1])
1565         return info_dict
1566
1567     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1568         """Select the requested subtitles and their format"""
1569         available_subs = {}
1570         if normal_subtitles and self.params.get('writesubtitles'):
1571             available_subs.update(normal_subtitles)
1572         if automatic_captions and self.params.get('writeautomaticsub'):
1573             for lang, cap_info in automatic_captions.items():
1574                 if lang not in available_subs:
1575                     available_subs[lang] = cap_info
1576
1577         if (not self.params.get('writesubtitles') and not
1578                 self.params.get('writeautomaticsub') or not
1579                 available_subs):
1580             return None
1581
1582         if self.params.get('allsubtitles', False):
1583             requested_langs = available_subs.keys()
1584         else:
1585             if self.params.get('subtitleslangs', False):
1586                 requested_langs = self.params.get('subtitleslangs')
1587             elif 'en' in available_subs:
1588                 requested_langs = ['en']
1589             else:
1590                 requested_langs = [list(available_subs.keys())[0]]
1591
1592         formats_query = self.params.get('subtitlesformat', 'best')
1593         formats_preference = formats_query.split('/') if formats_query else []
1594         subs = {}
1595         for lang in requested_langs:
1596             formats = available_subs.get(lang)
1597             if formats is None:
1598                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1599                 continue
1600             for ext in formats_preference:
1601                 if ext == 'best':
1602                     f = formats[-1]
1603                     break
1604                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1605                 if matches:
1606                     f = matches[-1]
1607                     break
1608             else:
1609                 f = formats[-1]
1610                 self.report_warning(
1611                     'No subtitle format found matching "%s" for language %s, '
1612                     'using %s' % (formats_query, lang, f['ext']))
1613             subs[lang] = f
1614         return subs
1615
1616     def process_info(self, info_dict):
1617         """Process a single resolved IE result."""
1618
1619         assert info_dict.get('_type', 'video') == 'video'
1620
1621         max_downloads = self.params.get('max_downloads')
1622         if max_downloads is not None:
1623             if self._num_downloads >= int(max_downloads):
1624                 raise MaxDownloadsReached()
1625
1626         info_dict['fulltitle'] = info_dict['title']
1627         if len(info_dict['title']) > 200:
1628             info_dict['title'] = info_dict['title'][:197] + '...'
1629
1630         if 'format' not in info_dict:
1631             info_dict['format'] = info_dict['ext']
1632
1633         reason = self._match_entry(info_dict, incomplete=False)
1634         if reason is not None:
1635             self.to_screen('[download] ' + reason)
1636             return
1637
1638         self._num_downloads += 1
1639
1640         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1641
1642         # Forced printings
1643         if self.params.get('forcetitle', False):
1644             self.to_stdout(info_dict['fulltitle'])
1645         if self.params.get('forceid', False):
1646             self.to_stdout(info_dict['id'])
1647         if self.params.get('forceurl', False):
1648             if info_dict.get('requested_formats') is not None:
1649                 for f in info_dict['requested_formats']:
1650                     self.to_stdout(f['url'] + f.get('play_path', ''))
1651             else:
1652                 # For RTMP URLs, also include the playpath
1653                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1654         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1655             self.to_stdout(info_dict['thumbnail'])
1656         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1657             self.to_stdout(info_dict['description'])
1658         if self.params.get('forcefilename', False) and filename is not None:
1659             self.to_stdout(filename)
1660         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1661             self.to_stdout(formatSeconds(info_dict['duration']))
1662         if self.params.get('forceformat', False):
1663             self.to_stdout(info_dict['format'])
1664         if self.params.get('forcejson', False):
1665             self.to_stdout(json.dumps(info_dict))
1666
1667         # Do nothing else if in simulate mode
1668         if self.params.get('simulate', False):
1669             return
1670
1671         if filename is None:
1672             return
1673
1674         try:
1675             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1676             if dn and not os.path.exists(dn):
1677                 os.makedirs(dn)
1678         except (OSError, IOError) as err:
1679             self.report_error('unable to create directory ' + error_to_compat_str(err))
1680             return
1681
1682         if self.params.get('writedescription', False):
1683             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1684             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1685                 self.to_screen('[info] Video description is already present')
1686             elif info_dict.get('description') is None:
1687                 self.report_warning('There\'s no description to write.')
1688             else:
1689                 try:
1690                     self.to_screen('[info] Writing video description to: ' + descfn)
1691                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1692                         descfile.write(info_dict['description'])
1693                 except (OSError, IOError):
1694                     self.report_error('Cannot write description file ' + descfn)
1695                     return
1696
1697         if self.params.get('writeannotations', False):
1698             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1699             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1700                 self.to_screen('[info] Video annotations are already present')
1701             else:
1702                 try:
1703                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1704                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1705                         annofile.write(info_dict['annotations'])
1706                 except (KeyError, TypeError):
1707                     self.report_warning('There are no annotations to write.')
1708                 except (OSError, IOError):
1709                     self.report_error('Cannot write annotations file: ' + annofn)
1710                     return
1711
1712         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1713                                        self.params.get('writeautomaticsub')])
1714
1715         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1716             # subtitles download errors are already managed as troubles in relevant IE
1717             # that way it will silently go on when used with unsupporting IE
1718             subtitles = info_dict['requested_subtitles']
1719             ie = self.get_info_extractor(info_dict['extractor_key'])
1720             for sub_lang, sub_info in subtitles.items():
1721                 sub_format = sub_info['ext']
1722                 if sub_info.get('data') is not None:
1723                     sub_data = sub_info['data']
1724                 else:
1725                     try:
1726                         sub_data = ie._download_webpage(
1727                             sub_info['url'], info_dict['id'], note=False)
1728                     except ExtractorError as err:
1729                         self.report_warning('Unable to download subtitle for "%s": %s' %
1730                                             (sub_lang, error_to_compat_str(err.cause)))
1731                         continue
1732                 try:
1733                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1734                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1735                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1736                     else:
1737                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1738                         # Use newline='' to prevent conversion of newline characters
1739                         # See https://github.com/rg3/youtube-dl/issues/10268
1740                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1741                             subfile.write(sub_data)
1742                 except (OSError, IOError):
1743                     self.report_error('Cannot write subtitles file ' + sub_filename)
1744                     return
1745
1746         if self.params.get('writeinfojson', False):
1747             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1748             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1749                 self.to_screen('[info] Video description metadata is already present')
1750             else:
1751                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1752                 try:
1753                     write_json_file(self.filter_requested_info(info_dict), infofn)
1754                 except (OSError, IOError):
1755                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1756                     return
1757
1758         self._write_thumbnails(info_dict, filename)
1759
1760         if not self.params.get('skip_download', False):
1761             try:
1762                 def dl(name, info):
1763                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1764                     for ph in self._progress_hooks:
1765                         fd.add_progress_hook(ph)
1766                     if self.params.get('verbose'):
1767                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1768                     return fd.download(name, info)
1769
1770                 if info_dict.get('requested_formats') is not None:
1771                     downloaded = []
1772                     success = True
1773                     merger = FFmpegMergerPP(self)
1774                     if not merger.available:
1775                         postprocessors = []
1776                         self.report_warning('You have requested multiple '
1777                                             'formats but ffmpeg or avconv are not installed.'
1778                                             ' The formats won\'t be merged.')
1779                     else:
1780                         postprocessors = [merger]
1781
1782                     def compatible_formats(formats):
1783                         video, audio = formats
1784                         # Check extension
1785                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1786                         if video_ext and audio_ext:
1787                             COMPATIBLE_EXTS = (
1788                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1789                                 ('webm')
1790                             )
1791                             for exts in COMPATIBLE_EXTS:
1792                                 if video_ext in exts and audio_ext in exts:
1793                                     return True
1794                         # TODO: Check acodec/vcodec
1795                         return False
1796
1797                     filename_real_ext = os.path.splitext(filename)[1][1:]
1798                     filename_wo_ext = (
1799                         os.path.splitext(filename)[0]
1800                         if filename_real_ext == info_dict['ext']
1801                         else filename)
1802                     requested_formats = info_dict['requested_formats']
1803                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1804                         info_dict['ext'] = 'mkv'
1805                         self.report_warning(
1806                             'Requested formats are incompatible for merge and will be merged into mkv.')
1807                     # Ensure filename always has a correct extension for successful merge
1808                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1809                     if os.path.exists(encodeFilename(filename)):
1810                         self.to_screen(
1811                             '[download] %s has already been downloaded and '
1812                             'merged' % filename)
1813                     else:
1814                         for f in requested_formats:
1815                             new_info = dict(info_dict)
1816                             new_info.update(f)
1817                             fname = self.prepare_filename(new_info)
1818                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1819                             downloaded.append(fname)
1820                             partial_success = dl(fname, new_info)
1821                             success = success and partial_success
1822                         info_dict['__postprocessors'] = postprocessors
1823                         info_dict['__files_to_merge'] = downloaded
1824                 else:
1825                     # Just a single file
1826                     success = dl(filename, info_dict)
1827             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1828                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1829                 return
1830             except (OSError, IOError) as err:
1831                 raise UnavailableVideoError(err)
1832             except (ContentTooShortError, ) as err:
1833                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1834                 return
1835
1836             if success and filename != '-':
1837                 # Fixup content
1838                 fixup_policy = self.params.get('fixup')
1839                 if fixup_policy is None:
1840                     fixup_policy = 'detect_or_warn'
1841
1842                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1843
1844                 stretched_ratio = info_dict.get('stretched_ratio')
1845                 if stretched_ratio is not None and stretched_ratio != 1:
1846                     if fixup_policy == 'warn':
1847                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1848                             info_dict['id'], stretched_ratio))
1849                     elif fixup_policy == 'detect_or_warn':
1850                         stretched_pp = FFmpegFixupStretchedPP(self)
1851                         if stretched_pp.available:
1852                             info_dict.setdefault('__postprocessors', [])
1853                             info_dict['__postprocessors'].append(stretched_pp)
1854                         else:
1855                             self.report_warning(
1856                                 '%s: Non-uniform pixel ratio (%s). %s'
1857                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1858                     else:
1859                         assert fixup_policy in ('ignore', 'never')
1860
1861                 if (info_dict.get('requested_formats') is None and
1862                         info_dict.get('container') == 'm4a_dash'):
1863                     if fixup_policy == 'warn':
1864                         self.report_warning(
1865                             '%s: writing DASH m4a. '
1866                             'Only some players support this container.'
1867                             % info_dict['id'])
1868                     elif fixup_policy == 'detect_or_warn':
1869                         fixup_pp = FFmpegFixupM4aPP(self)
1870                         if fixup_pp.available:
1871                             info_dict.setdefault('__postprocessors', [])
1872                             info_dict['__postprocessors'].append(fixup_pp)
1873                         else:
1874                             self.report_warning(
1875                                 '%s: writing DASH m4a. '
1876                                 'Only some players support this container. %s'
1877                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1878                     else:
1879                         assert fixup_policy in ('ignore', 'never')
1880
1881                 if (info_dict.get('protocol') == 'm3u8_native' or
1882                         info_dict.get('protocol') == 'm3u8' and
1883                         self.params.get('hls_prefer_native')):
1884                     if fixup_policy == 'warn':
1885                         self.report_warning('%s: malformated aac bitstream.' % (
1886                             info_dict['id']))
1887                     elif fixup_policy == 'detect_or_warn':
1888                         fixup_pp = FFmpegFixupM3u8PP(self)
1889                         if fixup_pp.available:
1890                             info_dict.setdefault('__postprocessors', [])
1891                             info_dict['__postprocessors'].append(fixup_pp)
1892                         else:
1893                             self.report_warning(
1894                                 '%s: malformated aac bitstream. %s'
1895                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1896                     else:
1897                         assert fixup_policy in ('ignore', 'never')
1898
1899                 try:
1900                     self.post_process(filename, info_dict)
1901                 except (PostProcessingError) as err:
1902                     self.report_error('postprocessing: %s' % str(err))
1903                     return
1904                 self.record_download_archive(info_dict)
1905
1906     def download(self, url_list):
1907         """Download a given list of URLs."""
1908         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1909         if (len(url_list) > 1 and
1910                 outtmpl != '-' and
1911                 '%' not in outtmpl and
1912                 self.params.get('max_downloads') != 1):
1913             raise SameFileError(outtmpl)
1914
1915         for url in url_list:
1916             try:
1917                 # It also downloads the videos
1918                 res = self.extract_info(
1919                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1920             except UnavailableVideoError:
1921                 self.report_error('unable to download video')
1922             except MaxDownloadsReached:
1923                 self.to_screen('[info] Maximum number of downloaded files reached.')
1924                 raise
1925             else:
1926                 if self.params.get('dump_single_json', False):
1927                     self.to_stdout(json.dumps(res))
1928
1929         return self._download_retcode
1930
1931     def download_with_info_file(self, info_filename):
1932         with contextlib.closing(fileinput.FileInput(
1933                 [info_filename], mode='r',
1934                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1935             # FileInput doesn't have a read method, we can't call json.load
1936             info = self.filter_requested_info(json.loads('\n'.join(f)))
1937         try:
1938             self.process_ie_result(info, download=True)
1939         except DownloadError:
1940             webpage_url = info.get('webpage_url')
1941             if webpage_url is not None:
1942                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1943                 return self.download([webpage_url])
1944             else:
1945                 raise
1946         return self._download_retcode
1947
1948     @staticmethod
1949     def filter_requested_info(info_dict):
1950         return dict(
1951             (k, v) for k, v in info_dict.items()
1952             if k not in ['requested_formats', 'requested_subtitles'])
1953
1954     def post_process(self, filename, ie_info):
1955         """Run all the postprocessors on the given file."""
1956         info = dict(ie_info)
1957         info['filepath'] = filename
1958         pps_chain = []
1959         if ie_info.get('__postprocessors') is not None:
1960             pps_chain.extend(ie_info['__postprocessors'])
1961         pps_chain.extend(self._pps)
1962         for pp in pps_chain:
1963             files_to_delete = []
1964             try:
1965                 files_to_delete, info = pp.run(info)
1966             except PostProcessingError as e:
1967                 self.report_error(e.msg)
1968             if files_to_delete and not self.params.get('keepvideo', False):
1969                 for old_filename in files_to_delete:
1970                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1971                     try:
1972                         os.remove(encodeFilename(old_filename))
1973                     except (IOError, OSError):
1974                         self.report_warning('Unable to remove downloaded original file')
1975
1976     def _make_archive_id(self, info_dict):
1977         # Future-proof against any change in case
1978         # and backwards compatibility with prior versions
1979         extractor = info_dict.get('extractor_key')
1980         if extractor is None:
1981             if 'id' in info_dict:
1982                 extractor = info_dict.get('ie_key')  # key in a playlist
1983         if extractor is None:
1984             return None  # Incomplete video information
1985         return extractor.lower() + ' ' + info_dict['id']
1986
1987     def in_download_archive(self, info_dict):
1988         fn = self.params.get('download_archive')
1989         if fn is None:
1990             return False
1991
1992         vid_id = self._make_archive_id(info_dict)
1993         if vid_id is None:
1994             return False  # Incomplete video information
1995
1996         try:
1997             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1998                 for line in archive_file:
1999                     if line.strip() == vid_id:
2000                         return True
2001         except IOError as ioe:
2002             if ioe.errno != errno.ENOENT:
2003                 raise
2004         return False
2005
2006     def record_download_archive(self, info_dict):
2007         fn = self.params.get('download_archive')
2008         if fn is None:
2009             return
2010         vid_id = self._make_archive_id(info_dict)
2011         assert vid_id
2012         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2013             archive_file.write(vid_id + '\n')
2014
2015     @staticmethod
2016     def format_resolution(format, default='unknown'):
2017         if format.get('vcodec') == 'none':
2018             return 'audio only'
2019         if format.get('resolution') is not None:
2020             return format['resolution']
2021         if format.get('height') is not None:
2022             if format.get('width') is not None:
2023                 res = '%sx%s' % (format['width'], format['height'])
2024             else:
2025                 res = '%sp' % format['height']
2026         elif format.get('width') is not None:
2027             res = '%dx?' % format['width']
2028         else:
2029             res = default
2030         return res
2031
2032     def _format_note(self, fdict):
2033         res = ''
2034         if fdict.get('ext') in ['f4f', 'f4m']:
2035             res += '(unsupported) '
2036         if fdict.get('language'):
2037             if res:
2038                 res += ' '
2039             res += '[%s] ' % fdict['language']
2040         if fdict.get('format_note') is not None:
2041             res += fdict['format_note'] + ' '
2042         if fdict.get('tbr') is not None:
2043             res += '%4dk ' % fdict['tbr']
2044         if fdict.get('container') is not None:
2045             if res:
2046                 res += ', '
2047             res += '%s container' % fdict['container']
2048         if (fdict.get('vcodec') is not None and
2049                 fdict.get('vcodec') != 'none'):
2050             if res:
2051                 res += ', '
2052             res += fdict['vcodec']
2053             if fdict.get('vbr') is not None:
2054                 res += '@'
2055         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2056             res += 'video@'
2057         if fdict.get('vbr') is not None:
2058             res += '%4dk' % fdict['vbr']
2059         if fdict.get('fps') is not None:
2060             if res:
2061                 res += ', '
2062             res += '%sfps' % fdict['fps']
2063         if fdict.get('acodec') is not None:
2064             if res:
2065                 res += ', '
2066             if fdict['acodec'] == 'none':
2067                 res += 'video only'
2068             else:
2069                 res += '%-5s' % fdict['acodec']
2070         elif fdict.get('abr') is not None:
2071             if res:
2072                 res += ', '
2073             res += 'audio'
2074         if fdict.get('abr') is not None:
2075             res += '@%3dk' % fdict['abr']
2076         if fdict.get('asr') is not None:
2077             res += ' (%5dHz)' % fdict['asr']
2078         if fdict.get('filesize') is not None:
2079             if res:
2080                 res += ', '
2081             res += format_bytes(fdict['filesize'])
2082         elif fdict.get('filesize_approx') is not None:
2083             if res:
2084                 res += ', '
2085             res += '~' + format_bytes(fdict['filesize_approx'])
2086         return res
2087
2088     def list_formats(self, info_dict):
2089         formats = info_dict.get('formats', [info_dict])
2090         table = [
2091             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2092             for f in formats
2093             if f.get('preference') is None or f['preference'] >= -1000]
2094         if len(formats) > 1:
2095             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2096
2097         header_line = ['format code', 'extension', 'resolution', 'note']
2098         self.to_screen(
2099             '[info] Available formats for %s:\n%s' %
2100             (info_dict['id'], render_table(header_line, table)))
2101
2102     def list_thumbnails(self, info_dict):
2103         thumbnails = info_dict.get('thumbnails')
2104         if not thumbnails:
2105             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2106             return
2107
2108         self.to_screen(
2109             '[info] Thumbnails for %s:' % info_dict['id'])
2110         self.to_screen(render_table(
2111             ['ID', 'width', 'height', 'URL'],
2112             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2113
2114     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2115         if not subtitles:
2116             self.to_screen('%s has no %s' % (video_id, name))
2117             return
2118         self.to_screen(
2119             'Available %s for %s:' % (name, video_id))
2120         self.to_screen(render_table(
2121             ['Language', 'formats'],
2122             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2123                 for lang, formats in subtitles.items()]))
2124
2125     def urlopen(self, req):
2126         """ Start an HTTP download """
2127         if isinstance(req, compat_basestring):
2128             req = sanitized_Request(req)
2129         return self._opener.open(req, timeout=self._socket_timeout)
2130
2131     def print_debug_header(self):
2132         if not self.params.get('verbose'):
2133             return
2134
2135         if type('') is not compat_str:
2136             # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
2137             self.report_warning(
2138                 'Your Python is broken! Update to a newer and supported version')
2139
2140         stdout_encoding = getattr(
2141             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
2142         encoding_str = (
2143             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
2144                 locale.getpreferredencoding(),
2145                 sys.getfilesystemencoding(),
2146                 stdout_encoding,
2147                 self.get_encoding()))
2148         write_string(encoding_str, encoding=None)
2149
2150         self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
2151         if _LAZY_LOADER:
2152             self._write_string('[debug] Lazy loading extractors enabled' + '\n')
2153         try:
2154             sp = subprocess.Popen(
2155                 ['git', 'rev-parse', '--short', 'HEAD'],
2156                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
2157                 cwd=os.path.dirname(os.path.abspath(__file__)))
2158             out, err = sp.communicate()
2159             out = out.decode().strip()
2160             if re.match('[0-9a-f]+', out):
2161                 self._write_string('[debug] Git HEAD: ' + out + '\n')
2162         except Exception:
2163             try:
2164                 sys.exc_clear()
2165             except Exception:
2166                 pass
2167         self._write_string('[debug] Python version %s - %s\n' % (
2168             platform.python_version(), platform_name()))
2169
2170         exe_versions = FFmpegPostProcessor.get_versions(self)
2171         exe_versions['rtmpdump'] = rtmpdump_version()
2172         exe_str = ', '.join(
2173             '%s %s' % (exe, v)
2174             for exe, v in sorted(exe_versions.items())
2175             if v
2176         )
2177         if not exe_str:
2178             exe_str = 'none'
2179         self._write_string('[debug] exe versions: %s\n' % exe_str)
2180
2181         proxy_map = {}
2182         for handler in self._opener.handlers:
2183             if hasattr(handler, 'proxies'):
2184                 proxy_map.update(handler.proxies)
2185         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
2186
2187         if self.params.get('call_home', False):
2188             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
2189             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
2190             latest_version = self.urlopen(
2191                 'https://yt-dl.org/latest/version').read().decode('utf-8')
2192             if version_tuple(latest_version) > version_tuple(__version__):
2193                 self.report_warning(
2194                     'You are using an outdated version (newest version: %s)! '
2195                     'See https://yt-dl.org/update if you need help updating.' %
2196                     latest_version)
2197
2198     def _setup_opener(self):
2199         timeout_val = self.params.get('socket_timeout')
2200         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2201
2202         opts_cookiefile = self.params.get('cookiefile')
2203         opts_proxy = self.params.get('proxy')
2204
2205         if opts_cookiefile is None:
2206             self.cookiejar = compat_cookiejar.CookieJar()
2207         else:
2208             opts_cookiefile = expand_path(opts_cookiefile)
2209             self.cookiejar = compat_cookiejar.MozillaCookieJar(
2210                 opts_cookiefile)
2211             if os.access(opts_cookiefile, os.R_OK):
2212                 self.cookiejar.load()
2213
2214         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2215         if opts_proxy is not None:
2216             if opts_proxy == '':
2217                 proxies = {}
2218             else:
2219                 proxies = {'http': opts_proxy, 'https': opts_proxy}
2220         else:
2221             proxies = compat_urllib_request.getproxies()
2222             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
2223             if 'http' in proxies and 'https' not in proxies:
2224                 proxies['https'] = proxies['http']
2225         proxy_handler = PerRequestProxyHandler(proxies)
2226
2227         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2228         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2229         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2230         data_handler = compat_urllib_request_DataHandler()
2231
2232         # When passing our own FileHandler instance, build_opener won't add the
2233         # default FileHandler and allows us to disable the file protocol, which
2234         # can be used for malicious purposes (see
2235         # https://github.com/rg3/youtube-dl/issues/8227)
2236         file_handler = compat_urllib_request.FileHandler()
2237
2238         def file_open(*args, **kwargs):
2239             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2240         file_handler.file_open = file_open
2241
2242         opener = compat_urllib_request.build_opener(
2243             proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2244
2245         # Delete the default user-agent header, which would otherwise apply in
2246         # cases where our custom HTTP handler doesn't come into play
2247         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
2248         opener.addheaders = []
2249         self._opener = opener
2250
2251     def encode(self, s):
2252         if isinstance(s, bytes):
2253             return s  # Already encoded
2254
2255         try:
2256             return s.encode(self.get_encoding())
2257         except UnicodeEncodeError as err:
2258             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2259             raise
2260
2261     def get_encoding(self):
2262         encoding = self.params.get('encoding')
2263         if encoding is None:
2264             encoding = preferredencoding()
2265         return encoding
2266
2267     def _write_thumbnails(self, info_dict, filename):
2268         if self.params.get('writethumbnail', False):
2269             thumbnails = info_dict.get('thumbnails')
2270             if thumbnails:
2271                 thumbnails = [thumbnails[-1]]
2272         elif self.params.get('write_all_thumbnails', False):
2273             thumbnails = info_dict.get('thumbnails')
2274         else:
2275             return
2276
2277         if not thumbnails:
2278             # No thumbnails present, so return immediately
2279             return
2280
2281         for t in thumbnails:
2282             thumb_ext = determine_ext(t['url'], 'jpg')
2283             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2284             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2285             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2286
2287             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2288                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2289                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2290             else:
2291                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2292                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2293                 try:
2294                     uf = self.urlopen(t['url'])
2295                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2296                         shutil.copyfileobj(uf, thumbf)
2297                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2298                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2299                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2300                     self.report_warning('Unable to download thumbnail "%s": %s' %
2301                                         (t['url'], error_to_compat_str(err)))