2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
28 compat_urllib_request,
49 UnavailableVideoError,
56 from .extractor import get_info_extractor, gen_extractors
57 from .downloader import get_suitable_downloader
58 from .PostProcessor import FFmpegMergerPP
59 from .version import __version__
62 class YoutubeDL(object):
65 YoutubeDL objects are the ones responsible of downloading the
66 actual video file and writing it to disk if the user has requested
67 it, among some other tasks. In most cases there should be one per
68 program. As, given a video URL, the downloader doesn't know how to
69 extract all the needed information, task that InfoExtractors do, it
70 has to pass the URL to one of them.
72 For this, YoutubeDL objects have a method that allows
73 InfoExtractors to be registered in a given order. When it is passed
74 a URL, the YoutubeDL object handles it to the first InfoExtractor it
75 finds that reports being able to handle it. The InfoExtractor extracts
76 all the information about the video or videos the URL refers to, and
77 YoutubeDL process the extracted information, possibly using a File
78 Downloader to download the video.
80 YoutubeDL objects accept a lot of parameters. In order not to saturate
81 the object constructor with arguments, it receives a dictionary of
82 options instead. These options are available through the params
83 attribute for the InfoExtractors to use. The YoutubeDL also
84 registers itself as the downloader in charge for the InfoExtractors
85 that are added to it, so this is a "mutual registration".
89 username: Username for authentication purposes.
90 password: Password for authentication purposes.
91 videopassword:   Password for accessing a video.
92 usenetrc: Use netrc for authentication instead.
93 verbose: Print additional info to stdout.
94 quiet: Do not print messages to stdout.
95 forceurl: Force printing final URL.
96 forcetitle: Force printing title.
97 forceid: Force printing ID.
98 forcethumbnail: Force printing thumbnail URL.
99 forcedescription: Force printing description.
100 forcefilename: Force printing final filename.
101 forceduration: Force printing duration.
102 forcejson: Force printing info_dict as JSON.
103 simulate: Do not download the video files.
104 format: Video format code.
105 format_limit: Highest quality format to try.
106 outtmpl: Template for output names.
107 restrictfilenames: Do not allow "&" and spaces in file names
108 ignoreerrors: Do not stop on download errors.
109 nooverwrites: Prevent overwriting files.
110 playliststart: Playlist item to start at.
111 playlistend: Playlist item to end at.
112 matchtitle: Download only matching titles.
113 rejecttitle: Reject downloads for matching titles.
114 logger: Log messages to a logging.Logger instance.
115 logtostderr: Log messages to stderr instead of stdout.
116 writedescription: Write the video description to a .description file
117 writeinfojson: Write the video description to a .info.json file
118 writeannotations: Write the video annotations to a .annotations.xml file
119 writethumbnail: Write the thumbnail image to a file
120 writesubtitles: Write the video subtitles to a file
121 writeautomaticsub: Write the automatic subtitles to a file
122 allsubtitles: Downloads all the subtitles of the video
123 (requires writesubtitles or writeautomaticsub)
124 listsubtitles: Lists all available subtitles for the video
125 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
126 subtitleslangs: List of languages of the subtitles to download
127 keepvideo: Keep the video file after post-processing
128 daterange: A DateRange object, download only if the upload_date is in the range.
129 skip_download: Skip the actual download of the video file
130 cachedir: Location of the cache files in the filesystem.
131 None to disable filesystem cache.
132 noplaylist: Download single video instead of a playlist if in doubt.
133 age_limit: An integer representing the user's age in years.
134 Unsuitable videos for the given age are skipped.
135 min_views: An integer representing the minimum view count the video
136 must have in order to not be skipped.
137 Videos without view count information are always
138 downloaded. None for no limit.
139 max_views: An integer representing the maximum view count.
140 Videos that are more popular than that are not
142 Videos without view count information are always
143 downloaded. None for no limit.
144 download_archive: File name of a file where all downloads are recorded.
145 Videos already present in the file are not downloaded
147 cookiefile: File name where cookies should be read from and dumped to.
148 nocheckcertificate:Do not verify SSL certificates
149 proxy: URL of the proxy server to use
150 socket_timeout: Time to wait for unresponsive hosts, in seconds
151 bidi_workaround: Work around buggy terminals without bidirectional text
152 support, using fribidi
153 debug_printtraffic:Print out sent and received HTTP traffic
155 The following parameters are not used by YoutubeDL itself, they are used by
157 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
158 noresizebuffer, retries, continuedl, noprogress, consoletitle
164 _download_retcode = None
165 _num_downloads = None
168 def __init__(self, params=None):
169 """Create a FileDownloader object with the given options."""
173 self._ies_instances = {}
175 self._progress_hooks = []
176 self._download_retcode = 0
177 self._num_downloads = 0
178 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
179 self._err_file = sys.stderr
182 if params.get('bidi_workaround', False):
185 master, slave = pty.openpty()
186 width = get_term_width()
190 width_args = ['-w', str(width)]
192 stdin=subprocess.PIPE,
194 stderr=self._err_file)
196 self._output_process = subprocess.Popen(
197 ['bidiv'] + width_args, **sp_kwargs
200 self._output_process = subprocess.Popen(
201 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
202 self._output_channel = os.fdopen(master, 'rb')
203 except OSError as ose:
205 self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
209 if (sys.version_info >= (3,) and sys.platform != 'win32' and
210 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
211 and not params['restrictfilenames']):
212 # On Python 3, the Unicode filesystem API will throw errors (#1474)
214 u'Assuming --restrict-filenames since file system encoding '
215 u'cannot encode all charactes. '
216 u'Set the LC_ALL environment variable to fix this.')
217 self.params['restrictfilenames'] = True
219 if '%(stitle)s' in self.params.get('outtmpl', ''):
220 self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
224 def add_info_extractor(self, ie):
225 """Add an InfoExtractor object to the end of the list."""
227 self._ies_instances[ie.ie_key()] = ie
228 ie.set_downloader(self)
230 def get_info_extractor(self, ie_key):
232 Get an instance of an IE with name ie_key, it will try to get one from
233 the _ies list, if there's no instance it will create a new one and add
234 it to the extractor list.
236 ie = self._ies_instances.get(ie_key)
238 ie = get_info_extractor(ie_key)()
239 self.add_info_extractor(ie)
242 def add_default_info_extractors(self):
244 Add the InfoExtractors returned by gen_extractors to the end of the list
246 for ie in gen_extractors():
247 self.add_info_extractor(ie)
249 def add_post_processor(self, pp):
250 """Add a PostProcessor object to the end of the chain."""
252 pp.set_downloader(self)
254 def add_progress_hook(self, ph):
255 """Add the progress hook (currently only for the file downloader)"""
256 self._progress_hooks.append(ph)
258 def _bidi_workaround(self, message):
259 if not hasattr(self, '_output_channel'):
262 assert hasattr(self, '_output_process')
263 assert type(message) == type(u'')
264 line_count = message.count(u'\n') + 1
265 self._output_process.stdin.write((message + u'\n').encode('utf-8'))
266 self._output_process.stdin.flush()
267 res = u''.join(self._output_channel.readline().decode('utf-8')
268 for _ in range(line_count))
269 return res[:-len(u'\n')]
271 def to_screen(self, message, skip_eol=False):
272 """Print message to stdout if not in quiet mode."""
273 return self.to_stdout(message, skip_eol, check_quiet=True)
275 def to_stdout(self, message, skip_eol=False, check_quiet=False):
276 """Print message to stdout if not in quiet mode."""
277 if self.params.get('logger'):
278 self.params['logger'].debug(message)
279 elif not check_quiet or not self.params.get('quiet', False):
280 message = self._bidi_workaround(message)
281 terminator = [u'\n', u''][skip_eol]
282 output = message + terminator
284 write_string(output, self._screen_file)
286 def to_stderr(self, message):
287 """Print message to stderr."""
288 assert type(message) == type(u'')
289 if self.params.get('logger'):
290 self.params['logger'].error(message)
292 message = self._bidi_workaround(message)
293 output = message + u'\n'
294 write_string(output, self._err_file)
296 def to_console_title(self, message):
297 if not self.params.get('consoletitle', False):
299 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
300 # c_wchar_p() might not be necessary if `message` is
301 # already of type unicode()
302 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
303 elif 'TERM' in os.environ:
304 write_string(u'\033]0;%s\007' % message, self._screen_file)
306 def save_console_title(self):
307 if not self.params.get('consoletitle', False):
309 if 'TERM' in os.environ:
310 # Save the title on stack
311 write_string(u'\033[22;0t', self._screen_file)
313 def restore_console_title(self):
314 if not self.params.get('consoletitle', False):
316 if 'TERM' in os.environ:
317 # Restore the title from stack
318 write_string(u'\033[23;0t', self._screen_file)
321 self.save_console_title()
324 def __exit__(self, *args):
325 self.restore_console_title()
327 if self.params.get('cookiefile') is not None:
328 self.cookiejar.save()
330 def trouble(self, message=None, tb=None):
331 """Determine action to take when a download problem appears.
333 Depending on if the downloader has been configured to ignore
334 download errors or not, this method may throw an exception or
335 not when errors are found, after printing the message.
337 tb, if given, is additional traceback information.
339 if message is not None:
340 self.to_stderr(message)
341 if self.params.get('verbose'):
343 if sys.exc_info()[0]: # if .trouble has been called from an except block
345 if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
346 tb += u''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
347 tb += compat_str(traceback.format_exc())
349 tb_data = traceback.format_list(traceback.extract_stack())
350 tb = u''.join(tb_data)
352 if not self.params.get('ignoreerrors', False):
353 if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
354 exc_info = sys.exc_info()[1].exc_info
356 exc_info = sys.exc_info()
357 raise DownloadError(message, exc_info)
358 self._download_retcode = 1
360 def report_warning(self, message):
362 Print the message to stderr, it will be prefixed with 'WARNING:'
363 If stderr is a tty file the 'WARNING:' will be colored
365 if self._err_file.isatty() and os.name != 'nt':
366 _msg_header = u'\033[0;33mWARNING:\033[0m'
368 _msg_header = u'WARNING:'
369 warning_message = u'%s %s' % (_msg_header, message)
370 self.to_stderr(warning_message)
372 def report_error(self, message, tb=None):
374 Do the same as trouble, but prefixes the message with 'ERROR:', colored
375 in red if stderr is a tty file.
377 if self._err_file.isatty() and os.name != 'nt':
378 _msg_header = u'\033[0;31mERROR:\033[0m'
380 _msg_header = u'ERROR:'
381 error_message = u'%s %s' % (_msg_header, message)
382 self.trouble(error_message, tb)
384 def report_file_already_downloaded(self, file_name):
385 """Report file has already been fully downloaded."""
387 self.to_screen(u'[download] %s has already been downloaded' % file_name)
388 except UnicodeEncodeError:
389 self.to_screen(u'[download] The file has already been downloaded')
391 def increment_downloads(self):
392 """Increment the ordinal that assigns a number to each file."""
393 self._num_downloads += 1
395 def prepare_filename(self, info_dict):
396 """Generate the output filename."""
398 template_dict = dict(info_dict)
400 template_dict['epoch'] = int(time.time())
401 autonumber_size = self.params.get('autonumber_size')
402 if autonumber_size is None:
404 autonumber_templ = u'%0' + str(autonumber_size) + u'd'
405 template_dict['autonumber'] = autonumber_templ % self._num_downloads
406 if template_dict.get('playlist_index') is not None:
407 template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
409 sanitize = lambda k, v: sanitize_filename(
411 restricted=self.params.get('restrictfilenames'),
413 template_dict = dict((k, sanitize(k, v))
414 for k, v in template_dict.items()
416 template_dict = collections.defaultdict(lambda: u'NA', template_dict)
418 tmpl = os.path.expanduser(self.params['outtmpl'])
419 filename = tmpl % template_dict
421 except ValueError as err:
422 self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
425 def _match_entry(self, info_dict):
426 """ Returns None iff the file should be downloaded """
428 video_title = info_dict.get('title', info_dict.get('id', u'video'))
429 if 'title' in info_dict:
430 # This can happen when we're just evaluating the playlist
431 title = info_dict['title']
432 matchtitle = self.params.get('matchtitle', False)
434 if not re.search(matchtitle, title, re.IGNORECASE):
435 return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
436 rejecttitle = self.params.get('rejecttitle', False)
438 if re.search(rejecttitle, title, re.IGNORECASE):
439 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
440 date = info_dict.get('upload_date', None)
442 dateRange = self.params.get('daterange', DateRange())
443 if date not in dateRange:
444 return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
445 view_count = info_dict.get('view_count', None)
446 if view_count is not None:
447 min_views = self.params.get('min_views')
448 if min_views is not None and view_count < min_views:
449 return u'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
450 max_views = self.params.get('max_views')
451 if max_views is not None and view_count > max_views:
452 return u'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
453 age_limit = self.params.get('age_limit')
454 if age_limit is not None:
455 if age_limit < info_dict.get('age_limit', 0):
456 return u'Skipping "' + title + '" because it is age restricted'
457 if self.in_download_archive(info_dict):
458 return u'%s has already been recorded in archive' % video_title
462 def add_extra_info(info_dict, extra_info):
463 '''Set the keys from extra_info in info dict if they are missing'''
464 for key, value in extra_info.items():
465 info_dict.setdefault(key, value)
467 def extract_info(self, url, download=True, ie_key=None, extra_info={},
470 Returns a list with a dictionary for each video we find.
471 If 'download', also downloads the videos.
472 extra_info is a dict containing the extra values to add to each result
476 ies = [self.get_info_extractor(ie_key)]
481 if not ie.suitable(url):
485 self.report_warning(u'The program functionality for this site has been marked as broken, '
486 u'and will probably not work.')
489 ie_result = ie.extract(url)
490 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
492 if isinstance(ie_result, list):
493 # Backwards compatibility: old IE result format
495 '_type': 'compat_list',
496 'entries': ie_result,
498 self.add_extra_info(ie_result,
500 'extractor': ie.IE_NAME,
502 'webpage_url_basename': url_basename(url),
503 'extractor_key': ie.ie_key(),
506 return self.process_ie_result(ie_result, download, extra_info)
509 except ExtractorError as de: # An error we somewhat expected
510 self.report_error(compat_str(de), de.format_traceback())
512 except Exception as e:
513 if self.params.get('ignoreerrors', False):
514 self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
519 self.report_error(u'no suitable InfoExtractor: %s' % url)
521 def process_ie_result(self, ie_result, download=True, extra_info={}):
523 Take the result of the ie(may be modified) and resolve all unresolved
524 references (URLs, playlist items).
526 It will also download the videos if 'download'.
527 Returns the resolved ie_result.
530 result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
531 if result_type == 'video':
532 self.add_extra_info(ie_result, extra_info)
533 return self.process_video_result(ie_result, download=download)
534 elif result_type == 'url':
535 # We have to add extra_info to the results because it may be
536 # contained in a playlist
537 return self.extract_info(ie_result['url'],
539 ie_key=ie_result.get('ie_key'),
540 extra_info=extra_info)
541 elif result_type == 'url_transparent':
542 # Use the information from the embedding page
543 info = self.extract_info(
544 ie_result['url'], ie_key=ie_result.get('ie_key'),
545 extra_info=extra_info, download=False, process=False)
547 def make_result(embedded_info):
548 new_result = ie_result.copy()
549 for f in ('_type', 'url', 'ext', 'player_url', 'formats',
550 'entries', 'ie_key', 'duration',
551 'subtitles', 'annotations', 'format',
552 'thumbnail', 'thumbnails'):
555 if f in embedded_info:
556 new_result[f] = embedded_info[f]
558 new_result = make_result(info)
560 assert new_result.get('_type') != 'url_transparent'
561 if new_result.get('_type') == 'compat_list':
562 new_result['entries'] = [
563 make_result(e) for e in new_result['entries']]
565 return self.process_ie_result(
566 new_result, download=download, extra_info=extra_info)
567 elif result_type == 'playlist':
568 # We process each entry in the playlist
569 playlist = ie_result.get('title', None) or ie_result.get('id', None)
570 self.to_screen(u'[download] Downloading playlist: %s' % playlist)
572 playlist_results = []
574 n_all_entries = len(ie_result['entries'])
575 playliststart = self.params.get('playliststart', 1) - 1
576 playlistend = self.params.get('playlistend', None)
577 # For backwards compatibility, interpret -1 as whole list
578 if playlistend == -1:
581 entries = ie_result['entries'][playliststart:playlistend]
582 n_entries = len(entries)
585 u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
586 (ie_result['extractor'], playlist, n_all_entries, n_entries))
588 for i, entry in enumerate(entries, 1):
589 self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
591 'playlist': playlist,
592 'playlist_index': i + playliststart,
593 'extractor': ie_result['extractor'],
594 'webpage_url': ie_result['webpage_url'],
595 'webpage_url_basename': url_basename(ie_result['webpage_url']),
596 'extractor_key': ie_result['extractor_key'],
599 reason = self._match_entry(entry)
600 if reason is not None:
601 self.to_screen(u'[download] ' + reason)
604 entry_result = self.process_ie_result(entry,
607 playlist_results.append(entry_result)
608 ie_result['entries'] = playlist_results
610 elif result_type == 'compat_list':
612 self.add_extra_info(r,
614 'extractor': ie_result['extractor'],
615 'webpage_url': ie_result['webpage_url'],
616 'webpage_url_basename': url_basename(ie_result['webpage_url']),
617 'extractor_key': ie_result['extractor_key'],
620 ie_result['entries'] = [
621 self.process_ie_result(_fixup(r), download, extra_info)
622 for r in ie_result['entries']
626 raise Exception('Invalid result type: %s' % result_type)
628 def select_format(self, format_spec, available_formats):
629 if format_spec == 'best' or format_spec is None:
630 return available_formats[-1]
631 elif format_spec == 'worst':
632 return available_formats[0]
634 extensions = [u'mp4', u'flv', u'webm', u'3gp']
635 if format_spec in extensions:
636 filter_f = lambda f: f['ext'] == format_spec
638 filter_f = lambda f: f['format_id'] == format_spec
639 matches = list(filter(filter_f, available_formats))
644 def process_video_result(self, info_dict, download=True):
645 assert info_dict.get('_type', 'video') == 'video'
647 if 'playlist' not in info_dict:
648 # It isn't part of a playlist
649 info_dict['playlist'] = None
650 info_dict['playlist_index'] = None
652 # This extractors handle format selection themselves
653 if info_dict['extractor'] in [u'Youku']:
655 self.process_info(info_dict)
658 # We now pick which formats have to be downloaded
659 if info_dict.get('formats') is None:
660 # There's only one format available
661 formats = [info_dict]
663 formats = info_dict['formats']
665 # We check that all the formats have the format and format_id fields
666 for (i, format) in enumerate(formats):
667 if format.get('format_id') is None:
668 format['format_id'] = compat_str(i)
669 if format.get('format') is None:
670 format['format'] = u'{id} - {res}{note}'.format(
671 id=format['format_id'],
672 res=self.format_resolution(format),
673 note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
675 # Automatically determine file extension if missing
676 if 'ext' not in format:
677 format['ext'] = determine_ext(format['url'])
679 format_limit = self.params.get('format_limit', None)
681 formats = list(takewhile_inclusive(
682 lambda f: f['format_id'] != format_limit, formats
685 # TODO Central sorting goes here
687 if formats[0] is not info_dict:
688 # only set the 'formats' fields if the original info_dict list them
689 # otherwise we end up with a circular reference, the first (and unique)
690 # element in the 'formats' field in info_dict is info_dict itself,
691 # wich can't be exported to json
692 info_dict['formats'] = formats
693 if self.params.get('listformats', None):
694 self.list_formats(info_dict)
697 req_format = self.params.get('format', 'best')
698 if req_format is None:
700 formats_to_download = []
701 # The -1 is for supporting YoutubeIE
702 if req_format in ('-1', 'all'):
703 formats_to_download = formats
705 # We can accept formats requestd in the format: 34/5/best, we pick
706 # the first that is available, starting from left
707 req_formats = req_format.split('/')
708 for rf in req_formats:
709 if re.match(r'.+?\+.+?', rf) is not None:
710 # Two formats have been requested like '137+139'
711 format_1, format_2 = rf.split('+')
712 formats_info = (self.select_format(format_1, formats),
713 self.select_format(format_2, formats))
714 if all(formats_info):
715 selected_format = {'requested_formats': formats_info}
717 selected_format = None
719 selected_format = self.select_format(rf, formats)
720 if selected_format is not None:
721 formats_to_download = [selected_format]
723 if not formats_to_download:
724 raise ExtractorError(u'requested format not available',
728 if len(formats_to_download) > 1:
729 self.to_screen(u'[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
730 for format in formats_to_download:
731 new_info = dict(info_dict)
732 new_info.update(format)
733 self.process_info(new_info)
734 # We update the info dict with the best quality format (backwards compatibility)
735 info_dict.update(formats_to_download[-1])
738 def process_info(self, info_dict):
739 """Process a single resolved IE result."""
741 assert info_dict.get('_type', 'video') == 'video'
742 #We increment the download the download count here to match the previous behaviour.
743 self.increment_downloads()
745 info_dict['fulltitle'] = info_dict['title']
746 if len(info_dict['title']) > 200:
747 info_dict['title'] = info_dict['title'][:197] + u'...'
749 # Keep for backwards compatibility
750 info_dict['stitle'] = info_dict['title']
752 if not 'format' in info_dict:
753 info_dict['format'] = info_dict['ext']
755 reason = self._match_entry(info_dict)
756 if reason is not None:
757 self.to_screen(u'[download] ' + reason)
760 max_downloads = self.params.get('max_downloads')
761 if max_downloads is not None:
762 if self._num_downloads > int(max_downloads):
763 raise MaxDownloadsReached()
765 filename = self.prepare_filename(info_dict)
768 if self.params.get('forcetitle', False):
769 self.to_stdout(info_dict['fulltitle'])
770 if self.params.get('forceid', False):
771 self.to_stdout(info_dict['id'])
772 if self.params.get('forceurl', False):
773 # For RTMP URLs, also include the playpath
774 self.to_stdout(info_dict['url'] + info_dict.get('play_path', u''))
775 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
776 self.to_stdout(info_dict['thumbnail'])
777 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
778 self.to_stdout(info_dict['description'])
779 if self.params.get('forcefilename', False) and filename is not None:
780 self.to_stdout(filename)
781 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
782 self.to_stdout(formatSeconds(info_dict['duration']))
783 if self.params.get('forceformat', False):
784 self.to_stdout(info_dict['format'])
785 if self.params.get('forcejson', False):
786 info_dict['_filename'] = filename
787 self.to_stdout(json.dumps(info_dict))
789 # Do nothing else if in simulate mode
790 if self.params.get('simulate', False):
797 dn = os.path.dirname(encodeFilename(filename))
798 if dn != '' and not os.path.exists(dn):
800 except (OSError, IOError) as err:
801 self.report_error(u'unable to create directory ' + compat_str(err))
804 if self.params.get('writedescription', False):
805 descfn = filename + u'.description'
806 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
807 self.to_screen(u'[info] Video description is already present')
810 self.to_screen(u'[info] Writing video description to: ' + descfn)
811 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
812 descfile.write(info_dict['description'])
813 except (KeyError, TypeError):
814 self.report_warning(u'There\'s no description to write.')
815 except (OSError, IOError):
816 self.report_error(u'Cannot write description file ' + descfn)
819 if self.params.get('writeannotations', False):
820 annofn = filename + u'.annotations.xml'
821 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
822 self.to_screen(u'[info] Video annotations are already present')
825 self.to_screen(u'[info] Writing video annotations to: ' + annofn)
826 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
827 annofile.write(info_dict['annotations'])
828 except (KeyError, TypeError):
829 self.report_warning(u'There are no annotations to write.')
830 except (OSError, IOError):
831 self.report_error(u'Cannot write annotations file: ' + annofn)
834 subtitles_are_requested = any([self.params.get('writesubtitles', False),
835 self.params.get('writeautomaticsub')])
837 if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
838 # subtitles download errors are already managed as troubles in relevant IE
839 # that way it will silently go on when used with unsupporting IE
840 subtitles = info_dict['subtitles']
841 sub_format = self.params.get('subtitlesformat', 'srt')
842 for sub_lang in subtitles.keys():
843 sub = subtitles[sub_lang]
847 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
848 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
849 self.to_screen(u'[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
851 self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
852 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
854 except (OSError, IOError):
855 self.report_error(u'Cannot write subtitles file ' + descfn)
858 if self.params.get('writeinfojson', False):
859 infofn = os.path.splitext(filename)[0] + u'.info.json'
860 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
861 self.to_screen(u'[info] Video description metadata is already present')
863 self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn)
865 write_json_file(info_dict, encodeFilename(infofn))
866 except (OSError, IOError):
867 self.report_error(u'Cannot write metadata to JSON file ' + infofn)
870 if self.params.get('writethumbnail', False):
871 if info_dict.get('thumbnail') is not None:
872 thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
873 thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format
874 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
875 self.to_screen(u'[%s] %s: Thumbnail is already present' %
876 (info_dict['extractor'], info_dict['id']))
878 self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
879 (info_dict['extractor'], info_dict['id']))
881 uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
882 with open(thumb_filename, 'wb') as thumbf:
883 shutil.copyfileobj(uf, thumbf)
884 self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
885 (info_dict['extractor'], info_dict['id'], thumb_filename))
886 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
887 self.report_warning(u'Unable to download thumbnail "%s": %s' %
888 (info_dict['thumbnail'], compat_str(err)))
890 if not self.params.get('skip_download', False):
891 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
896 fd = get_suitable_downloader(info)(self, self.params)
897 for ph in self._progress_hooks:
898 fd.add_progress_hook(ph)
899 return fd.download(name, info)
900 if info_dict.get('requested_formats') is not None:
903 for f in info_dict['requested_formats']:
904 new_info = dict(info_dict)
906 fname = self.prepare_filename(new_info)
907 fname = prepend_extension(fname, 'f%s' % f['format_id'])
908 downloaded.append(fname)
909 partial_success = dl(fname, new_info)
910 success = success and partial_success
911 info_dict['__postprocessors'] = [FFmpegMergerPP(self)]
912 info_dict['__files_to_merge'] = downloaded
915 success = dl(filename, info_dict)
916 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
917 self.report_error(u'unable to download video data: %s' % str(err))
919 except (OSError, IOError) as err:
920 raise UnavailableVideoError(err)
921 except (ContentTooShortError, ) as err:
922 self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
927 self.post_process(filename, info_dict)
928 except (PostProcessingError) as err:
929 self.report_error(u'postprocessing: %s' % str(err))
932 self.record_download_archive(info_dict)
934 def download(self, url_list):
935 """Download a given list of URLs."""
936 if (len(url_list) > 1 and
937 '%' not in self.params['outtmpl']
938 and self.params.get('max_downloads') != 1):
939 raise SameFileError(self.params['outtmpl'])
943 #It also downloads the videos
944 self.extract_info(url)
945 except UnavailableVideoError:
946 self.report_error(u'unable to download video')
947 except MaxDownloadsReached:
948 self.to_screen(u'[info] Maximum number of downloaded files reached.')
951 return self._download_retcode
# Re-run processing/download from a previously dumped .info.json file.
953 def download_with_info_file(self, info_filename):
954 with io.open(info_filename, 'r', encoding='utf-8') as f:
# NOTE(review): numbering jumps 954 -> 957 — the line loading `info` from
# the JSON file and the try: line are elided from this excerpt.
957 self.process_ie_result(info, download=True)
958 except DownloadError:
# If processing the saved info fails, fall back to re-extracting from the
# original webpage URL when it was recorded in the info dict.
959 webpage_url = info.get('webpage_url')
960 if webpage_url is not None:
961 self.report_warning(u'The info failed to download, trying with "%s"' % webpage_url)
962 return self.download([webpage_url])
# NOTE(review): lines 963-964 (presumably an else: re-raise) are elided.
965 return self._download_retcode
967 def post_process(self, filename, ie_info):
968 """Run all the postprocessors on the given file."""
# NOTE(review): numbering jumps (968 -> 970, 975 -> 978, ...) — the copy of
# ie_info into `info`, the pps_chain initialiser, the per-pp loop header
# and its try: line are elided from this excerpt.
970 info['filepath'] = filename
# The chain is the per-download postprocessors (e.g. the merger stored under
# '__postprocessors' in process_info) followed by the globally registered ones.
973 if ie_info.get('__postprocessors') is not None:
974 pps_chain.extend(ie_info['__postprocessors'])
975 pps_chain.extend(self._pps)
# Each postprocessor returns (keep_video_wish, new_info).
978 keep_video_wish, new_info = pp.run(info)
979 if keep_video_wish is not None:
981 keep_video = keep_video_wish
982 elif keep_video is None:
983 # No clear decision yet, let IE decide
984 keep_video = keep_video_wish
985 except PostProcessingError as e:
986 self.report_error(e.msg)
# Delete the intermediate file unless the user asked to keep it (-k) or no
# postprocessor explicitly wanted it deleted.
987 if keep_video is False and not self.params.get('keepvideo', False):
989 self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
990 os.remove(encodeFilename(filename))
# Best-effort removal: failure is only a warning.
991 except (IOError, OSError):
992 self.report_warning(u'Unable to remove downloaded video file')
994 def _make_archive_id(self, info_dict):
995 # Future-proof against any change in case
996 # and backwards compatibility with prior versions
997 extractor = info_dict.get('extractor_key')
998 if extractor is None:
999 if 'id' in info_dict:
1000 extractor = info_dict.get('ie_key') # key in a playlist
1001 if extractor is None:
1002 return None # Incomplete video information
1003 return extractor.lower() + u' ' + info_dict['id']
# Return whether this video is already recorded in the --download-archive
# file (used to skip previously downloaded videos).
1005 def in_download_archive(self, info_dict):
1006 fn = self.params.get('download_archive')
# NOTE(review): numbering jumps (1006 -> 1010, 1010 -> 1012, 1017 -> 1019,
# and the method ends at 1020 while the next def is 1024) — the no-archive
# guard, the vid_id None check's if-line, the try: line, the `return True`
# on match and the final raise/return are elided from this excerpt.
1010 vid_id = self._make_archive_id(info_dict)
1012 return False # Incomplete video information
# The archive is a plain text file with one "<extractor> <id>" entry per
# line; read it through locked_file and compare stripped lines.
1015 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1016 for line in archive_file:
1017 if line.strip() == vid_id:
# A missing archive file (ENOENT) just means nothing was downloaded yet;
# other I/O errors are presumably re-raised — confirm (the raise is elided).
1019 except IOError as ioe:
1020 if ioe.errno != errno.ENOENT:
# Append this video's archive id to the --download-archive file so that
# in_download_archive() finds it on the next run.
1024 def record_download_archive(self, info_dict):
1025 fn = self.params.get('download_archive')
# NOTE(review): numbering jumps 1025 -> 1028 and 1028 -> 1030 — the
# no-archive guard and the vid_id assertion are elided from this excerpt.
1028 vid_id = self._make_archive_id(info_dict)
# locked_file presumably serializes concurrent writers — confirm; one
# "<extractor> <id>" entry is appended per line.
1030 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1031 archive_file.write(vid_id + u'\n')
# Build a human-readable resolution string for a format dict
# ("WxH", "<height>p", "?x<width>", or the extractor-provided 'resolution').
# NOTE(review): several lines are elided here (numbering jumps 1035 -> 1037,
# 1041 -> 1043, 1045 -> 1050), including the vcodec=='none' return, the
# else: lines and the default/final return; the decorator line (1033, this
# is presumably a @staticmethod — `format` has no self) is elided too.
# `format` shadows the builtin of the same name — kept as-is here.
1034 def format_resolution(format, default='unknown'):
1035 if format.get('vcodec') == 'none':
# Prefer an explicit 'resolution' field when the extractor provided one.
1037 if format.get('resolution') is not None:
1038 return format['resolution']
1039 if format.get('height') is not None:
1040 if format.get('width') is not None:
1041 res = u'%sx%s' % (format['width'], format['height'])
# Height only: conventional "<height>p" notation.
1043 res = u'%sp' % format['height']
# Width only: unknown height.
1044 elif format.get('width') is not None:
1045 res = u'?x%d' % format['width']
# Print the table of available formats for --list-formats.
# NOTE(review): this excerpt elides a number of lines (numbering jumps
# 1051 -> 1053, 1062 -> 1064, 1066 -> 1069, ...), e.g. the `res = u''`
# initialisers, some else: branches and the closing parentheses of line().
1050 def list_formats(self, info_dict):
# Nested helper: human-readable "note" column (codec names, bitrates,
# filesize, extractor-provided format_note).
1051 def format_note(fdict):
1053 if fdict.get('ext') in ['f4f', 'f4m']:
1054 res += u'(unsupported) '
1055 if fdict.get('format_note') is not None:
1056 res += fdict['format_note'] + u' '
1057 if fdict.get('tbr') is not None:
1058 res += u'%4dk ' % fdict['tbr']
# Video part: codec name when known, otherwise fall back to bitrates.
1059 if (fdict.get('vcodec') is not None and
1060 fdict.get('vcodec') != 'none'):
1061 res += u'%-5s@' % fdict['vcodec']
1062 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1064 if fdict.get('vbr') is not None:
1065 res += u'%4dk' % fdict['vbr']
# Audio part: codec and/or audio bitrate.
1066 if fdict.get('acodec') is not None:
1069 res += u'%-5s' % fdict['acodec']
1070 elif fdict.get('abr') is not None:
1074 if fdict.get('abr') is not None:
1075 res += u'@%3dk' % fdict['abr']
1076 if fdict.get('filesize') is not None:
1079 res += format_bytes(fdict['filesize'])
# Nested helper: one table row; idlen pads the format-id column so all
# rows line up with the widest id.
1082 def line(format, idlen=20):
1083 return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
1084 format['format_id'],
1086 self.format_resolution(format),
1087 format_note(format),
# With no 'formats' list, treat the info_dict itself as the single format.
1090 formats = info_dict.get('formats', [info_dict])
1091 idlen = max(len(u'format code'),
1092 max(len(f['format_id']) for f in formats))
1093 formats_s = [line(f, idlen) for f in formats]
# Rows are ordered worst-first: tag the first row "(worst)" and the last
# "(best)", separated by a space only when the note column is non-empty.
1094 if len(formats) > 1:
1095 formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
1096 formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
1098 header_line = line({
1099 'format_id': u'format code', 'ext': u'extension',
1100 'resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
1101 self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
1102 (info_dict['id'], header_line, u"\n".join(formats_s)))
def urlopen(self, req):
    """Open *req* through this instance's configured URL opener.

    All HTTP(S) traffic goes through self._opener (set up in
    _setup_opener: cookies, proxies, HTTPS options).
    """
    opener = self._opener
    return opener.open(req)
# Emit verbose debugging info (version, git commit, Python/platform, proxy
# map) for bug reports. No-op unless --verbose was given.
1108 def print_debug_header(self):
1109 if not self.params.get('verbose'):
# NOTE(review): numbering jumps 1109 -> 1111 and 1111 -> 1113 — the early
# return and the try: guarding the git call are elided from this excerpt;
# lines 1121-1125 (presumably the matching except handlers) are elided too.
1111 write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
# Best-effort: ask git for the current short commit hash when running from
# a source checkout (cwd pinned to this package's directory).
1113 sp = subprocess.Popen(
1114 ['git', 'rev-parse', '--short', 'HEAD'],
1115 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1116 cwd=os.path.dirname(os.path.abspath(__file__)))
1117 out, err = sp.communicate()
1118 out = out.decode().strip()
# Only print when the output looks like a hex commit id.
1119 if re.match('[0-9a-f]+', out):
1120 write_string(u'[debug] Git HEAD: ' + out + u'\n')
1126 write_string(u'[debug] Python version %s - %s' %
1127 (platform.python_version(), platform_name()) + u'\n')
# Collect the effective proxy configuration from the opener's handlers
# (the proxy_map initialiser, lines 1128-1129, is elided from this excerpt).
1130 for handler in self._opener.handlers:
1131 if hasattr(handler, 'proxies'):
1132 proxy_map.update(handler.proxies)
1133 write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
# Build the urllib opener used for all HTTP(S) requests: cookie handling,
# proxy selection, HTTPS certificate checking and debug traffic printing.
# NOTE(review): several lines are elided in this excerpt (numbering jumps
# 1137 -> 1139, 1143 -> 1145, 1148 -> 1150, 1153 -> 1156, ...), e.g. the
# else: branches and the arguments of MozillaCookieJar/HTTPCookieProcessor.
1135 def _setup_opener(self):
1136 timeout_val = self.params.get('socket_timeout')
# Default socket timeout is 600 seconds when --socket-timeout is not given.
1137 timeout = 600 if timeout_val is None else float(timeout_val)
1139 opts_cookiefile = self.params.get('cookiefile')
1140 opts_proxy = self.params.get('proxy')
# Without --cookies use an in-memory jar; otherwise a Mozilla-format
# cookie file, loaded only if it is readable.
1142 if opts_cookiefile is None:
1143 self.cookiejar = compat_cookiejar.CookieJar()
1145 self.cookiejar = compat_cookiejar.MozillaCookieJar(
1147 if os.access(opts_cookiefile, os.R_OK):
1148 self.cookiejar.load()
1150 cookie_processor = compat_urllib_request.HTTPCookieProcessor(
# --proxy overrides the environment; an empty string presumably disables
# proxying entirely (the branch body at 1154-1155 is elided — confirm).
1152 if opts_proxy is not None:
1153 if opts_proxy == '':
1156 proxies = {'http': opts_proxy, 'https': opts_proxy}
# No --proxy: fall back to the environment (http_proxy etc.).
1158 proxies = compat_urllib_request.getproxies()
1159 # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1160 if 'http' in proxies and 'https' not in proxies:
1161 proxies['https'] = proxies['http']
1162 proxy_handler = compat_urllib_request.ProxyHandler(proxies)
# --print-traffic turns on handler-level debug output.
1164 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1165 https_handler = make_HTTPS_handler(
1166 self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
1167 ydlh = YoutubeDLHandler(debuglevel=debuglevel)
1168 opener = compat_urllib_request.build_opener(
1169 https_handler, proxy_handler, cookie_processor, ydlh)
1170 # Delete the default user-agent header, which would otherwise apply in
1171 # cases where our custom HTTP handler doesn't come into play
1172 # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1173 opener.addheaders = []
1174 self._opener = opener
1176 # TODO remove this global modification
1177 compat_urllib_request.install_opener(opener)
1178 socket.setdefaulttimeout(timeout)