2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
17 from .extractor import get_info_extractor, gen_extractors
18 from .FileDownloader import FileDownloader
21 class YoutubeDL(object):
24 YoutubeDL objects are the ones responsible for downloading the
25 actual video file and writing it to disk if the user has requested
26 it, among some other tasks. In most cases there should be one per
27 program. As, given a video URL, the downloader doesn't know how to
28 extract all the needed information, task that InfoExtractors do, it
29 has to pass the URL to one of them.
31 For this, YoutubeDL objects have a method that allows
32 InfoExtractors to be registered in a given order. When it is passed
33 a URL, the YoutubeDL object hands it to the first InfoExtractor it
34 finds that reports being able to handle it. The InfoExtractor extracts
35 all the information about the video or videos the URL refers to, and
36 YoutubeDL processes the extracted information, possibly using a File
37 Downloader to download the video.
39 YoutubeDL objects accept a lot of parameters. In order not to saturate
40 the object constructor with arguments, it receives a dictionary of
41 options instead. These options are available through the params
42 attribute for the InfoExtractors to use. The YoutubeDL also
43 registers itself as the downloader in charge for the InfoExtractors
44 that are added to it, so this is a "mutual registration".
48 username: Username for authentication purposes.
49 password: Password for authentication purposes.
50 videopassword: Password for accessing a video.
51 usenetrc: Use netrc for authentication instead.
52 verbose: Print additional info to stdout.
53 quiet: Do not print messages to stdout.
54 forceurl: Force printing final URL.
55 forcetitle: Force printing title.
56 forceid: Force printing ID.
57 forcethumbnail: Force printing thumbnail URL.
58 forcedescription: Force printing description.
59 forcefilename: Force printing final filename.
60 simulate: Do not download the video files.
61 format: Video format code.
62 format_limit: Highest quality format to try.
63 outtmpl: Template for output names.
64 restrictfilenames: Do not allow "&" and spaces in file names
65 ignoreerrors: Do not stop on download errors.
66 nooverwrites: Prevent overwriting files.
67 playliststart: Playlist item to start at.
68 playlistend: Playlist item to end at.
69 matchtitle: Download only matching titles.
70 rejecttitle: Reject downloads for matching titles.
71 logtostderr: Log messages to stderr instead of stdout.
72 writedescription: Write the video description to a .description file
73 writeinfojson: Write the video description to a .info.json file
74 writethumbnail: Write the thumbnail image to a file
75 writesubtitles: Write the video subtitles to a file
76 writeautomaticsub: Write the automatic subtitles to a file
77 allsubtitles: Downloads all the subtitles of the video
78 (requires writesubtitles or writeautomaticsub)
79 listsubtitles: Lists all available subtitles for the video
80 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
81 subtitleslangs: List of languages of the subtitles to download
82 keepvideo: Keep the video file after post-processing
83 daterange: A DateRange object, download only if the upload_date is in the range.
84 skip_download: Skip the actual download of the video file
85 cachedir: Location of the cache files in the filesystem.
86 None to disable filesystem cache.
87 noplaylist: Download single video instead of a playlist if in doubt.
88 age_limit: An integer representing the user's age in years.
89 Unsuitable videos for the given age are skipped.
90 download_archive: File name of a file where all downloads are recorded.
91 Videos already present in the file are not downloaded
94 The following parameters are not used by YoutubeDL itself, they are used by
96 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
97 noresizebuffer, retries, continuedl, noprogress, consoletitle
103 _download_retcode = None
104 _num_downloads = None
def __init__(self, params):
    """Create a YoutubeDL object with the given options.

    params is the options dictionary described in the class docstring.
    It is stored as self.params without being copied and may be modified
    in place: 'restrictfilenames' is forced to True when the file system
    encoding cannot represent arbitrary file names.
    """
    self._ies = []
    self._ies_instances = {}
    self._pps = []
    self._progress_hooks = []
    self._download_retcode = 0
    self._num_downloads = 0
    # Pick the stream explicitly instead of indexing a list with the flag,
    # so that any truthy/falsy option value (including None) is accepted.
    if params.get('logtostderr', False):
        self._screen_file = sys.stderr
    else:
        self._screen_file = sys.stdout

    # .get() because 'restrictfilenames' is optional; a missing key must
    # not abort the construction with a KeyError.
    if (sys.version_info >= (3,) and sys.platform != 'win32' and
            sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
            and not params.get('restrictfilenames', False)):
        # On Python 3, the Unicode filesystem API will throw errors (#1474)
        self.report_warning(
            u'Assuming --restrict-filenames since file system encoding '
            u'cannot encode all charactes. '
            u'Set the LC_ALL environment variable to fix this.')
        params['restrictfilenames'] = True

    self.params = params
    self.fd = FileDownloader(self, self.params)

    if '%(stitle)s' in self.params['outtmpl']:
        self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
def add_info_extractor(self, ie):
    """Register ie: append it to the extractor list and index it by key.

    The extractor is also told that this object is its downloader.
    """
    key = ie.ie_key()
    self._ies.append(ie)
    self._ies_instances[key] = ie
    ie.set_downloader(self)
def get_info_extractor(self, ie_key):
    """
    Return the IE instance registered under ie_key.

    When no instance is registered yet, a new one is created, added to
    the extractor list and returned.
    """
    known = self._ies_instances.get(ie_key)
    if known is not None:
        return known
    # The module-level get_info_extractor returns the IE class for the key
    created = get_info_extractor(ie_key)()
    self.add_info_extractor(created)
    return created
def add_default_info_extractors(self):
    """
    Register every InfoExtractor produced by gen_extractors, in order.
    """
    register = self.add_info_extractor
    for extractor in gen_extractors():
        register(extractor)
def add_post_processor(self, pp):
    """Append pp to the post-processing chain and make self its downloader."""
    chain = self._pps
    chain.append(pp)
    pp.set_downloader(self)
def to_screen(self, message, skip_eol=False):
    """Write message to the screen file unless the 'quiet' option is set.

    A newline is appended unless skip_eol is true.
    """
    if self.params.get('quiet', False):
        return
    line_end = (u'\n', u'')[skip_eol]
    write_string(message + line_end, self._screen_file)
def to_stderr(self, message):
    """Print message, followed by a newline, to stderr.

    message must be a unicode string. It is encoded with the preferred
    encoding only when stderr expects bytes.
    """
    assert type(message) == type(u'')
    output = message + u'\n'
    # Inspect the stream that is actually written to (sys.stderr), not the
    # screen file, which may be stdout. Python 2 lies about the mode of
    # sys.stdout/sys.stderr, so bytes are always produced there.
    if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3:
        output = output.encode(preferredencoding())
    sys.stderr.write(output)
def fixed_template(self):
    """Return True when the output template has no %(...)s placeholder."""
    placeholder = re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl'])
    return placeholder is None
def trouble(self, message=None, tb=None):
    """Decide what to do when a download problem is reported.

    The message, if given, is printed to stderr first. In verbose mode a
    traceback is printed as well: tb when supplied, otherwise one built
    from the exception currently being handled or, when there is none,
    from the current call stack.

    Unless the 'ignoreerrors' option is set, a DownloadError carrying the
    message and the exception info is raised. Otherwise the return code
    is set to 1 and the method returns normally.
    """
    if message is not None:
        self.to_stderr(message)

    if self.params.get('verbose'):
        if tb is None:
            if not sys.exc_info()[0]:
                # Not called from an except block: show the call stack
                tb = u''.join(traceback.format_list(traceback.extract_stack()))
            else:
                current = sys.exc_info()[1]
                pieces = []
                # Some exceptions carry the exc_info of their original cause
                if hasattr(current, 'exc_info') and current.exc_info[0]:
                    pieces.append(u''.join(traceback.format_exception(*current.exc_info)))
                pieces.append(compat_str(traceback.format_exc()))
                tb = u''.join(pieces)
        self.to_stderr(tb)

    if self.params.get('ignoreerrors', False):
        self._download_retcode = 1
        return

    if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
        # Prefer the wrapped (original) exception info when available
        exc_info = sys.exc_info()[1].exc_info
    else:
        exc_info = sys.exc_info()
    raise DownloadError(message, exc_info)
def report_warning(self, message):
    """
    Print message to stderr prefixed with 'WARNING:'.
    The prefix is wrapped in color escape codes when stderr is a tty and
    the platform is not 'nt'.
    """
    colorize = sys.stderr.isatty() and os.name != 'nt'
    prefix = u'\033[0;33mWARNING:\033[0m' if colorize else u'WARNING:'
    self.to_stderr(u'%s %s' % (prefix, message))
def report_error(self, message, tb=None):
    """
    Prefix message with 'ERROR:' and pass it, together with tb, to trouble.
    The prefix is wrapped in color escape codes when stderr is a tty and
    the platform is not 'nt'.
    """
    colorize = sys.stderr.isatty() and os.name != 'nt'
    prefix = u'\033[0;31mERROR:\033[0m' if colorize else u'ERROR:'
    self.trouble(u'%s %s' % (prefix, message), tb)
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit.

    start_time is the time.time() value at which the transfer started and
    byte_counter the number of bytes received since then. Nothing happens
    when no rate limit is configured, when the limit is 0, when no bytes
    were transferred, or when no time has elapsed yet.
    """
    rate_limit = self.params.get('ratelimit', None)
    # A limit of 0 is treated like "no limit": it would otherwise lead to
    # a division by zero below.
    if not rate_limit or byte_counter == 0:
        return
    now = time.time()
    elapsed = now - start_time
    if elapsed <= 0.0:
        # Avoid dividing by zero (or a negative interval after a clock change)
        return
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep just long enough for the average speed to drop to the limit
        time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)
def report_writedescription(self, descfn):
    """ Announce that the description is being written to descfn """
    notice = u'[info] Writing video description to: ' + descfn
    self.to_screen(notice)
def report_writesubtitles(self, sub_filename):
    """ Announce that the subtitles are being written to sub_filename """
    notice = u'[info] Writing video subtitles to: ' + sub_filename
    self.to_screen(notice)
def report_writeinfojson(self, infofn):
    """ Announce the JSON metadata file infofn """
    notice = u'[info] Video description metadata as JSON to: ' + infofn
    self.to_screen(notice)
def report_file_already_downloaded(self, file_name):
    """Tell the user that file_name was already fully downloaded.

    Falls back to a message without the file name when printing the name
    raises UnicodeEncodeError.
    """
    try:
        notice = u'[download] %s has already been downloaded' % file_name
        self.to_screen(notice)
    except UnicodeEncodeError:
        self.to_screen(u'[download] The file has already been downloaded')
def increment_downloads(self):
    """Advance the ordinal that numbers each processed file by one."""
    self._num_downloads = self._num_downloads + 1
def prepare_filename(self, info_dict):
    """Generate the output filename.

    The 'outtmpl' option is expanded with the values of info_dict plus
    'epoch' (current time) and 'autonumber' (download ordinal, zero
    padded to 'autonumber_size' digits, 5 by default). Every value is
    passed through sanitize_filename; None values become u'NA'.

    Returns the file name, or None after reporting an error when the
    template cannot be expanded.
    """
    try:
        template_dict = dict(info_dict)

        template_dict['epoch'] = int(time.time())
        autonumber_size = self.params.get('autonumber_size')
        if autonumber_size is None:
            autonumber_size = 5
        autonumber_templ = u'%0' + str(autonumber_size) + u'd'
        template_dict['autonumber'] = autonumber_templ % self._num_downloads
        # .get(): a missing playlist_index must not be reported as an
        # erroneous template through the KeyError handler below.
        if template_dict.get('playlist_index') is not None:
            template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']

        restricted = self.params.get('restrictfilenames')
        template_dict = dict(
            (k, sanitize_filename(u'NA' if v is None else compat_str(v),
                                  restricted=restricted,
                                  is_id=(k == u'id')))
            for k, v in template_dict.items())

        return self.params['outtmpl'] % template_dict
    except KeyError:
        self.report_error(u'Erroneous output template')
        return None
    except ValueError as err:
        self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
        return None
301 def _match_entry(self, info_dict):
302 """ Returns None iff the file should be downloaded """
304 title = info_dict['title']
305 matchtitle = self.params.get('matchtitle', False)
307 if not re.search(matchtitle, title, re.IGNORECASE):
308 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
309 rejecttitle = self.params.get('rejecttitle', False)
311 if re.search(rejecttitle, title, re.IGNORECASE):
312 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
313 date = info_dict.get('upload_date', None)
315 dateRange = self.params.get('daterange', DateRange())
316 if date not in dateRange:
317 return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
318 age_limit = self.params.get('age_limit')
319 if age_limit is not None:
320 if age_limit < info_dict.get('age_limit', 0):
321 return u'Skipping "' + title + '" because it is age restricted'
322 if self.in_download_archive(info_dict):
323 return (u'%(title)s has already been recorded in archive'
def extract_info(self, url, download=True, ie_key=None, extra_info=None):
    """
    Extract the information for url with the first suitable InfoExtractor.

    If 'download', also downloads the videos.
    ie_key, when given, selects the extractor to use instead of trying
    the registered ones in order.
    extra_info is a dict containing the extra values to add to each result.

    Returns what process_ie_result returns for the extracted result, or
    None when nothing was extracted (no suitable extractor, the extractor
    returned None, or an error was reported).
    """
    if extra_info is None:
        extra_info = {}

    if ie_key:
        ies = [self.get_info_extractor(ie_key)]
    else:
        ies = self._ies

    for ie in ies:
        if not ie.suitable(url):
            continue

        if not ie.working():
            self.report_warning(u'The program functionality for this site has been marked as broken, '
                                u'and will probably not work.')

        try:
            ie_result = ie.extract(url)
            if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                break
            if isinstance(ie_result, list):
                # Backwards compatibility: old IE result format
                for result in ie_result:
                    result.update(extra_info)
                ie_result = {
                    '_type': 'compat_list',
                    'entries': ie_result,
                }
            else:
                ie_result.update(extra_info)
            if 'extractor' not in ie_result:
                ie_result['extractor'] = ie.IE_NAME
            return self.process_ie_result(ie_result, download=download)
        except ExtractorError as de:  # An error we somewhat expected
            self.report_error(compat_str(de), de.format_traceback())
            break
        except Exception as e:
            if self.params.get('ignoreerrors', False):
                self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                break
            else:
                raise
    else:
        # The loop ended without any extractor accepting the URL
        self.report_error(u'no suitable InfoExtractor: %s' % url)
def process_ie_result(self, ie_result, download=True, extra_info=None):
    """
    Take the result of the ie(may be modified) and resolve all unresolved
    references (URLs, playlist items).

    It will also download the videos if 'download'.
    extra_info is a dict with extra values to add to video results.
    Returns the resolved ie_result.
    Raises Exception for an unknown '_type'.
    """
    if extra_info is None:
        extra_info = {}

    result_type = ie_result.get('_type', 'video')  # If not given we suppose it's a video, support the default old system
    if result_type == 'video':
        ie_result.update(extra_info)
        # Forward the flag; otherwise download=False would be ignored
        return self.process_video_result(ie_result, download=download)
    elif result_type == 'url':
        # We have to add extra_info to the results because it may be
        # contained in a playlist
        return self.extract_info(ie_result['url'],
                                 download,
                                 ie_key=ie_result.get('ie_key'),
                                 extra_info=extra_info)
    elif result_type == 'playlist':
        # We process each entry in the playlist
        playlist = ie_result.get('title', None) or ie_result.get('id', None)
        self.to_screen(u'[download] Downloading playlist: %s' % playlist)

        playlist_results = []

        n_all_entries = len(ie_result['entries'])
        playliststart = self.params.get('playliststart', 1) - 1
        playlistend = self.params.get('playlistend', -1)

        if playlistend == -1:
            entries = ie_result['entries'][playliststart:]
        else:
            entries = ie_result['entries'][playliststart:playlistend]

        n_entries = len(entries)

        self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                       (ie_result['extractor'], playlist, n_all_entries, n_entries))

        for i, entry in enumerate(entries, 1):
            self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
            extra = {
                'playlist': playlist,
                'playlist_index': i + playliststart,
            }
            if 'extractor' not in entry:
                # We set the extractor, if it's an url it will be set then to
                # the new extractor, but if it's already a video we must make
                # sure it's present: see issue #877
                entry['extractor'] = ie_result['extractor']
            entry_result = self.process_ie_result(entry,
                                                  download=download,
                                                  extra_info=extra)
            playlist_results.append(entry_result)
        ie_result['entries'] = playlist_results
        return ie_result
    elif result_type == 'compat_list':
        def _fixup(r):
            r.setdefault('extractor', ie_result['extractor'])
            return r
        ie_result['entries'] = [
            self.process_ie_result(_fixup(r), download=download)
            for r in ie_result['entries']
        ]
        return ie_result
    else:
        raise Exception('Invalid result type: %s' % result_type)
def process_video_result(self, info_dict, download=True):
    """Select the format(s) of a video result and process them.

    info_dict is a result of type 'video'. It is modified in place:
    playlist fields are set to None when absent, every format gets a
    'format' and a 'format_id', and the fields of the best selected
    format are merged in (backwards compatibility).

    With the 'listformats' option the available formats are printed and
    None is returned. Otherwise the formats chosen by the 'format' option
    ('best', 'worst', 'all'/'-1' or a '/'-separated preference list) are
    handed to process_info if 'download', and info_dict is returned.

    Raises ExtractorError when no requested format is available.
    """
    assert info_dict.get('_type', 'video') == 'video'

    if 'playlist' not in info_dict:
        # It isn't part of a playlist
        info_dict['playlist'] = None
        info_dict['playlist_index'] = None

    # We now pick which formats have to be downloaded
    if info_dict.get('formats') is None:
        # There's only one format available
        formats = [info_dict]
    else:
        formats = info_dict['formats']

    # We check that all the formats have the format and format_id fields
    for i, fmt in enumerate(formats):
        if fmt.get('format') is None:
            fmt['format'] = compat_str(i)
        if fmt.get('format_id') is None:
            fmt['format_id'] = compat_str(i)

    if self.params.get('listformats', None):
        self.list_formats(info_dict)
        return

    format_limit = self.params.get('format_limit', None)
    if format_limit:
        formats = [f for f in formats if f['format_id'] <= format_limit]

    req_format = self.params.get('format', 'best')
    formats_to_download = []
    # Slices instead of indexing: an empty list of formats then ends in
    # the ExtractorError below rather than in an IndexError.
    if req_format == 'best' or req_format is None:
        formats_to_download = formats[-1:]
    elif req_format == 'worst':
        formats_to_download = formats[:1]
    # The -1 is for supporting YoutubeIE
    elif req_format in ('-1', 'all'):
        formats_to_download = formats
    else:
        # We can accept formats requested in the format: 34/10/5, we pick
        # the first that is available, starting from left
        for rf in req_format.split('/'):
            # A list, not filter(): on Python 3 filter() returns an
            # iterator that is always true and cannot be indexed.
            matches = [f for f in formats if f['format_id'] == rf]
            if matches:
                formats_to_download = [matches[0]]
                break
    if not formats_to_download:
        raise ExtractorError(u'requested format not available')

    if download:
        if len(formats_to_download) > 1:
            self.to_screen(u'[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
        for fmt in formats_to_download:
            new_info = dict(info_dict)
            new_info.update(fmt)
            self.process_info(new_info)
    # We update the info dict with the best quality format (backwards compatibility)
    info_dict.update(formats_to_download[-1])
    return info_dict
def process_info(self, info_dict):
    """Process a single resolved IE result.

    info_dict is a result of type 'video' with its format already
    selected. The method counts the download, normalizes the title
    fields, applies the match filters and the 'max_downloads' limit,
    prints the values requested by the force* options and, unless in
    simulate mode, writes the requested side files (description,
    subtitles, info JSON, thumbnail), downloads the video, runs the
    postprocessors and records the video in the download archive.

    Raises MaxDownloadsReached when the limit has been exceeded and
    UnavailableVideoError when the download fails with an OS/IO error.
    """
    assert info_dict.get('_type', 'video') == 'video'
    # The download count is incremented here to match the previous behaviour.
    self.increment_downloads()

    info_dict['fulltitle'] = info_dict['title']
    if len(info_dict['title']) > 200:
        info_dict['title'] = info_dict['title'][:197] + u'...'

    # Keep for backwards compatibility
    info_dict['stitle'] = info_dict['title']

    if 'format' not in info_dict:
        info_dict['format'] = info_dict['ext']

    reason = self._match_entry(info_dict)
    if reason is not None:
        self.to_screen(u'[download] ' + reason)
        return

    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads > int(max_downloads):
            raise MaxDownloadsReached()

    filename = self.prepare_filename(info_dict)

    # Forced printings
    if self.params.get('forcetitle', False):
        compat_print(info_dict['title'])
    if self.params.get('forceid', False):
        compat_print(info_dict['id'])
    if self.params.get('forceurl', False):
        # For RTMP URLs, also include the playpath
        compat_print(info_dict['url'] + info_dict.get('play_path', u''))
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        compat_print(info_dict['thumbnail'])
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        compat_print(info_dict['description'])
    if self.params.get('forcefilename', False) and filename is not None:
        compat_print(filename)
    if self.params.get('forceformat', False):
        compat_print(info_dict['format'])

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        return

    if filename is None:
        return

    try:
        dn = os.path.dirname(encodeFilename(filename))
        if dn != '' and not os.path.exists(dn):
            os.makedirs(dn)
    except (OSError, IOError) as err:
        self.report_error(u'unable to create directory ' + compat_str(err))
        return

    if self.params.get('writedescription', False):
        descfn = filename + u'.description'
        try:
            self.report_writedescription(descfn)
            with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                descfile.write(info_dict['description'])
        except (KeyError, TypeError):
            self.report_warning(u'There\'s no description to write.')
        except (OSError, IOError):
            self.report_error(u'Cannot write description file ' + descfn)
            return

    subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                   self.params.get('writeautomaticsub')])

    if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
        # subtitles download errors are already managed as troubles in relevant IE
        # that way it will silently go on when used with unsupporting IE
        subtitles = info_dict['subtitles']
        sub_format = self.params.get('subtitlesformat')
        for sub_lang in subtitles.keys():
            sub = subtitles[sub_lang]
            if sub is None:
                continue
            sub_filename = subtitles_filename(filename, sub_lang, sub_format)
            try:
                self.report_writesubtitles(sub_filename)
                with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                    subfile.write(sub)
            except (OSError, IOError):
                # Report the subtitles file, not the description file
                self.report_error(u'Cannot write subtitles file ' + sub_filename)
                return

    if self.params.get('writeinfojson', False):
        infofn = filename + u'.info.json'
        self.report_writeinfojson(infofn)
        try:
            json_info_dict = dict((k, v) for k, v in info_dict.items() if k not in ['urlhandle'])
            write_json_file(json_info_dict, encodeFilename(infofn))
        except (OSError, IOError):
            self.report_error(u'Cannot write metadata to JSON file ' + infofn)
            return

    if self.params.get('writethumbnail', False):
        if info_dict.get('thumbnail') is not None:
            thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
            thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
            self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                           (info_dict['extractor'], info_dict['id']))
            try:
                uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
                try:
                    # encodeFilename, like every other file written here
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                finally:
                    # Do not leak the connection
                    uf.close()
                self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
                               (info_dict['extractor'], info_dict['id'], thumb_filename))
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_warning(u'Unable to download thumbnail "%s": %s' %
                                    (info_dict['thumbnail'], compat_str(err)))

    if not self.params.get('skip_download', False):
        if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
            success = True
        else:
            try:
                success = self.fd._do_download(filename, info_dict)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_error(u'unable to download video data: %s' % str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError) as err:
                self.report_error(u'postprocessing: %s' % str(err))
                return

    self.record_download_archive(info_dict)
def download(self, url_list):
    """Process every URL of url_list and return the resulting exit code.

    Raises SameFileError when several URLs would be written to one fixed
    output file. MaxDownloadsReached is announced and re-raised.
    """
    several_urls = len(url_list) > 1
    if several_urls and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    for url in url_list:
        try:
            # Extracting the information also downloads the videos
            self.extract_info(url)
        except UnavailableVideoError:
            self.report_error(u'unable to download video')
        except MaxDownloadsReached:
            self.to_screen(u'[info] Maximum number of downloaded files reached.')
            raise

    return self._download_retcode
def post_process(self, filename, ie_info):
    """Run every registered postprocessor on the downloaded file.

    Each postprocessor receives a copy of ie_info extended with
    'filepath'. Their wishes about keeping the original file are
    combined: a wish to keep always wins, a wish to delete only counts
    while nothing has been decided yet. The file is removed when the
    outcome is False and the 'keepvideo' option is not set.
    """
    info = dict(ie_info)
    info['filepath'] = filename

    keep_video = None
    for pp in self._pps:
        try:
            wish, new_info = pp.run(info)
        except PostProcessingError as e:
            self.report_error(e.msg)
            continue
        if wish is None:
            continue
        if wish or keep_video is None:
            # Either an explicit keep, or the first decision made so far
            keep_video = wish

    if keep_video is not False or self.params.get('keepvideo', False):
        return
    try:
        self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
        os.remove(encodeFilename(filename))
    except (IOError, OSError):
        self.report_warning(u'Unable to remove downloaded video file')
def in_download_archive(self, info_dict):
    """Return True when the video is listed in the download archive.

    The archive is the file named by the 'download_archive' option; it
    holds one '<extractor> <id>' entry per line. A missing option or a
    missing file means the video is not archived.
    """
    archive_path = self.params.get('download_archive')
    if archive_path is None:
        return False

    wanted = info_dict['extractor'] + u' ' + info_dict['id']
    try:
        with locked_file(archive_path, 'r', encoding='utf-8') as archive_file:
            if any(entry.strip() == wanted for entry in archive_file):
                return True
    except IOError as ioe:
        # A missing archive file simply means nothing was recorded yet
        if ioe.errno != errno.ENOENT:
            raise
    return False
def record_download_archive(self, info_dict):
    """Append the '<extractor> <id>' entry of the video to the archive.

    Does nothing when the 'download_archive' option is not set.
    """
    archive_path = self.params.get('download_archive')
    if archive_path is None:
        return
    entry = info_dict['extractor'] + u' ' + info_dict['id']
    with locked_file(archive_path, 'a', encoding='utf-8') as archive_file:
        archive_file.write(entry + u'\n')
def list_formats(self, info_dict):
    """Print the formats available for the video described by info_dict.

    One line per format is printed with its format id, extension and
    format description. When 'formats' is missing or None the info_dict
    itself is treated as the only format. With several formats the first
    one is marked as worst and the last one as best.
    """
    formats = info_dict.get('formats')
    if formats is None:
        # Single-format result: same convention as process_video_result
        formats = [info_dict]
    lines = [
        "%s\t:\t%s\t[%s]" % (fmt['format_id'],
                             fmt['ext'],
                             fmt.get('format', '???'))
        for fmt in formats
    ]
    # "> 1" rather than "!= 1" so that an empty list is not indexed
    if len(lines) > 1:
        lines[0] += ' (worst)'
        lines[-1] += ' (best)'
    formats_s = "\n".join(lines)
    self.to_screen(u"[info] Available formats for %s:\nformat code\textension\n%s" % (info_dict['id'], formats_s))