2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
17 from .extractor import get_info_extractor, gen_extractors
18 from .FileDownloader import FileDownloader
21 class YoutubeDL(object):
24 YoutubeDL objects are the ones responsible for downloading the
25 actual video file and writing it to disk if the user has requested
26 it, among some other tasks. In most cases there should be one per
27 program. As, given a video URL, the downloader doesn't know how to
28 extract all the needed information, task that InfoExtractors do, it
29 has to pass the URL to one of them.
31 For this, YoutubeDL objects have a method that allows
32 InfoExtractors to be registered in a given order. When it is passed
33 a URL, the YoutubeDL object hands it to the first InfoExtractor it
34 finds that reports being able to handle it. The InfoExtractor extracts
35 all the information about the video or videos the URL refers to, and
36 YoutubeDL processes the extracted information, possibly using a File
37 Downloader to download the video.
39 YoutubeDL objects accept a lot of parameters. In order not to saturate
40 the object constructor with arguments, it receives a dictionary of
41 options instead. These options are available through the params
42 attribute for the InfoExtractors to use. The YoutubeDL also
43 registers itself as the downloader in charge for the InfoExtractors
44 that are added to it, so this is a "mutual registration".
48 username: Username for authentication purposes.
49 password: Password for authentication purposes.
50 videopassword: Password for accessing a video.
51 usenetrc: Use netrc for authentication instead.
52 verbose: Print additional info to stdout.
53 quiet: Do not print messages to stdout.
54 forceurl: Force printing final URL.
55 forcetitle: Force printing title.
56 forceid: Force printing ID.
57 forcethumbnail: Force printing thumbnail URL.
58 forcedescription: Force printing description.
59 forcefilename: Force printing final filename.
60 simulate: Do not download the video files.
61 format: Video format code.
62 format_limit: Highest quality format to try.
63 outtmpl: Template for output names.
64 restrictfilenames: Do not allow "&" and spaces in file names
65 ignoreerrors: Do not stop on download errors.
66 nooverwrites: Prevent overwriting files.
67 playliststart: Playlist item to start at.
68 playlistend: Playlist item to end at.
69 matchtitle: Download only matching titles.
70 rejecttitle: Reject downloads for matching titles.
71 logtostderr: Log messages to stderr instead of stdout.
72 writedescription: Write the video description to a .description file
73 writeinfojson: Write the video metadata to a .info.json file
74 writethumbnail: Write the thumbnail image to a file
75 writesubtitles: Write the video subtitles to a file
76 writeautomaticsub: Write the automatic subtitles to a file
77 allsubtitles: Downloads all the subtitles of the video
78 (requires writesubtitles or writeautomaticsub)
79 listsubtitles: Lists all available subtitles for the video
80 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
81 subtitleslangs: List of languages of the subtitles to download
82 keepvideo: Keep the video file after post-processing
83 daterange: A DateRange object, download only if the upload_date is in the range.
84 skip_download: Skip the actual download of the video file
85 cachedir: Location of the cache files in the filesystem.
86 None to disable filesystem cache.
87 noplaylist: Download single video instead of a playlist if in doubt.
88 download_archive: File name of a file where all downloads are recorded.
89 Videos already present in the file are not downloaded
92 The following parameters are not used by YoutubeDL itself; they are used by the FileDownloader:
94 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
95 noresizebuffer, retries, continuedl, noprogress, consoletitle
101 _download_retcode = None
102 _num_downloads = None
def __init__(self, params):
    """Create a YoutubeDL object with the given options.

    params is the options dictionary. It is stored as self.params and
    passed on to the FileDownloader created here. Note that it may be
    modified: 'restrictfilenames' is forced to True when running on
    Python 3 (non-Windows) with an ASCII-only file system encoding.
    """
    # NOTE(review): the _ies/_pps list names are restored from their use as
    # "the list" / "the chain" elsewhere in the class -- confirm.
    self._ies = []
    self._ies_instances = {}
    self._pps = []
    self._progress_hooks = []
    self._download_retcode = 0
    self._num_downloads = 0
    # A conditional expression accepts any truthy/falsy option value,
    # unlike indexing a two-element list with it.
    if params.get('logtostderr', False):
        self._screen_file = sys.stderr
    else:
        self._screen_file = sys.stdout

    # .get() so that callers that never set 'restrictfilenames' do not
    # crash with a KeyError here.
    if (sys.version_info >= (3,) and sys.platform != 'win32' and
            sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
            and not params.get('restrictfilenames', False)):
        # On Python 3, the Unicode filesystem API will throw errors (#1474)
        self.report_warning(
            u'Assuming --restrict-filenames since file system encoding '
            u'cannot encode all characters. '
            u'Set the LC_ALL environment variable to fix this.')
        params['restrictfilenames'] = True

    self.params = params
    self.fd = FileDownloader(self, self.params)

    if '%(stitle)s' in self.params['outtmpl']:
        self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list.

    The extractor is also indexed by its ie_key() so that
    get_info_extractor can reuse it, and this object is registered as
    its downloader.
    """
    self._ies.append(ie)
    self._ies_instances[ie.ie_key()] = ie
    ie.set_downloader(self)
def get_info_extractor(self, ie_key):
    """
    Get an instance of an IE with name ie_key.

    An already registered instance is reused; if there is none, a new one
    is created, added to the extractor list and returned.
    """
    ie = self._ies_instances.get(ie_key)
    if ie is None:
        ie = get_info_extractor(ie_key)()
        self.add_info_extractor(ie)
    return ie
def add_default_info_extractors(self):
    """
    Register every InfoExtractor produced by gen_extractors, in order.
    """
    register = self.add_info_extractor
    for extractor in gen_extractors():
        register(extractor)
def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain.

    This object is also registered as the post-processor's downloader.
    """
    self._pps.append(pp)
    pp.set_downloader(self)
def to_screen(self, message, skip_eol=False):
    """Write message to the screen file unless the 'quiet' option is set.

    A newline is appended unless skip_eol is true.
    """
    if self.params.get('quiet', False):
        return
    line_end = [u'\n', u''][skip_eol]
    write_string(message + line_end, self._screen_file)
def to_stderr(self, message):
    """Write message, followed by a newline, to sys.stderr.

    message must be a unicode string. The line is encoded with
    preferredencoding() when the screen file reports a binary mode or
    when running on Python 2.
    """
    assert type(message) == type(u'')
    line = message + u'\n'
    screen_mode = getattr(self._screen_file, 'mode', '')
    # Python 2 lies about the mode of sys.stdout/sys.stderr
    if 'b' in screen_mode or sys.version_info[0] < 3:
        line = line.encode(preferredencoding())
    sys.stderr.write(line)
def fixed_template(self):
    """Return True if the output template contains no %(...)s field."""
    template_field = re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl'])
    return template_field is None
def trouble(self, message=None, tb=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.

    tb, if given, is additional traceback information. In verbose mode,
    when tb is not given, it is built from the exception being handled
    (if any) or from the current call stack.

    Raises DownloadError unless the 'ignoreerrors' option is set, in
    which case the return code is set to 1 instead.
    """
    if message is not None:
        self.to_stderr(message)

    # Take a single snapshot of the exception being handled (if any).
    exc_info = sys.exc_info()
    # Some exceptions carry the exc_info of the error that caused them.
    nested_exc_info = None
    if exc_info[0]:
        candidate = getattr(exc_info[1], 'exc_info', None)
        if candidate and candidate[0]:
            nested_exc_info = candidate

    if self.params.get('verbose'):
        if tb is None:
            if exc_info[0]:  # if .trouble has been called from an except block
                tb = u''
                if nested_exc_info:
                    tb += u''.join(traceback.format_exception(*nested_exc_info))
                tb += compat_str(traceback.format_exc())
            else:
                tb_data = traceback.format_list(traceback.extract_stack())
                tb = u''.join(tb_data)
        self.to_stderr(tb)

    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message, nested_exc_info or exc_info)
    self._download_retcode = 1
def report_warning(self, message):
    """
    Print the message to stderr, prefixed with 'WARNING:'.

    The prefix is colored when stderr is a tty and the OS is not Windows.
    """
    if sys.stderr.isatty() and os.name != 'nt':
        _msg_header = u'\033[0;33mWARNING:\033[0m'
    else:
        _msg_header = u'WARNING:'
    warning_message = u'%s %s' % (_msg_header, message)
    self.to_stderr(warning_message)
def report_error(self, message, tb=None):
    """
    Do the same as trouble, but prefix the message with 'ERROR:'.

    The prefix is colored in red when stderr is a tty and the OS is not
    Windows. tb is passed through to trouble unchanged.
    """
    if sys.stderr.isatty() and os.name != 'nt':
        _msg_header = u'\033[0;31mERROR:\033[0m'
    else:
        _msg_header = u'ERROR:'
    error_message = u'%s %s' % (_msg_header, message)
    self.trouble(error_message, tb)
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit.

    start_time is the time.time() value at which the transfer started and
    byte_counter the number of bytes transferred since then. Nothing
    happens when no 'ratelimit' option is set, when nothing has been
    transferred yet, or when no time has elapsed.
    """
    rate_limit = self.params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        return
    now = time.time()
    elapsed = now - start_time
    if elapsed <= 0.0:
        # Avoids a division by zero (or a negative speed) below
        return
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep for as long as it takes the average speed to drop back to
        # the limit.
        time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)
def report_writedescription(self, descfn):
    """Announce on screen that the description file descfn is being written."""
    notice = u'[info] Writing video description to: ' + descfn
    self.to_screen(notice)
def report_writesubtitles(self, sub_filename):
    """Announce on screen that the subtitles file sub_filename is being written."""
    notice = u'[info] Writing video subtitles to: ' + sub_filename
    self.to_screen(notice)
def report_writeinfojson(self, infofn):
    """Report that the JSON metadata file infofn is being written."""
    # The message previously lacked its verb ("Video description metadata
    # as JSON to: ..."); it now reads like the other "Writing ..." notices.
    self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn)
def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded.

    Falls back to a message without the file name when the name cannot be
    encoded for output.
    """
    try:
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
    except UnicodeEncodeError:
        self.to_screen(u'[download] The file has already been downloaded')
def increment_downloads(self):
    """Advance by one the counter used to number downloaded files."""
    self._num_downloads = self._num_downloads + 1
def prepare_filename(self, info_dict):
    """Generate the output filename.

    The 'outtmpl' option is filled in with a sanitized copy of info_dict,
    extended with 'epoch' and 'autonumber'. Returns the filename, or None
    (after reporting the error) when the template cannot be expanded.
    """
    try:
        template_dict = dict(info_dict)

        template_dict['epoch'] = int(time.time())
        autonumber_size = self.params.get('autonumber_size')
        if autonumber_size is None:
            # NOTE(review): default width restored as 5 -- confirm.
            autonumber_size = 5
        autonumber_templ = u'%0' + str(autonumber_size) + u'd'
        template_dict['autonumber'] = autonumber_templ % self._num_downloads
        # .get(): a missing 'playlist_index' must not be reported as an
        # erroneous template unless the template actually uses the field.
        if template_dict.get('playlist_index') is not None:
            template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']

        # NOTE(review): the is_id argument is restored from the otherwise
        # unused key parameter -- confirm against sanitize_filename.
        sanitize = lambda k, v: sanitize_filename(
            u'NA' if v is None else compat_str(v),
            restricted=self.params.get('restrictfilenames'),
            is_id=(k == u'id'))
        template_dict = dict((k, sanitize(k, v)) for k, v in template_dict.items())

        filename = self.params['outtmpl'] % template_dict
        return filename
    except KeyError:
        self.report_error(u'Erroneous output template')
        return None
    except ValueError as err:
        self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
        return None
def _match_entry(self, info_dict):
    """Return None iff the file should be downloaded.

    Otherwise return a unicode string explaining why it is skipped: the
    title fails 'matchtitle', matches 'rejecttitle', the upload date is
    outside 'daterange', or the video is in the download archive.

    The reason carries no '[download] ' prefix; the caller adds it.
    Previously some reasons had the prefix and others did not, so the
    prefix was printed twice for title and date mismatches.
    """
    title = info_dict['title']
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle:
        if not re.search(matchtitle, title, re.IGNORECASE):
            return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle:
        if re.search(rejecttitle, title, re.IGNORECASE):
            return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
    date = info_dict.get('upload_date', None)
    if date is not None:
        dateRange = self.params.get('daterange', DateRange())
        if date not in dateRange:
            return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
    if self.in_download_archive(info_dict):
        return (u'%(title)s has already been recorded in archive'
                % info_dict)
    return None
def extract_info(self, url, download=True, ie_key=None, extra_info={}):
    """
    Extract the information for url with the first suitable InfoExtractor.

    If ie_key is given only that extractor is tried, otherwise the
    registered extractors are tried in order.
    If 'download', also downloads the videos.
    extra_info is a dict containing the extra values to add to each result
    (it is only read here, never modified).

    Returns the value of process_ie_result for the extracted result, or
    None when no extractor is suitable, the extractor reports nothing, or
    an error was reported.
    """
    if ie_key:
        ies = [self.get_info_extractor(ie_key)]
    else:
        ies = self._ies

    for ie in ies:
        if not ie.suitable(url):
            continue

        if not ie.working():
            self.report_warning(u'The program functionality for this site has been marked as broken, '
                                u'and will probably not work.')

        try:
            ie_result = ie.extract(url)
            if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                break
            if isinstance(ie_result, list):
                # Backwards compatibility: old IE result format
                for result in ie_result:
                    result.update(extra_info)
                ie_result = {
                    '_type': 'compat_list',
                    'entries': ie_result,
                }
            else:
                ie_result.update(extra_info)
            if 'extractor' not in ie_result:
                ie_result['extractor'] = ie.IE_NAME
            return self.process_ie_result(ie_result, download=download)
        except ExtractorError as de:  # An error we somewhat expected
            self.report_error(compat_str(de), de.format_traceback())
            break
        except Exception as e:
            if self.params.get('ignoreerrors', False):
                self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                break
            else:
                raise
    else:
        # The loop ended without any extractor accepting the URL
        self.report_error(u'no suitable InfoExtractor: %s' % url)
def process_ie_result(self, ie_result, download=True, extra_info={}):
    """
    Take the result of the ie (may be modified) and resolve all unresolved
    references (URLs, playlist items).

    It will also download the videos if 'download'.
    extra_info holds extra values to merge into video results (read only).
    Returns the resolved ie_result.
    Raises Exception for an unknown '_type'.
    """
    result_type = ie_result.get('_type', 'video')  # If not given we suppose it's a video, support the default old system
    if result_type == 'video':
        ie_result.update(extra_info)
        if 'playlist' not in ie_result:
            # It isn't part of a playlist
            ie_result['playlist'] = None
            ie_result['playlist_index'] = None
        if download:
            self.process_info(ie_result)
        return ie_result
    elif result_type == 'url':
        # We have to add extra_info to the results because it may be
        # contained in a playlist
        return self.extract_info(ie_result['url'],
                                 download,
                                 ie_key=ie_result.get('ie_key'),
                                 extra_info=extra_info)
    elif result_type == 'playlist':
        # We process each entry in the playlist
        playlist = ie_result.get('title', None) or ie_result.get('id', None)
        self.to_screen(u'[download] Downloading playlist: %s' % playlist)

        playlist_results = []

        n_all_entries = len(ie_result['entries'])
        # 'playliststart' is 1-based, list slicing is 0-based
        playliststart = self.params.get('playliststart', 1) - 1
        playlistend = self.params.get('playlistend', -1)

        if playlistend == -1:
            entries = ie_result['entries'][playliststart:]
        else:
            entries = ie_result['entries'][playliststart:playlistend]

        n_entries = len(entries)

        self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                       (ie_result['extractor'], playlist, n_all_entries, n_entries))

        for i, entry in enumerate(entries, 1):
            self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
            extra = {
                'playlist': playlist,
                'playlist_index': i + playliststart,
            }
            if 'extractor' not in entry:
                # We set the extractor, if it's an url it will be set then to
                # the new extractor, but if it's already a video we must make
                # sure it's present: see issue #877
                entry['extractor'] = ie_result['extractor']
            entry_result = self.process_ie_result(entry,
                                                  download=download,
                                                  extra_info=extra)
            playlist_results.append(entry_result)
        ie_result['entries'] = playlist_results
        return ie_result
    elif result_type == 'compat_list':
        def _fixup(r):
            # Old-style results do not name their extractor
            r.setdefault('extractor', ie_result['extractor'])
            return r
        ie_result['entries'] = [
            self.process_ie_result(_fixup(r), download=download)
            for r in ie_result['entries']
        ]
        return ie_result
    else:
        raise Exception('Invalid result type: %s' % result_type)
def process_info(self, info_dict):
    """Process a single resolved IE result.

    info_dict is modified in place ('fulltitle', a title truncated to 200
    characters, 'stitle' and 'format' are filled in). Depending on the
    options, the requested fields are printed, the description, subtitles,
    JSON metadata and thumbnail are written next to the output file, the
    video is downloaded and post-processed, and the download is recorded
    in the archive.

    Raises MaxDownloadsReached when the 'max_downloads' option is exceeded
    and UnavailableVideoError when the download fails with an OS/IO error.
    """
    assert info_dict.get('_type', 'video') == 'video'
    # The download count is incremented here to match the previous behaviour.
    self.increment_downloads()

    info_dict['fulltitle'] = info_dict['title']
    if len(info_dict['title']) > 200:
        info_dict['title'] = info_dict['title'][:197] + u'...'

    # Keep for backwards compatibility
    info_dict['stitle'] = info_dict['title']

    if 'format' not in info_dict:
        info_dict['format'] = info_dict['ext']

    reason = self._match_entry(info_dict)
    if reason is not None:
        self.to_screen(u'[download] ' + reason)
        return

    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads > int(max_downloads):
            raise MaxDownloadsReached()

    filename = self.prepare_filename(info_dict)

    # Forced printings
    if self.params.get('forcetitle', False):
        compat_print(info_dict['title'])
    if self.params.get('forceid', False):
        compat_print(info_dict['id'])
    if self.params.get('forceurl', False):
        # For RTMP URLs, also include the playpath
        compat_print(info_dict['url'] + info_dict.get('play_path', u''))
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        compat_print(info_dict['thumbnail'])
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        compat_print(info_dict['description'])
    if self.params.get('forcefilename', False) and filename is not None:
        compat_print(filename)
    if self.params.get('forceformat', False):
        compat_print(info_dict['format'])

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        return

    # prepare_filename has already reported the error
    if filename is None:
        return

    try:
        dn = os.path.dirname(encodeFilename(filename))
        if dn != '' and not os.path.exists(dn):
            os.makedirs(dn)
    except (OSError, IOError) as err:
        self.report_error(u'unable to create directory ' + compat_str(err))
        return

    if self.params.get('writedescription', False):
        try:
            descfn = filename + u'.description'
            self.report_writedescription(descfn)
            with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                descfile.write(info_dict['description'])
        except (KeyError, TypeError):
            self.report_warning(u'There\'s no description to write.')
        except (OSError, IOError):
            self.report_error(u'Cannot write description file ' + descfn)
            return

    subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                   self.params.get('writeautomaticsub')])

    if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
        # subtitles download errors are already managed as troubles in relevant IE
        # that way it will silently go on when used with unsupporting IE
        subtitles = info_dict['subtitles']
        sub_format = self.params.get('subtitlesformat')
        for sub_lang, sub in subtitles.items():
            if sub is None:
                continue
            # Computed before the try block so that the error message below
            # can always name the file. It used to reference descfn, which
            # is the description file and is unbound unless
            # 'writedescription' is set.
            sub_filename = subtitles_filename(filename, sub_lang, sub_format)
            try:
                self.report_writesubtitles(sub_filename)
                with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                    subfile.write(sub)
            except (OSError, IOError):
                self.report_error(u'Cannot write subtitles file ' + sub_filename)
                return

    if self.params.get('writeinfojson', False):
        infofn = filename + u'.info.json'
        self.report_writeinfojson(infofn)
        try:
            # 'urlhandle' is left out of the JSON metadata
            json_info_dict = dict((k, v) for k, v in info_dict.items() if k not in ['urlhandle'])
            write_json_file(json_info_dict, encodeFilename(infofn))
        except (OSError, IOError):
            self.report_error(u'Cannot write metadata to JSON file ' + infofn)
            return

    if self.params.get('writethumbnail', False):
        if info_dict.get('thumbnail') is not None:
            thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
            thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
            self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                           (info_dict['extractor'], info_dict['id']))
            try:
                uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
                try:
                    with open(thumb_filename, 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                finally:
                    # Do not leak the connection, whatever happens
                    uf.close()
                self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
                               (info_dict['extractor'], info_dict['id'], thumb_filename))
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_warning(u'Unable to download thumbnail "%s": %s' %
                                    (info_dict['thumbnail'], compat_str(err)))

    if not self.params.get('skip_download', False):
        if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
            # The existing file is treated as a successful download
            success = True
        else:
            try:
                success = self.fd._do_download(filename, info_dict)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_error(u'unable to download video data: %s' % str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError) as err:
                self.report_error(u'postprocessing: %s' % str(err))
                return

    self.record_download_archive(info_dict)
def download(self, url_list):
    """Download a given list of URLs.

    Returns the download return code (0 unless an ignored error occurred).
    Raises SameFileError when several URLs would be written to the same
    fixed output file, and re-raises MaxDownloadsReached after announcing
    it.
    """
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    for url in url_list:
        try:
            # It also downloads the videos
            self.extract_info(url)
        except UnavailableVideoError:
            self.report_error(u'unable to download video')
        except MaxDownloadsReached:
            self.to_screen(u'[info] Maximum number of downloaded files reached.')
            raise

    return self._download_retcode
def post_process(self, filename, ie_info):
    """Run all the postprocessors on the given file.

    Each post-processor receives a copy of ie_info extended with
    'filepath' and answers with a (keep_video_wish, info) pair. The
    original file is deleted when at least one post-processor asks for it
    (False), none asks to keep it (True), and the 'keepvideo' option is
    not set.
    """
    info = dict(ie_info)
    info['filepath'] = filename
    keep_video = None
    for pp in self._pps:
        try:
            keep_video_wish, new_info = pp.run(info)
            if keep_video_wish is not None:
                if keep_video_wish:
                    # A wish to keep the file always wins
                    keep_video = keep_video_wish
                elif keep_video is None:
                    # No clear decision yet, let IE decide
                    keep_video = keep_video_wish
        except PostProcessingError as e:
            self.report_error(e.msg)
    if keep_video is False and not self.params.get('keepvideo', False):
        try:
            self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
            os.remove(encodeFilename(filename))
        except (IOError, OSError):
            self.report_warning(u'Unable to remove downloaded video file')
def in_download_archive(self, info_dict):
    """Return True if the video is listed in the download archive.

    The archive is the file named by the 'download_archive' option; each
    line holds '<extractor> <id>'. Returns False when the option is not
    set or the archive file does not exist yet. Other IOErrors propagate.
    """
    fn = self.params.get('download_archive')
    if fn is None:
        return False
    vid_id = info_dict['extractor'] + u' ' + info_dict['id']
    try:
        with locked_file(fn, 'r', encoding='utf-8') as archive_file:
            for line in archive_file:
                if line.strip() == vid_id:
                    return True
    except IOError as ioe:
        # A missing archive simply means nothing was recorded yet
        if ioe.errno != errno.ENOENT:
            raise
    return False
def record_download_archive(self, info_dict):
    """Append the video to the download archive.

    A line '<extractor> <id>' is added to the file named by the
    'download_archive' option. Nothing happens when the option is not set.
    """
    fn = self.params.get('download_archive')
    if fn is None:
        return
    vid_id = info_dict['extractor'] + u' ' + info_dict['id']
    with locked_file(fn, 'a', encoding='utf-8') as archive_file:
        archive_file.write(vid_id + u'\n')