From 24edc6d4de0c388ca3c4faa790479a66100e89e6 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Mon, 25 May 2009 12:52:20 +0200
Subject: [PATCH] watcher: handle full and drained queue cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the watcher is broken when the queue is full, and thus does
not fulfil its job as a queue cleaner. It also doesn't handle the
queue drained status nicely.

This patch makes a few changes:
  - first archive old jobs, and only afterwards submit new ones; this
    fixes the case where the queue is already full and there are jobs
    suited for archiving (but not the case where the jobs are all too
    young to be archived)
  - handle the job queue full and drained cases gracefully: instead of
    tracebacks, such cases are now logged cleanly
  - reverse the initial value and special cases for update_file; we now
    whitelist instead of blacklist cases, since there are many more
    cases to blacklist than to whitelist, and we set the flag to True
    only after the run is successful

The last change, especially, is a significant one: now errors during
the watcher run will not update the status file, and thus they will no
longer be lost in the logs. A short sketch of this whitelist pattern
follows the sign-off below.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Michael Hanselmann <hansmi@google.com>
---
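Note: as an illustration of the update_file change, here is a minimal,
self-contained sketch of the whitelist pattern. This is not the actual
watcher code; WatcherState, run_once and NotMasterExit are placeholder
names standing in for the real state-file wrapper, the per-run work and
the benign "not the master" exit path.

  # Illustrative sketch only, not the real ganeti-watcher code.
  import logging
  import sys

  class NotMasterExit(Exception):
    """Placeholder for a benign condition: this node is not the master."""

  class WatcherState(object):
    """Placeholder for the watcher's on-disk state file."""
    def Save(self):
      logging.info("state file saved")

  def run_once():
    """Placeholder for the real per-run work; may raise on errors."""

  def main():
    update_file = False  # whitelist: assume failure until proven otherwise
    notepad = WatcherState()
    try:
      try:
        run_once()
        update_file = True  # only a fully successful run saves state
      except NotMasterExit:
        update_file = True  # benign exit, also safe to save state
        sys.exit(0)
    finally:
      # Errors leave update_file False, so the old state is preserved
      # and the failure stays visible instead of being papered over.
      if update_file:
        notepad.Save()
      else:
        logging.warning("not updating the state file due to errors")

  if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()
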
""" @@ -459,7 +462,7 @@ def main(): utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug, stderr_logging=options.debug) - update_file = True + update_file = False try: notepad = WatcherState() try: @@ -468,13 +471,13 @@ def main(): except errors.OpPrereqError: # this is, from cli.GetClient, a not-master case logging.debug("Not on master, exiting") + update_file = True sys.exit(constants.EXIT_SUCCESS) except luxi.NoMasterError, err: logging.warning("Master seems to be down (%s), trying to restart", str(err)) if not StartMaster(): logging.critical("Can't start the master, exiting") - update_file = False sys.exit(constants.EXIT_FAILURE) # else retry the connection client = cli.GetClient() @@ -483,9 +486,12 @@ def main(): watcher = Watcher(options, notepad) except errors.ConfigurationError: # Just exit if there's no configuration + update_file = True sys.exit(constants.EXIT_SUCCESS) watcher.Run() + update_file = True + finally: if update_file: notepad.Save() @@ -499,6 +505,10 @@ def main(): except errors.ResolverError, err: logging.error("Cannot resolve hostname '%s', exiting.", err.args[0]) sys.exit(constants.EXIT_NODESETUP_ERROR) + except errors.JobQueueFull: + logging.error("Job queue is full, can't query cluster state") + except errors.JobQueueDrainError: + logging.error("Job queue is drained, can't maintain cluster state") except Exception, err: logging.error(str(err), exc_info=True) sys.exit(constants.EXIT_FAILURE) -- GitLab