Commit 24edc6d4 authored by Iustin Pop

watcher: handle full and drained queue cases



Currently the watcher is broken when the queue is full, and thus it
cannot fulfill its job as a queue cleaner. It also doesn't handle the
drained-queue state gracefully.

This patch makes a few changes:
  - first archive old jobs, and only afterwards submit new ones; this
    fixes the case where the queue is already full and contains jobs
    suited for archiving (but not the case where all jobs are too
    young to be archived)
  - handle the job-queue-full and drained cases gracefully: instead of
    tracebacks, log such cases cleanly
  - reverse the initial value and special cases for update_file; we now
    whitelist instead of blacklist cases, since there are many more
    blacklist cases than whitelist ones, and we set the flag to True
    only after a successful run

The last change, especially, is a significant one: errors during the
watcher run will no longer update the status file, so failures are no
longer buried in the logs and lost.
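
To make the combined effect concrete, here is a simplified sketch of the resulting control flow. It is illustrative only: `WatcherState`, `archive_jobs`, and `run_watcher` are hypothetical stand-ins for the real Ganeti classes, and the exception names mirror the ones added in the diff below, not the full `ganeti.errors` hierarchy.

```python
import logging
import sys

# Stand-ins mirroring errors.JobQueueFull / errors.JobQueueDrainError.
class JobQueueFull(Exception): pass
class JobQueueDrainError(Exception): pass

class WatcherState:
  """Hypothetical stand-in for the on-disk watcher state file."""
  def Save(self):
    logging.info("state file updated")

def archive_jobs(age):
  """Stand-in for Watcher.ArchiveJobs: frees queue slots held by old jobs."""

def run_watcher(notepad):
  """Stand-in for Watcher.Run: may raise the queue exceptions above."""

def main():
  # Whitelist approach: assume the run failed until proven otherwise,
  # so any error leaves the state file untouched and stays visible.
  update_file = False
  notepad = WatcherState()
  try:
    try:
      # Archive old jobs *before* anything that submits new ones, so a
      # full queue can still be cleaned up.
      archive_jobs(age=21600)
      run_watcher(notepad)
      update_file = True  # only a successful run updates the state file
    finally:
      if update_file:
        notepad.Save()
  except JobQueueFull:
    logging.error("Job queue is full, can't query cluster state")
  except JobQueueDrainError:
    logging.error("Job queue is drained, can't maintain cluster state")
  except Exception as err:  # log instead of letting a traceback escape
    logging.error(str(err), exc_info=True)
    sys.exit(1)

if __name__ == "__main__":
  logging.basicConfig(level=logging.INFO)
  main()
```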
Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Michael Hanselmann <hansmi@google.com>
parent 59b4eeef
@@ -298,6 +298,9 @@ class Watcher(object):
     master = client.QueryConfigValues(["master_node"])[0]
     if master != utils.HostInfo().name:
       raise NotMasterError("This is not the master node")
+    # first archive old jobs
+    self.ArchiveJobs(opts.job_age)
+    # and only then submit new ones
     self.instances, self.bootids, self.smap = GetClusterData()
     self.started_instances = set()
     self.opts = opts
@@ -307,12 +310,12 @@
     """
     notepad = self.notepad
 
-    self.ArchiveJobs(self.opts.job_age)
     self.CheckInstances(notepad)
     self.CheckDisks(notepad)
     self.VerifyDisks()
 
-  def ArchiveJobs(self, age):
+  @staticmethod
+  def ArchiveJobs(age):
     """Archive old jobs.
 
     """
@@ -459,7 +462,7 @@ def main():
   utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                      stderr_logging=options.debug)
 
-  update_file = True
+  update_file = False
   try:
     notepad = WatcherState()
     try:
@@ -468,13 +471,13 @@
       except errors.OpPrereqError:
         # this is, from cli.GetClient, a not-master case
         logging.debug("Not on master, exiting")
+        update_file = True
         sys.exit(constants.EXIT_SUCCESS)
       except luxi.NoMasterError, err:
         logging.warning("Master seems to be down (%s), trying to restart",
                         str(err))
         if not StartMaster():
           logging.critical("Can't start the master, exiting")
-          update_file = False
           sys.exit(constants.EXIT_FAILURE)
         # else retry the connection
         client = cli.GetClient()
@@ -483,9 +486,12 @@
         watcher = Watcher(options, notepad)
       except errors.ConfigurationError:
         # Just exit if there's no configuration
+        update_file = True
         sys.exit(constants.EXIT_SUCCESS)
 
       watcher.Run()
+      update_file = True
+
     finally:
       if update_file:
         notepad.Save()
@@ -499,6 +505,10 @@ def main():
   except errors.ResolverError, err:
     logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
     sys.exit(constants.EXIT_NODESETUP_ERROR)
+  except errors.JobQueueFull:
+    logging.error("Job queue is full, can't query cluster state")
+  except errors.JobQueueDrainError:
+    logging.error("Job queue is drained, can't maintain cluster state")
   except Exception, err:
     logging.error(str(err), exc_info=True)
     sys.exit(constants.EXIT_FAILURE)
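
One detail worth noting from the second hunk: ArchiveJobs uses no instance state, so making it a @staticmethod is consistent with the constructor now calling it before the cluster data is fetched. A minimal illustration of the pattern (stub body, not the real implementation):

```python
class Watcher:
  @staticmethod
  def ArchiveJobs(age):
    # Stub: the real method archives jobs older than `age` seconds.
    print("archiving jobs older than %d seconds" % age)

# Callable with or without an instance, e.g. early in __init__:
Watcher.ArchiveJobs(21600)
```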