From 24edc6d4de0c388ca3c4faa790479a66100e89e6 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Mon, 25 May 2009 12:52:20 +0200
Subject: [PATCH] watcher: handle full and drained queue cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the watcher is broken when the queue is full, thus not
fulfilling its job as a queue cleaner. It also doesn't handle nicely the
queue drained status.

This patch does a few changes:
  - first archive jobs, and only after submit jobs; this fixes the case
    where the queue is already full and there are jobs suited for
    archiving (but not the case where the jobs are all too young to be
    archived)
  - handle nicely the job queue full and drained cases—instead of
    tracebacks, log such cases nicely
  - reverse the initial value and special cases for update_file; we now
    whitelist instead of blacklist cases, since we have much more
    blacklist cases than vice versa, and we set the flag to True only
    after the run is successful

The last change, especially, is a significant one: now errors during the
watcher run will not update the status file, and thus they won't be lost
again in the logs.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Michael Hanselmann <hansmi@google.com>
---
 daemons/ganeti-watcher | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 3cf96e5be..b762c6a9a 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -298,6 +298,9 @@ class Watcher(object):
     master = client.QueryConfigValues(["master_node"])[0]
     if master != utils.HostInfo().name:
       raise NotMasterError("This is not the master node")
+    # first archive old jobs
+    self.ArchiveJobs(opts.job_age)
+    # and only then submit new ones
     self.instances, self.bootids, self.smap = GetClusterData()
     self.started_instances = set()
     self.opts = opts
@@ -307,12 +310,12 @@ class Watcher(object):
 
     """
     notepad = self.notepad
-    self.ArchiveJobs(self.opts.job_age)
     self.CheckInstances(notepad)
     self.CheckDisks(notepad)
     self.VerifyDisks()
 
-  def ArchiveJobs(self, age):
+  @staticmethod
+  def ArchiveJobs(age):
     """Archive old jobs.
 
     """
@@ -459,7 +462,7 @@ def main():
   utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                      stderr_logging=options.debug)
 
-  update_file = True
+  update_file = False
   try:
     notepad = WatcherState()
     try:
@@ -468,13 +471,13 @@ def main():
       except errors.OpPrereqError:
         # this is, from cli.GetClient, a not-master case
         logging.debug("Not on master, exiting")
+        update_file = True
         sys.exit(constants.EXIT_SUCCESS)
       except luxi.NoMasterError, err:
         logging.warning("Master seems to be down (%s), trying to restart",
                         str(err))
         if not StartMaster():
           logging.critical("Can't start the master, exiting")
-          update_file = False
           sys.exit(constants.EXIT_FAILURE)
         # else retry the connection
         client = cli.GetClient()
@@ -483,9 +486,12 @@ def main():
         watcher = Watcher(options, notepad)
       except errors.ConfigurationError:
         # Just exit if there's no configuration
+        update_file = True
         sys.exit(constants.EXIT_SUCCESS)
 
       watcher.Run()
+      update_file = True
+
     finally:
       if update_file:
         notepad.Save()
@@ -499,6 +505,10 @@ def main():
   except errors.ResolverError, err:
     logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
     sys.exit(constants.EXIT_NODESETUP_ERROR)
+  except errors.JobQueueFull:
+    logging.error("Job queue is full, can't query cluster state")
+  except errors.JobQueueDrainError:
+    logging.error("Job queue is drained, can't maintain cluster state")
   except Exception, err:
     logging.error(str(err), exc_info=True)
     sys.exit(constants.EXIT_FAILURE)
-- 
GitLab