From f5116c873189a6e0743b29d0690e516621631728 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Fri, 23 Jul 2010 17:41:35 -0400
Subject: [PATCH] watcher: smarter handling of instance records
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch implements a few changes to instance handling. First, old
instances which no longer exist on the cluster are removed from the
state file, to keep it clean.
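
For illustration, the cleanup amounts to a set difference between the
recorded instance names and the ones the cluster currently reports (a
standalone sketch with made-up data, not the patch's code):

    # records loaded from the watcher state file (sample data)
    state_records = {"inst1": {}, "inst2": {}, "gone": {}}
    # instances the cluster currently knows about
    current = ["inst1", "inst2"]
    for name in set(state_records) - set(current):
        del state_records[name]   # forget instances that no longer exist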

Second, the instance restart counters are reset every 8 hours, since
some error cases might be transient (e.g. networking issues, or a
machine temporarily down); without such a reset, a problem that outlasts
the 5 allowed restarts but is not permanent would leave watcher refusing
to ever restart the instance again. The value of 8 hours is, I think,
both conservative (so as not to hammer the cluster too often with
restarts) and fast enough to clear semi-transient problems.
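
To make the timing concrete: with 5 allowed retries and a watcher run
every 5 minutes, the retry budget is exhausted after roughly half an
hour, so an 8-hour window (16 half-hour periods) leaves a wide margin.
The expiry itself is just a timestamp comparison; here is a standalone
sketch with a made-up record layout (the real state file uses its own
keys):

    import time

    RETRY_EXPIRATION = 8 * 3600   # seconds (8 hours)
    state_records = {"inst1": {"restart_when": time.time() - 9 * 3600}}
    cutoff = time.time() - RETRY_EXPIRATION
    expired = [n for n, rec in state_records.items()
               if rec["restart_when"] < cutoff]
    for name in expired:
        del state_records[name]   # expired: retry counter starts fresh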

And last, if an instance is not restarted because its retries are
exhausted, a warning should be logged; otherwise it is hard to
understand why watcher does not want to restart an ERROR_down instance.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: René Nussbaumer <rn@google.com>
---
 daemons/ganeti-watcher | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 87079dbc3..1a57d3bd8 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -55,6 +55,11 @@ import ganeti.rapi.client # pylint: disable-msg=W0611
 
 
 MAXTRIES = 5
+# Delete any record that is older than 8 hours; this value is based on
+# the fact that the current retry counter is 5 and the watcher runs
+# every 5 minutes, so it takes around half an hour to exceed the retry
+# counter; thus 8 hours (16 * 1/2h) seems like a reasonable reset time
+RETRY_EXPIRATION = 8 * 3600
 BAD_STATES = ['ERROR_down']
 HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
 NOTICE = 'NOTICE'
@@ -326,6 +331,28 @@ class WatcherState(object):
 
     return 0
 
+  def MaintainInstanceList(self, instances):
+    """Perform maintenance on the recorded instances.
+
+    @type instances: list of string
+    @param instances: the list of currently existing instances
+
+    """
+    idict = self._data["instance"]
+    # First, delete obsolete instances
+    obsolete_instances = set(idict).difference(instances)
+    for inst in obsolete_instances:
+      logging.debug("Forgetting obsolete instance %s", inst)
+      del idict[inst]
+
+    # Second, delete expired records
+    earliest = time.time() - RETRY_EXPIRATION
+    expired_instances = [i for i in idict
+                         if idict[i][KEY_RESTART_WHEN] < earliest]
+    for inst in expired_instances:
+      logging.debug("Expiring record for instance %s", inst)
+      del idict[inst]
+
   def RecordRestartAttempt(self, instance):
     """Record a restart attempt.
 
@@ -513,12 +540,15 @@ class Watcher(object):
     """Make a pass over the list of instances, restarting downed ones.
 
     """
+    notepad.MaintainInstanceList(self.instances.keys())
+
     for instance in self.instances.values():
       if instance.state in BAD_STATES:
         n = notepad.NumberOfRestartAttempts(instance)
 
         if n > MAXTRIES:
-          # stay quiet.
+          logging.warning("Not restarting instance %s, retries exhausted",
+                          instance.name)
           continue
         elif n < MAXTRIES:
           last = " (Attempt #%d)" % (n + 1)
-- 
GitLab