From f5116c873189a6e0743b29d0690e516621631728 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Fri, 23 Jul 2010 17:41:35 -0400
Subject: [PATCH] watcher: smarter handling of instance records
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch implements a few changes to instance handling.

First, old instances that no longer exist on the cluster are removed
from the state file, to keep it clean.

Second, the instance restart counters are reset every 8 hours, since
some error cases might be transient (e.g. networking issues, or a
machine temporarily down); if a problem outlasts the 5 allowed restart
attempts but is not permanent, the watcher would otherwise never
restart the instance again. The value of 8 hours is, I think, both
conservative (so as not to hammer the cluster with restarts too often)
and fast enough to clear semi-transient problems.

And last, if an instance is not restarted because its retries are
exhausted, a warning is now logged; otherwise it is hard to understand
why the watcher does not restart an ERROR_down instance.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: René Nussbaumer <rn@google.com>
---
 daemons/ganeti-watcher | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 87079dbc3..1a57d3bd8 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -55,6 +55,11 @@ import ganeti.rapi.client # pylint: disable-msg=W0611
 
 
 MAXTRIES = 5
+# Delete any record that is older than 8 hours; this value is based on
+# the fact that the current retry counter is 5, and watcher runs every
+# 5 minutes, so it takes around half an hour to exceed the retry
+# counter, so 8 hours (16*1/2h) seems like a reasonable reset time
+RETRY_EXPIRATION = 8 * 3600
 BAD_STATES = ['ERROR_down']
 HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
 NOTICE = 'NOTICE'
@@ -326,6 +331,28 @@ class WatcherState(object):
 
     return 0
 
+  def MaintainInstanceList(self, instances):
+    """Perform maintenance on the recorded instances.
+
+    @type instances: list of string
+    @param instances: the list of currently existing instances
+
+    """
+    idict = self._data["instance"]
+    # First, delete obsolete instances
+    obsolete_instances = set(idict).difference(instances)
+    for inst in obsolete_instances:
+      logging.debug("Forgetting obsolete instance %s", inst)
+      del idict[inst]
+
+    # Second, delete expired records
+    earliest = time.time() - RETRY_EXPIRATION
+    expired_instances = [i for i in idict
+                         if idict[i][KEY_RESTART_WHEN] < earliest]
+    for inst in expired_instances:
+      logging.debug("Expiring record for instance %s", inst)
+      del idict[inst]
+
   def RecordRestartAttempt(self, instance):
     """Record a restart attempt.
 
@@ -513,12 +540,15 @@
     """Make a pass over the list of instances, restarting downed ones.
 
     """
+    notepad.MaintainInstanceList(self.instances.keys())
+
     for instance in self.instances.values():
       if instance.state in BAD_STATES:
         n = notepad.NumberOfRestartAttempts(instance)
 
         if n > MAXTRIES:
-          # stay quiet.
+          logging.warning("Not restarting instance %s, retries exhausted",
+                          instance.name)
           continue
         elif n < MAXTRIES:
           last = " (Attempt #%d)" % (n + 1)
-- 
GitLab
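
A minimal standalone sketch (not part of the patch) of the two pruning
rules the new WatcherState.MaintainInstanceList applies, assuming a plain
dict of instance records keyed by name; the helper name
prune_instance_records and its arguments are hypothetical and only
illustrate the idea (forget records for instances that no longer exist,
and expire records older than RETRY_EXPIRATION so semi-transient failures
get a fresh set of restart attempts).

    import time

    RETRY_EXPIRATION = 8 * 3600  # seconds, same value the patch introduces

    def prune_instance_records(records, existing_instances, now=None):
        """Hypothetical helper: records maps instance name to a state dict
        containing at least a "restart_when" timestamp (float)."""
        if now is None:
            now = time.time()
        earliest = now - RETRY_EXPIRATION
        for name in list(records):
            if name not in existing_instances:
                # Instance was removed from the cluster; forget its record.
                del records[name]
            elif records[name]["restart_when"] < earliest:
                # Record is older than 8 hours; drop it so the restart
                # counter effectively resets for this instance.
                del records[name]
        return records

For example, a record written 9 hours ago for an instance that still
exists would be dropped (resetting its retry count), while a record from
1 hour ago is kept and continues to count against MAXTRIES.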