diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher index 87079dbc35d4163d768571c1f855a0280d117718..1a57d3bd80068d5cad3f0be939f516a613d428b4 100755 --- a/daemons/ganeti-watcher +++ b/daemons/ganeti-watcher @@ -55,6 +55,11 @@ import ganeti.rapi.client # pylint: disable-msg=W0611 MAXTRIES = 5 +# Delete any record that is older than 8 hours; this value is based on +# the fact that the current retry counter is 5, and watcher runs every +# 5 minutes, so it takes around half an hour to exceed the retry +# counter, so 8 hours (16*1/2h) seems like a reasonable reset time +RETRY_EXPIRATION = 8 * 3600 BAD_STATES = ['ERROR_down'] HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline'] NOTICE = 'NOTICE' @@ -326,6 +331,28 @@ class WatcherState(object): return 0 + def MaintainInstanceList(self, instances): + """Perform maintenance on the recorded instances. + + @type instances: list of string + @param instances: the list of currently existing instances + + """ + idict = self._data["instance"] + # First, delete obsolete instances + obsolete_instances = set(idict).difference(instances) + for inst in obsolete_instances: + logging.debug("Forgetting obsolete instance %s", inst) + del idict[inst] + + # Second, delete expired records + earliest = time.time() - RETRY_EXPIRATION + expired_instances = [i for i in idict + if idict[i][KEY_RESTART_WHEN] < earliest] + for inst in expired_instances: + logging.debug("Expiring record for instance %s", inst) + del idict[inst] + def RecordRestartAttempt(self, instance): """Record a restart attempt. @@ -513,12 +540,15 @@ class Watcher(object): """Make a pass over the list of instances, restarting downed ones. """ + notepad.MaintainInstanceList(self.instances.keys()) + for instance in self.instances.values(): if instance.state in BAD_STATES: n = notepad.NumberOfRestartAttempts(instance) if n > MAXTRIES: - # stay quiet. + logging.warning("Not restarting instance %s, retries exhausted", + instance.name) continue elif n < MAXTRIES: last = " (Attempt #%d)" % (n + 1)