Commit b7309a0d authored by Iustin Pop
Browse files

Fix the bug where the watcher does not restart instances

The watcher was using conflicting attributes of the instance:
  - it queried the admin_/oper_state, which are booleans
  - but it compared those to the status (which is a text field)

The code was changed to query the aggregated 'status' field, as that
also reports node problems, so this single field can be used for all
decisions. We still ask for the admin_state field, as that is needed
for the activate disks check (in secondary node restart).

The patch also touches the watcher in some other parts:
  - log exceptions nicer
  - convert a method to @staticmethod
  - remove unused imports

Reviewed-by: imsnah
parent 5188ab37
......@@ -30,8 +30,6 @@ by a node reboot. Run from cron or similar.
import os
import sys
import time
import fcntl
import errno
import logging
from optparse import OptionParser
......@@ -46,8 +44,8 @@ from ganeti import cli
MAXTRIES = 5
BAD_STATES = ['stopped']
HELPLESS_STATES = ['(node down)']
BAD_STATES = ['ERROR_down']
HELPLESS_STATES = ['ERROR_nodedown']
NOTICE = 'NOTICE'
ERROR = 'ERROR'
KEY_RESTART_COUNT = "restart_count"
......@@ -238,7 +236,7 @@ def GetInstanceList(with_secondaries=None):
"""Get a list of instances on this cluster.
"""
fields = ["name", "oper_state", "admin_state"]
fields = ["name", "status", "admin_state"]
if with_secondaries is not None:
fields.append("snodes")
......@@ -327,8 +325,9 @@ class Watcher(object):
try:
logging.info("Activating disks for instance %s", instance.name)
instance.ActivateDisks()
except Exception, err:
logging.error(str(err), exc_info=True)
except Exception:
logging.exception("Error while activating disks for instance %s",
instance.name)
# Keep changed boot IDs
for name in check_nodes:
......@@ -339,10 +338,6 @@ class Watcher(object):
"""
for instance in self.instances:
# Don't care about manually stopped instances
if not instance.autostart:
continue
if instance.state in BAD_STATES:
n = notepad.NumberOfRestartAttempts(instance)
......@@ -361,8 +356,8 @@ class Watcher(object):
instance.name, last)
instance.Restart()
self.started_instances.add(instance.name)
except Exception, err:
logging.error(str(err), exc_info=True)
except Exception:
logging.exception("Erro while restarting instance %s", instance.name)
notepad.RecordRestartAttempt(instance)
elif instance.state in HELPLESS_STATES:
......@@ -373,7 +368,8 @@ class Watcher(object):
notepad.RemoveInstance(instance)
logging.info("Restart of %s succeeded", instance.name)
def VerifyDisks(self):
@staticmethod
def VerifyDisks():
"""Run gnt-cluster verify-disks.
"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment