From cbfc468166d63c9f02f21018140df2558770ff9f Mon Sep 17 00:00:00 2001 From: Iustin Pop <iustin@google.com> Date: Fri, 5 Dec 2008 02:58:40 +0000 Subject: [PATCH] watcher: handle offline nodes better This patch changes the LUQueryInstances to show a different state for offline nodes and also modifies the watcher to understand the offline state in its checks. Reviewed-by: ultrotter --- daemons/ganeti-watcher | 13 +++++++------ lib/cmdlib.py | 8 +++++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher index 351f10229..c3f4ac990 100755 --- a/daemons/ganeti-watcher +++ b/daemons/ganeti-watcher @@ -43,7 +43,7 @@ from ganeti import cli MAXTRIES = 5 BAD_STATES = ['ERROR_down'] -HELPLESS_STATES = ['ERROR_nodedown'] +HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline'] NOTICE = 'NOTICE' ERROR = 'ERROR' KEY_RESTART_COUNT = "restart_count" @@ -267,8 +267,8 @@ def GetNodeBootIDs(): """Get a dict mapping nodes to boot IDs. """ - result = client.QueryNodes([], ["name", "bootid"]) - return dict([(name, bootid) for name, bootid in result]) + result = client.QueryNodes([], ["name", "bootid", "offline"]) + return dict([(name, (bootid, offline)) for name, bootid, offline in result]) class Watcher(object): @@ -301,12 +301,13 @@ class Watcher(object): """ check_nodes = [] - for name, new_id in self.bootids.iteritems(): + for name, (new_id, offline) in self.bootids.iteritems(): old = notepad.GetNodeBootID(name) if new_id is None: # Bad node, not returning a boot id - logging.debug("Node %s missing boot id, skipping secondary checks", - name) + if not offline: + logging.debug("Node %s missing boot id, skipping secondary checks", + name) continue if old != new_id: # Node's boot ID has changed, proably through a reboot. diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 76b3b07b5..6e2f41cd2 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -3007,11 +3007,15 @@ class LUQueryInstances(NoHooksLU): hv_list = list(set([inst.hypervisor for inst in instance_list])) bad_nodes = [] + off_nodes = [] if self.do_locking: live_data = {} node_data = self.rpc.call_all_instances_info(nodes, hv_list) for name in nodes: result = node_data[name] + if result.offline: + # offline nodes will be in both lists + off_nodes.append(name) if result.failed: bad_nodes.append(name) else: @@ -3048,7 +3052,9 @@ class LUQueryInstances(NoHooksLU): else: val = bool(live_data.get(instance.name)) elif field == "status": - if instance.primary_node in bad_nodes: + if instance.primary_node in off_nodes: + val = "ERROR_nodeoffline" + elif instance.primary_node in bad_nodes: val = "ERROR_nodedown" else: running = bool(live_data.get(instance.name)) -- GitLab