diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher index 351f102292be468b2b53533295d4c306f56ef51f..c3f4ac990af476a79f0ae377ebee9e29efce86b2 100755 --- a/daemons/ganeti-watcher +++ b/daemons/ganeti-watcher @@ -43,7 +43,7 @@ from ganeti import cli MAXTRIES = 5 BAD_STATES = ['ERROR_down'] -HELPLESS_STATES = ['ERROR_nodedown'] +HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline'] NOTICE = 'NOTICE' ERROR = 'ERROR' KEY_RESTART_COUNT = "restart_count" @@ -267,8 +267,8 @@ def GetNodeBootIDs(): """Get a dict mapping nodes to boot IDs. """ - result = client.QueryNodes([], ["name", "bootid"]) - return dict([(name, bootid) for name, bootid in result]) + result = client.QueryNodes([], ["name", "bootid", "offline"]) + return dict([(name, (bootid, offline)) for name, bootid, offline in result]) class Watcher(object): @@ -301,12 +301,13 @@ class Watcher(object): """ check_nodes = [] - for name, new_id in self.bootids.iteritems(): + for name, (new_id, offline) in self.bootids.iteritems(): old = notepad.GetNodeBootID(name) if new_id is None: # Bad node, not returning a boot id - logging.debug("Node %s missing boot id, skipping secondary checks", - name) + if not offline: + logging.debug("Node %s missing boot id, skipping secondary checks", + name) continue if old != new_id: # Node's boot ID has changed, proably through a reboot. diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 76b3b07b56ae1db9ab6c9264848448e72516602f..6e2f41cd2498ce5bc886708fabbdd4350570eaa2 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -3007,11 +3007,15 @@ class LUQueryInstances(NoHooksLU): hv_list = list(set([inst.hypervisor for inst in instance_list])) bad_nodes = [] + off_nodes = [] if self.do_locking: live_data = {} node_data = self.rpc.call_all_instances_info(nodes, hv_list) for name in nodes: result = node_data[name] + if result.offline: + # offline nodes will be in both lists + off_nodes.append(name) if result.failed: bad_nodes.append(name) else: @@ -3048,7 +3052,9 @@ class LUQueryInstances(NoHooksLU): else: val = bool(live_data.get(instance.name)) elif field == "status": - if instance.primary_node in bad_nodes: + if instance.primary_node in off_nodes: + val = "ERROR_nodeoffline" + elif instance.primary_node in bad_nodes: val = "ERROR_nodedown" else: running = bool(live_data.get(instance.name))