Commit cbfc4681 authored by Iustin Pop
Browse files

watcher: handle offline nodes better

This patch changes LUQueryInstances to report a distinct status,
ERROR_nodeoffline, for instances whose primary node is offline, and
modifies the watcher to recognize the offline state in its checks.

Reviewed-by: ultrotter
parent 9ddb5e45
......@@ -43,7 +43,7 @@ from ganeti import cli
MAXTRIES = 5
BAD_STATES = ['ERROR_down']
HELPLESS_STATES = ['ERROR_nodedown']
HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
NOTICE = 'NOTICE'
ERROR = 'ERROR'
KEY_RESTART_COUNT = "restart_count"
......@@ -267,8 +267,8 @@ def GetNodeBootIDs():
"""Get a dict mapping nodes to boot IDs.
"""
result = client.QueryNodes([], ["name", "bootid"])
return dict([(name, bootid) for name, bootid in result])
result = client.QueryNodes([], ["name", "bootid", "offline"])
return dict([(name, (bootid, offline)) for name, bootid, offline in result])
class Watcher(object):
......@@ -301,12 +301,13 @@ class Watcher(object):
"""
check_nodes = []
for name, new_id in self.bootids.iteritems():
for name, (new_id, offline) in self.bootids.iteritems():
old = notepad.GetNodeBootID(name)
if new_id is None:
# Bad node, not returning a boot id
logging.debug("Node %s missing boot id, skipping secondary checks",
name)
if not offline:
logging.debug("Node %s missing boot id, skipping secondary checks",
name)
continue
if old != new_id:
# Node's boot ID has changed, probably through a reboot.
......
......@@ -3007,11 +3007,15 @@ class LUQueryInstances(NoHooksLU):
hv_list = list(set([inst.hypervisor for inst in instance_list]))
bad_nodes = []
off_nodes = []
if self.do_locking:
live_data = {}
node_data = self.rpc.call_all_instances_info(nodes, hv_list)
for name in nodes:
result = node_data[name]
if result.offline:
# offline nodes will be in both lists
off_nodes.append(name)
if result.failed:
bad_nodes.append(name)
else:
......@@ -3048,7 +3052,9 @@ class LUQueryInstances(NoHooksLU):
else:
val = bool(live_data.get(instance.name))
elif field == "status":
if instance.primary_node in bad_nodes:
if instance.primary_node in off_nodes:
val = "ERROR_nodeoffline"
elif instance.primary_node in bad_nodes:
val = "ERROR_nodedown"
else:
running = bool(live_data.get(instance.name))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment