From cbfc468166d63c9f02f21018140df2558770ff9f Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Fri, 5 Dec 2008 02:58:40 +0000
Subject: [PATCH] watcher: handle offline nodes better

This patch changes LUQueryInstances to report a distinct status
('ERROR_nodeoffline') for instances whose primary node is offline, and
modifies the watcher to take the offline state into account in its checks.

Reviewed-by: ultrotter
---
 daemons/ganeti-watcher | 13 +++++++------
 lib/cmdlib.py          |  8 +++++++-
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 351f10229..c3f4ac990 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -43,7 +43,7 @@ from ganeti import cli
 
 MAXTRIES = 5
 BAD_STATES = ['ERROR_down']
-HELPLESS_STATES = ['ERROR_nodedown']
+HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
 NOTICE = 'NOTICE'
 ERROR = 'ERROR'
 KEY_RESTART_COUNT = "restart_count"
@@ -267,8 +267,8 @@ def GetNodeBootIDs():
   """Get a dict mapping nodes to boot IDs.
 
   """
-  result = client.QueryNodes([], ["name", "bootid"])
-  return dict([(name, bootid) for name, bootid in result])
+  result = client.QueryNodes([], ["name", "bootid", "offline"])
+  return dict([(name, (bootid, offline)) for name, bootid, offline in result])
 
 
 class Watcher(object):
@@ -301,12 +301,13 @@ class Watcher(object):
 
     """
     check_nodes = []
-    for name, new_id in self.bootids.iteritems():
+    for name, (new_id, offline) in self.bootids.iteritems():
       old = notepad.GetNodeBootID(name)
       if new_id is None:
         # Bad node, not returning a boot id
-        logging.debug("Node %s missing boot id, skipping secondary checks",
-                      name)
+        if not offline:
+          logging.debug("Node %s missing boot id, skipping secondary checks",
+                        name)
         continue
       if old != new_id:
         # Node's boot ID has changed, proably through a reboot.
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index 76b3b07b5..6e2f41cd2 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -3007,11 +3007,15 @@ class LUQueryInstances(NoHooksLU):
     hv_list = list(set([inst.hypervisor for inst in instance_list]))
 
     bad_nodes = []
+    off_nodes = []
     if self.do_locking:
       live_data = {}
       node_data = self.rpc.call_all_instances_info(nodes, hv_list)
       for name in nodes:
         result = node_data[name]
+        if result.offline:
+          # offline nodes will be in both lists
+          off_nodes.append(name)
         if result.failed:
           bad_nodes.append(name)
         else:
@@ -3048,7 +3052,9 @@ class LUQueryInstances(NoHooksLU):
           else:
             val = bool(live_data.get(instance.name))
         elif field == "status":
-          if instance.primary_node in bad_nodes:
+          if instance.primary_node in off_nodes:
+            val = "ERROR_nodeoffline"
+          elif instance.primary_node in bad_nodes:
             val = "ERROR_nodedown"
           else:
             running = bool(live_data.get(instance.name))
-- 
GitLab