From 2c2f257d065d8d16355dcf798c06aee796700936 Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <hansmi@google.com>
Date: Wed, 25 Jan 2012 15:00:29 +0100
Subject: [PATCH] Fix cluster verification issues on multi-group clusters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch attempts to fix a number of issues with “gnt-cluster verify”
in the presence of multiple node groups and DRBD8 instances split over
nodes in more than one group.

- Look up instances in a group only by their primary node (otherwise
  split instances would be considered when verifying any of their
  nodes' groups; see the sketch below)
- When gathering additional nodes for LV checks, compare the groups of
  all of the instance's nodes with the group currently being verified,
  instead of comparing against the primary node's group
- Exclude nodes in other groups when calculating N+1 errors and checking
  logical volumes

Not directly related, but an error message is also clarified.
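
As an illustration of the primary-only lookup, here is a minimal
sketch of the intended semantics (a simplified stand-in for
GetNodeGroupInstances, not the actual ConfigWriter implementation;
the "instances" and "nodes" dicts are hypothetical):

  def get_node_group_instances(instances, nodes, group_uuid,
                               primary_only=False):
    """Return names of instances whose nodes are in the given group.

    With primary_only=True an instance is attributed to exactly one
    group (its primary node's), so a DRBD8 instance split over two
    groups is no longer returned for both of them.

    """
    result = []
    for inst in instances.values():
      if primary_only:
        inst_nodes = [inst.primary_node]
      else:
        inst_nodes = [inst.primary_node] + list(inst.secondary_nodes)
      if any(nodes[name].group == group_uuid for name in inst_nodes):
        result.append(inst.name)
    return result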

Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
---
 lib/cmdlib.py | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)
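
Note (not part of the commit message): the change to the extra-LV-node
gathering is easiest to see with a hypothetical split instance whose
primary node "n1" is in group "A" and whose secondary node "n2" is in
group "B". A simplified, self-contained sketch, not Ganeti's actual
classes:

    class Node:
      def __init__(self, group):
        self.group = group

    class Instance:
      def __init__(self, primary, secondaries):
        self.primary_node = primary
        self.secondary_nodes = secondaries
        self.all_nodes = [primary] + secondaries

    nodes = {"n1": Node("A"), "n2": Node("B")}
    split = Instance("n1", ["n2"])

    # Old behaviour: compare secondaries against the *primary* node's
    # group, regardless of which group is being verified
    def extra_lv_nodes_old(inst):
      group = nodes[inst.primary_node].group
      return set(n for n in inst.secondary_nodes
                 if nodes[n].group != group)

    # New behaviour: compare all of the instance's nodes against the
    # group currently being verified
    def extra_lv_nodes_new(inst, verified_group):
      return set(n for n in inst.all_nodes
                 if nodes[n].group != verified_group)

    # The old result is independent of which group is being verified,
    # so verifying group "B" also added "n2", a node already in that
    # group. With primary_only=True the instance is only looked at
    # while verifying "A", where the new code adds "n2" for LV checks.
    assert extra_lv_nodes_old(split) == set(["n2"])
    assert extra_lv_nodes_new(split, "A") == set(["n2"])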

diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index a052039c1..fb49df1ac 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -1704,7 +1704,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
 
     # Get instances in node group; this is unsafe and needs verification later
-    inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)
+    inst_names = \
+      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
 
     self.needed_locks = {
       locking.LEVEL_INSTANCE: inst_names,
@@ -1738,7 +1739,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
 
     group_nodes = set(self.group_info.members)
-    group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)
+    group_instances = \
+      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
 
     unlocked_nodes = \
         group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
@@ -1748,11 +1750,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     if unlocked_nodes:
       raise errors.OpPrereqError("Missing lock for nodes: %s" %
-                                 utils.CommaJoin(unlocked_nodes))
+                                 utils.CommaJoin(unlocked_nodes),
+                                 errors.ECODE_STATE)
 
     if unlocked_instances:
       raise errors.OpPrereqError("Missing lock for instances: %s" %
-                                 utils.CommaJoin(unlocked_instances))
+                                 utils.CommaJoin(unlocked_instances),
+                                 errors.ECODE_STATE)
 
     self.all_node_info = self.cfg.GetAllNodesInfo()
     self.all_inst_info = self.cfg.GetAllInstancesInfo()
@@ -1772,17 +1776,17 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     for inst in self.my_inst_info.values():
       if inst.disk_template in constants.DTS_INT_MIRROR:
-        group = self.my_node_info[inst.primary_node].group
-        for nname in inst.secondary_nodes:
-          if self.all_node_info[nname].group != group:
+        for nname in inst.all_nodes:
+          if self.all_node_info[nname].group != self.group_uuid:
             extra_lv_nodes.add(nname)
 
     unlocked_lv_nodes = \
         extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
 
     if unlocked_lv_nodes:
-      raise errors.OpPrereqError("these nodes could be locked: %s" %
-                                 utils.CommaJoin(unlocked_lv_nodes))
+      raise errors.OpPrereqError("Missing node locks for LV check: %s" %
+                                 utils.CommaJoin(unlocked_lv_nodes),
+                                 errors.ECODE_STATE)
     self.extra_lv_nodes = list(extra_lv_nodes)
 
   def _VerifyNode(self, ninfo, nresult):
@@ -2052,7 +2056,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     """
     for node, n_img in node_image.items():
-      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
+      if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or
+          self.all_node_info[node].group != self.group_uuid):
         # skip non-healthy nodes
         continue
       for volume in n_img.volumes:
@@ -2079,11 +2084,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       # WARNING: we currently take into account down instances as well
       # as up ones, considering that even if they're down someone
       # might want to start them even in the event of a node failure.
-      if n_img.offline:
-        # we're skipping offline nodes from the N+1 warning, since
-        # most likely we don't have good memory infromation from them;
-        # we already list instances living on such nodes, and that's
-        # enough warning
+      if n_img.offline or self.all_node_info[node].group != self.group_uuid:
+        # we're skipping nodes marked offline and nodes in other groups from
+        # the N+1 warning, since most likely we don't have good memory
+        # information from them; we already list instances living on such
+        # nodes, and that's enough warning
         continue
       for prinode, instances in n_img.sbp.items():
         needed_mem = 0
-- 
GitLab