From 2c2f257d065d8d16355dcf798c06aee796700936 Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <hansmi@google.com>
Date: Wed, 25 Jan 2012 15:00:29 +0100
Subject: [PATCH] Fix cluster verification issues on multi-group clusters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch attempts to fix a number of issues with “gnt-cluster verify”
in the presence of multiple node groups and DRBD8 instances split over
nodes in more than one group.

- Look up instances in a group only by their primary node (otherwise
  split instances would be considered when verifying any of their
  nodes' groups)
- When gathering additional nodes for LV checks, compare the groups of
  an instance's nodes with the currently verified group instead of
  comparing against the primary node's group
- Exclude nodes in other groups when calculating N+1 errors and
  checking logical volumes

Not directly related, but an error message is also clarified.

Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
---
 lib/cmdlib.py | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index a052039c1..fb49df1ac 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -1704,7 +1704,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
 
     # Get instances in node group; this is unsafe and needs verification later
-    inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)
+    inst_names = \
+      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
 
     self.needed_locks = {
       locking.LEVEL_INSTANCE: inst_names,
@@ -1738,7 +1739,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
 
     group_nodes = set(self.group_info.members)
-    group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)
+    group_instances = \
+      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
 
     unlocked_nodes = \
       group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
@@ -1748,11 +1750,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     if unlocked_nodes:
       raise errors.OpPrereqError("Missing lock for nodes: %s" %
-                                 utils.CommaJoin(unlocked_nodes))
+                                 utils.CommaJoin(unlocked_nodes),
+                                 errors.ECODE_STATE)
 
     if unlocked_instances:
       raise errors.OpPrereqError("Missing lock for instances: %s" %
-                                 utils.CommaJoin(unlocked_instances))
+                                 utils.CommaJoin(unlocked_instances),
+                                 errors.ECODE_STATE)
 
     self.all_node_info = self.cfg.GetAllNodesInfo()
     self.all_inst_info = self.cfg.GetAllInstancesInfo()
@@ -1772,17 +1776,17 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     for inst in self.my_inst_info.values():
       if inst.disk_template in constants.DTS_INT_MIRROR:
-        group = self.my_node_info[inst.primary_node].group
-        for nname in inst.secondary_nodes:
-          if self.all_node_info[nname].group != group:
+        for nname in inst.all_nodes:
+          if self.all_node_info[nname].group != self.group_uuid:
             extra_lv_nodes.add(nname)
 
     unlocked_lv_nodes = \
       extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
 
     if unlocked_lv_nodes:
-      raise errors.OpPrereqError("these nodes could be locked: %s" %
-                                 utils.CommaJoin(unlocked_lv_nodes))
+      raise errors.OpPrereqError("Missing node locks for LV check: %s" %
+                                 utils.CommaJoin(unlocked_lv_nodes),
+                                 errors.ECODE_STATE)
     self.extra_lv_nodes = list(extra_lv_nodes)
 
   def _VerifyNode(self, ninfo, nresult):
@@ -2052,7 +2056,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     """
     for node, n_img in node_image.items():
-      if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
+      if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or
+          self.all_node_info[node].group != self.group_uuid):
         # skip non-healthy nodes
         continue
       for volume in n_img.volumes:
@@ -2079,11 +2084,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       # WARNING: we currently take into account down instances as well
       # as up ones, considering that even if they're down someone
      # might want to start them even in the event of a node failure.
-      if n_img.offline:
-        # we're skipping offline nodes from the N+1 warning, since
-        # most likely we don't have good memory infromation from them;
-        # we already list instances living on such nodes, and that's
-        # enough warning
+      if n_img.offline or self.all_node_info[node].group != self.group_uuid:
+        # we're skipping nodes marked offline and nodes in other groups from
+        # the N+1 warning, since most likely we don't have good memory
+        # infromation from them; we already list instances living on such
+        # nodes, and that's enough warning
         continue
       for prinode, instances in n_img.sbp.items():
         needed_mem = 0
-- 
GitLab
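
Note on the first bullet point: the change relies on GetNodeGroupInstances()
selecting instances by their primary node only when called with
primary_only=True, so a DRBD8 instance whose primary and secondary nodes sit
in different groups is picked up by exactly one group's verification run.
Below is a minimal standalone sketch of that selection logic, not Ganeti's
actual implementation; Node, Instance and group_instances are illustrative
names only.

# Illustrative sketch only; these namedtuples stand in for Ganeti's
# configuration objects and are not part of the patch.
from collections import namedtuple

Node = namedtuple("Node", "name group")  # group is a node-group UUID
Instance = namedtuple("Instance", "name primary_node secondary_nodes")

def group_instances(instances, nodes, group_uuid, primary_only=True):
  """Return names of instances considered part of the given node group."""
  result = []
  for inst in instances:
    in_group = nodes[inst.primary_node].group == group_uuid
    if not primary_only:
      # Without primary_only, a split DRBD instance would be returned for
      # both its primary node's group and its secondary node's group.
      in_group = in_group or any(nodes[n].group == group_uuid
                                 for n in inst.secondary_nodes)
    if in_group:
      result.append(inst.name)
  return result

# Example: a DRBD8 instance split across two groups is verified only from
# the group of its primary node.
nodes = {"node1": Node("node1", "group-A"), "node2": Node("node2", "group-B")}
split_inst = Instance("inst1", "node1", ["node2"])
print(group_instances([split_inst], nodes, "group-A"))                      # ['inst1']
print(group_instances([split_inst], nodes, "group-B"))                      # []
print(group_instances([split_inst], nodes, "group-B", primary_only=False))  # ['inst1']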