Commit 2c2f257d authored by Michael Hanselmann

Fix cluster verification issues on multi-group clusters



This patch attempts to fix a number of issues with “gnt-cluster verify”
in the presence of multiple node groups and DRBD8 instances split over
nodes in more than one group.

- Look up instances in a group only by their primary node (otherwise
  split instances would be considered when verifying any of their
  nodes' groups)
- When gathering additional nodes for LV checks, compare each instance
  node's group with the currently verified group instead of comparing
  against the primary node's group
- Exclude nodes in other groups when calculating N+1 errors and checking
  logical volumes

Not directly related, but a small error message is also clarified; a short
sketch of the resulting group-membership rules follows below.
Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
parent 6b826dfa
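
The bullet points above reduce to two membership rules: an instance belongs to the group of its primary node, and any of its nodes that sit outside the currently verified group are picked up separately for the LV check. The following is a minimal, self-contained sketch of those rules, not the Ganeti implementation itself; Node, Instance and the two helper functions are hypothetical stand-ins for the real configuration objects used in LUClusterVerifyGroup (see the diff below).

# Hypothetical sketch of the group-membership rules described above;
# class and function names are illustrative only.
from dataclasses import dataclass, field

@dataclass
class Node:
  name: str
  group: str                      # UUID of the owning node group

@dataclass
class Instance:
  name: str
  primary_node: str
  secondary_nodes: list = field(default_factory=list)

  @property
  def all_nodes(self):
    return [self.primary_node] + list(self.secondary_nodes)

def instances_in_group(instances, nodes, group_uuid):
  """Select instances by primary node only, so a DRBD8 instance split
  across two groups is verified exactly once, in its primary's group.
  """
  return [inst for inst in instances
          if nodes[inst.primary_node].group == group_uuid]

def extra_lv_nodes(instances, nodes, group_uuid):
  """Nodes outside the verified group that hold LVs of its instances;
  every instance node is compared against the verified group itself,
  not against the primary node's group.
  """
  extra = set()
  for inst in instances:
    for nname in inst.all_nodes:
      if nodes[nname].group != group_uuid:
        extra.add(nname)
  return extra

For a split instance with its primary in group A and its secondary in group B, instances_in_group selects it only when A is being verified, while extra_lv_nodes run for A still reports the secondary in B so the instance's volumes can be checked.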
@@ -1704,7 +1704,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
 
     # Get instances in node group; this is unsafe and needs verification later
-    inst_names = self.cfg.GetNodeGroupInstances(self.group_uuid)
+    inst_names = \
+      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
 
     self.needed_locks = {
       locking.LEVEL_INSTANCE: inst_names,
@@ -1738,7 +1739,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
 
     group_nodes = set(self.group_info.members)
-    group_instances = self.cfg.GetNodeGroupInstances(self.group_uuid)
+    group_instances = \
+      self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
 
     unlocked_nodes = \
       group_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
@@ -1748,11 +1750,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     if unlocked_nodes:
       raise errors.OpPrereqError("Missing lock for nodes: %s" %
-                                 utils.CommaJoin(unlocked_nodes))
+                                 utils.CommaJoin(unlocked_nodes),
+                                 errors.ECODE_STATE)
 
     if unlocked_instances:
       raise errors.OpPrereqError("Missing lock for instances: %s" %
-                                 utils.CommaJoin(unlocked_instances))
+                                 utils.CommaJoin(unlocked_instances),
+                                 errors.ECODE_STATE)
 
     self.all_node_info = self.cfg.GetAllNodesInfo()
     self.all_inst_info = self.cfg.GetAllInstancesInfo()
@@ -1772,17 +1776,17 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
 
     for inst in self.my_inst_info.values():
       if inst.disk_template in constants.DTS_INT_MIRROR:
-        group = self.my_node_info[inst.primary_node].group
-        for nname in inst.secondary_nodes:
-          if self.all_node_info[nname].group != group:
+        for nname in inst.all_nodes:
+          if self.all_node_info[nname].group != self.group_uuid:
             extra_lv_nodes.add(nname)
 
     unlocked_lv_nodes = \
       extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
 
     if unlocked_lv_nodes:
-      raise errors.OpPrereqError("these nodes could be locked: %s" %
-                                 utils.CommaJoin(unlocked_lv_nodes))
+      raise errors.OpPrereqError("Missing node locks for LV check: %s" %
+                                 utils.CommaJoin(unlocked_lv_nodes),
+                                 errors.ECODE_STATE)
     self.extra_lv_nodes = list(extra_lv_nodes)
 
   def _VerifyNode(self, ninfo, nresult):
......@@ -2052,7 +2056,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
"""
for node, n_img in node_image.items():
if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or
self.all_node_info[node].group != self.group_uuid):
# skip non-healthy nodes
continue
for volume in n_img.volumes:
@@ -2079,11 +2084,11 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       # WARNING: we currently take into account down instances as well
       # as up ones, considering that even if they're down someone
       # might want to start them even in the event of a node failure.
-      if n_img.offline:
-        # we're skipping offline nodes from the N+1 warning, since
-        # most likely we don't have good memory infromation from them;
-        # we already list instances living on such nodes, and that's
-        # enough warning
+      if n_img.offline or self.all_node_info[node].group != self.group_uuid:
+        # we're skipping nodes marked offline and nodes in other groups from
+        # the N+1 warning, since most likely we don't have good memory
+        # infromation from them; we already list instances living on such
+        # nodes, and that's enough warning
         continue
       for prinode, instances in n_img.sbp.items():
         needed_mem = 0
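
Both Exec-time checks touched above, the orphan-volume scan and the N+1 memory check, now skip nodes that belong to another group. A hedged paraphrase of that shared rule, with node_info standing in for self.all_node_info and the function name being illustrative only:

def skip_for_group_checks(n_img, node_name, node_info, group_uuid,
                          lv_check=False):
  """Return True when a per-group Exec check should ignore this node.

  Nodes in another group and offline nodes are always skipped; the LV
  check additionally skips nodes with RPC or LVM failures.
  """
  if node_info[node_name].group != group_uuid:
    return True
  if n_img.offline:
    return True
  return bool(lv_check and (n_img.rpc_fail or n_img.lvm_fail))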