Commit 14970c32 authored by Adeodato Simo, committed by Guido Trotter
Browse files

Cluster verify: make "instance runs in wrong node" node-driven



Previously, the "instance should not be running in this node" error was
computed by verifying, for each instance, whether any node other than its
primary was running it. But this is not a well-suited approach if we were
to shard cluster verification (because, for each instance, we won't have
information whether it's running *outside* the current set of nodes).

By reversing the logic of the check, and asking instead, for each node,
"is it running any instance for which it's not primary", we catch all
occurrences of the problem even if running sharded.

Because of this, we can also detect orphan instances at the same time
(instances that are not known in the cluster config). We warn about them
here too, and drop the later _VerifyOrphanInstances check.
Signed-off-by: Adeodato Simo <dato@google.com>
Signed-off-by: Guido Trotter <ultrotter@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
parent 4e272d8c
......@@ -1672,12 +1672,6 @@ class LUClusterVerify(LogicalUnit):
"instance not running on its primary node %s",
node_current)
for node, n_img in node_image.items():
if node != node_current:
test = instance in n_img.instances
_ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
"instance should not run on node %s", node)
diskdata = [(nname, success, status, idx)
for (nname, disks) in diskstatus.items()
for idx, (success, status) in enumerate(disks)]
......@@ -1717,18 +1711,6 @@ class LUClusterVerify(LogicalUnit):
self._ErrorIf(test, self.ENODEORPHANLV, node,
"volume %s is unknown", volume)
def _VerifyOrphanInstances(self, instancelist, node_image):
  """Flag instances that run on a node but are unknown to the cluster.

  Each instance listed in a node image is looked up in C{instancelist};
  the outcome of that lookup is handed to C{self._ErrorIf} together with
  the C{ENODEORPHANINSTANCE} error code, so C{_ErrorIf} is invoked once
  per (node, running instance) pair.

  @param instancelist: container of the instance names known to the
      cluster; only membership tests are performed on it
  @param node_image: mapping from node name to an object exposing an
      C{instances} iterable with the instances running on that node

  """
  for (node_name, image) in node_image.items():
    for running in image.instances:
      # True when the node runs something the configuration has no record of
      is_orphan = running not in instancelist
      self._ErrorIf(is_orphan, self.ENODEORPHANINSTANCE, node_name,
                    "instance %s on node %s should not exist",
                    running, node_name)
def _VerifyNPlusOneMemory(self, node_image, instance_cfg):
"""Verify N+1 Memory Resilience.
......@@ -2490,12 +2472,25 @@ class LUClusterVerify(LogicalUnit):
self._UpdateNodeInstances(node_i, nresult, nimg)
self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
self._UpdateNodeOS(node_i, nresult, nimg)
if not nimg.os_fail:
if refos_img is None:
refos_img = nimg
self._VerifyNodeOS(node_i, nimg, refos_img)
self._VerifyNodeBridges(node_i, nresult, bridges)
# Check whether all running instances are primary for the node. (This
# can no longer be done from _VerifyInstance below, since some of the
# wrong instances could be from other node groups.)
non_primary_inst = set(nimg.instances).difference(nimg.pinst)
for inst in non_primary_inst:
test = inst in self.all_inst_info
_ErrorIf(test, self.EINSTANCEWRONGNODE, inst,
"instance should not run on node %s", node_i.name)
_ErrorIf(not test, self.ENODEORPHANINSTANCE, node_i.name,
"node is running unknown instance %s", inst)
feedback_fn("* Verifying instance status")
for instance in self.my_inst_names:
if verbose:
......@@ -2576,9 +2571,6 @@ class LUClusterVerify(LogicalUnit):
reserved = utils.FieldSet(*cluster.reserved_lvs)
self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
feedback_fn("* Verifying orphan instances")
self._VerifyOrphanInstances(set(self.all_inst_info.keys()), node_image)
if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
feedback_fn("* Verifying N+1 Memory redundancy")
self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment