Commit 64c7b383 authored by Michael Hanselmann's avatar Michael Hanselmann
Browse files

LUClusterVerifyGroup: Spread SSH checks over more nodes

When verifying a group the code would always check SSH to all nodes in
the same group, as well as the first node for every other group. On big
clusters this can cause issues since many nodes will try to connect to
the first node of another group at the same time. This patch changes the
algorithm to choose a different node every time.

A unittest for the selection algorithm is included.
Signed-off-by: default avatarMichael Hanselmann <>
Reviewed-by: default avatarIustin Pop <>
parent 11705e3d
......@@ -520,12 +520,25 @@ def VerifyNode(what, cluster_name):
if constants.NV_NODELIST in what:
result[constants.NV_NODELIST] = tmp = {}
for node in what[constants.NV_NODELIST]:
(nodes, bynode) = what[constants.NV_NODELIST]
# Add nodes from other groups (different for each node)
except KeyError:
# Use a random order
# Try to contact all nodes
val = {}
for node in nodes:
success, message = _GetSshRunner(cluster_name).VerifyNodeHostname(node)
if not success:
tmp[node] = message
val[node] = message
result[constants.NV_NODELIST] = val
if constants.NV_NODENETTEST in what:
result[constants.NV_NODENETTEST] = tmp = {}
......@@ -2542,6 +2542,40 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
return instdisk
def _SshNodeSelector(group_uuid, all_nodes):
"""Create endless iterators for all potential SSH check hosts.
nodes = [node for node in all_nodes
if ( != group_uuid and
not node.offline)]
keyfunc = operator.attrgetter("group")
return map(itertools.cycle,
[sorted(map(operator.attrgetter("name"), names))
for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
"""Choose which nodes should talk to which other nodes.
We will make nodes contact all nodes in their group, and one node from
every other group.
@warning: This algorithm has a known issue if one node group is much
smaller than others (e.g. just one node). In such a case all other
nodes will talk to the single node.
online_nodes = sorted( for node in group_nodes if not node.offline)
sel = cls._SshNodeSelector(group_uuid, all_nodes)
return (online_nodes,
dict((name, sorted([ for i in sel]))
for name in online_nodes))
def BuildHooksEnv(self):
"""Build hooks env.
......@@ -2605,25 +2639,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
# We will make nodes contact all nodes in their group, and one node from
# every other group.
# TODO: should it be a *random* node, different every time?
online_nodes = [ for node in node_data_list if not node.offline]
other_group_nodes = {}
for name in sorted(self.all_node_info):
node = self.all_node_info[name]
if ( not in other_group_nodes
and != self.group_uuid
and not node.offline):
other_group_nodes[] =
node_verify_param = {
for files in filemap
for filename in files),
constants.NV_NODELIST: online_nodes + other_group_nodes.values(),
self._SelectSshCheckNodes(node_data_list, self.group_uuid,
constants.NV_HYPERVISOR: hypervisors,
_GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
......@@ -27,6 +27,7 @@ import unittest
import time
import tempfile
import shutil
import operator
from ganeti import constants
from ganeti import mcpu
......@@ -207,5 +208,64 @@ class TestLUGroupAssignNodes(unittest.TestCase):
self.assertEqual(set(["inst3c"]), set(prev))
class TestClusterVerifySsh(unittest.TestCase):
def testMultipleGroups(self):
fn = cmdlib.LUClusterVerifyGroup._SelectSshCheckNodes
mygroupnodes = [
objects.Node(name="node20", group="my", offline=False),
objects.Node(name="node21", group="my", offline=False),
objects.Node(name="node22", group="my", offline=False),
objects.Node(name="node23", group="my", offline=False),
objects.Node(name="node24", group="my", offline=False),
objects.Node(name="node25", group="my", offline=False),
objects.Node(name="node26", group="my", offline=True),
nodes = [
objects.Node(name="node1", group="g1", offline=True),
objects.Node(name="node2", group="g1", offline=False),
objects.Node(name="node3", group="g1", offline=False),
objects.Node(name="node4", group="g1", offline=True),
objects.Node(name="node5", group="g1", offline=False),
objects.Node(name="node10", group="xyz", offline=False),
objects.Node(name="node11", group="xyz", offline=False),
objects.Node(name="node40", group="alloff", offline=True),
objects.Node(name="node41", group="alloff", offline=True),
objects.Node(name="node50", group="aaa", offline=False),
] + mygroupnodes
assert not utils.FindDuplicates(map(operator.attrgetter("name"), nodes))
(online, perhost) = fn(mygroupnodes, "my", nodes)
self.assertEqual(online, ["node%s" % i for i in range(20, 26)])
self.assertEqual(set(perhost.keys()), set(online))
self.assertEqual(perhost, {
"node20": ["node10", "node2", "node50"],
"node21": ["node11", "node3", "node50"],
"node22": ["node10", "node5", "node50"],
"node23": ["node11", "node2", "node50"],
"node24": ["node10", "node3", "node50"],
"node25": ["node11", "node5", "node50"],
def testSingleGroup(self):
fn = cmdlib.LUClusterVerifyGroup._SelectSshCheckNodes
nodes = [
objects.Node(name="node1", group="default", offline=True),
objects.Node(name="node2", group="default", offline=False),
objects.Node(name="node3", group="default", offline=False),
objects.Node(name="node4", group="default", offline=True),
assert not utils.FindDuplicates(map(operator.attrgetter("name"), nodes))
(online, perhost) = fn(nodes, "default", nodes)
self.assertEqual(online, ["node2", "node3"])
self.assertEqual(set(perhost.keys()), set(online))
self.assertEqual(perhost, {
"node2": [],
"node3": [],
if __name__ == "__main__":
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment