From 64c7b3831dcf35a515ac7928cdb62bfb78b47291 Mon Sep 17 00:00:00 2001 From: Michael Hanselmann <hansmi@google.com> Date: Fri, 30 Sep 2011 17:48:28 +0200 Subject: [PATCH] LUClusterVerifyGroup: Spread SSH checks over more nodes When verifying a group the code would always check SSH to all nodes in the same group, as well as the first node for every other group. On big clusters this can cause issues since many nodes will try to connect to the first node of another group at the same time. This patch changes the algorithm to choose a different node every time. A unittest for the selection algorithm is included. Signed-off-by: Michael Hanselmann <hansmi@google.com> Reviewed-by: Iustin Pop <iustin@google.com> --- lib/backend.py | 21 +++++++++--- lib/cmdlib.py | 51 +++++++++++++++++++++-------- test/ganeti.cmdlib_unittest.py | 60 ++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 18 deletions(-) diff --git a/lib/backend.py b/lib/backend.py index d02a88841..3f793da86 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -520,12 +520,25 @@ def VerifyNode(what, cluster_name): what[constants.NV_FILELIST]) if constants.NV_NODELIST in what: - result[constants.NV_NODELIST] = tmp = {} - random.shuffle(what[constants.NV_NODELIST]) - for node in what[constants.NV_NODELIST]: + (nodes, bynode) = what[constants.NV_NODELIST] + + # Add nodes from other groups (different for each node) + try: + nodes.extend(bynode[my_name]) + except KeyError: + pass + + # Use a random order + random.shuffle(nodes) + + # Try to contact all nodes + val = {} + for node in nodes: success, message = _GetSshRunner(cluster_name).VerifyNodeHostname(node) if not success: - tmp[node] = message + val[node] = message + + result[constants.NV_NODELIST] = val if constants.NV_NODENETTEST in what: result[constants.NV_NODENETTEST] = tmp = {} diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 2b2ea239f..25113bd6a 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -2542,6 +2542,40 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): return instdisk + @staticmethod + def _SshNodeSelector(group_uuid, all_nodes): + """Create endless iterators for all potential SSH check hosts. + + """ + nodes = [node for node in all_nodes + if (node.group != group_uuid and + not node.offline)] + keyfunc = operator.attrgetter("group") + + return map(itertools.cycle, + [sorted(map(operator.attrgetter("name"), names)) + for _, names in itertools.groupby(sorted(nodes, key=keyfunc), + keyfunc)]) + + @classmethod + def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes): + """Choose which nodes should talk to which other nodes. + + We will make nodes contact all nodes in their group, and one node from + every other group. + + @warning: This algorithm has a known issue if one node group is much + smaller than others (e.g. just one node). In such a case all other + nodes will talk to the single node. + + """ + online_nodes = sorted(node.name for node in group_nodes if not node.offline) + sel = cls._SshNodeSelector(group_uuid, all_nodes) + + return (online_nodes, + dict((name, sorted([i.next() for i in sel])) + for name in online_nodes)) + def BuildHooksEnv(self): """Build hooks env. @@ -2605,25 +2639,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names)) - # We will make nodes contact all nodes in their group, and one node from - # every other group. - # TODO: should it be a *random* node, different every time? - online_nodes = [node.name for node in node_data_list if not node.offline] - other_group_nodes = {} - - for name in sorted(self.all_node_info): - node = self.all_node_info[name] - if (node.group not in other_group_nodes - and node.group != self.group_uuid - and not node.offline): - other_group_nodes[node.group] = node.name - node_verify_param = { constants.NV_FILELIST: utils.UniqueSequence(filename for files in filemap for filename in files), - constants.NV_NODELIST: online_nodes + other_group_nodes.values(), + constants.NV_NODELIST: + self._SelectSshCheckNodes(node_data_list, self.group_uuid, + self.all_node_info.values()), constants.NV_HYPERVISOR: hypervisors, constants.NV_HVPARAMS: _GetAllHypervisorParameters(cluster, self.all_inst_info.values()), diff --git a/test/ganeti.cmdlib_unittest.py b/test/ganeti.cmdlib_unittest.py index b44c5476a..feb69b508 100755 --- a/test/ganeti.cmdlib_unittest.py +++ b/test/ganeti.cmdlib_unittest.py @@ -27,6 +27,7 @@ import unittest import time import tempfile import shutil +import operator from ganeti import constants from ganeti import mcpu @@ -207,5 +208,64 @@ class TestLUGroupAssignNodes(unittest.TestCase): self.assertEqual(set(["inst3c"]), set(prev)) +class TestClusterVerifySsh(unittest.TestCase): + def testMultipleGroups(self): + fn = cmdlib.LUClusterVerifyGroup._SelectSshCheckNodes + mygroupnodes = [ + objects.Node(name="node20", group="my", offline=False), + objects.Node(name="node21", group="my", offline=False), + objects.Node(name="node22", group="my", offline=False), + objects.Node(name="node23", group="my", offline=False), + objects.Node(name="node24", group="my", offline=False), + objects.Node(name="node25", group="my", offline=False), + objects.Node(name="node26", group="my", offline=True), + ] + nodes = [ + objects.Node(name="node1", group="g1", offline=True), + objects.Node(name="node2", group="g1", offline=False), + objects.Node(name="node3", group="g1", offline=False), + objects.Node(name="node4", group="g1", offline=True), + objects.Node(name="node5", group="g1", offline=False), + objects.Node(name="node10", group="xyz", offline=False), + objects.Node(name="node11", group="xyz", offline=False), + objects.Node(name="node40", group="alloff", offline=True), + objects.Node(name="node41", group="alloff", offline=True), + objects.Node(name="node50", group="aaa", offline=False), + ] + mygroupnodes + assert not utils.FindDuplicates(map(operator.attrgetter("name"), nodes)) + + (online, perhost) = fn(mygroupnodes, "my", nodes) + self.assertEqual(online, ["node%s" % i for i in range(20, 26)]) + self.assertEqual(set(perhost.keys()), set(online)) + + self.assertEqual(perhost, { + "node20": ["node10", "node2", "node50"], + "node21": ["node11", "node3", "node50"], + "node22": ["node10", "node5", "node50"], + "node23": ["node11", "node2", "node50"], + "node24": ["node10", "node3", "node50"], + "node25": ["node11", "node5", "node50"], + }) + + def testSingleGroup(self): + fn = cmdlib.LUClusterVerifyGroup._SelectSshCheckNodes + nodes = [ + objects.Node(name="node1", group="default", offline=True), + objects.Node(name="node2", group="default", offline=False), + objects.Node(name="node3", group="default", offline=False), + objects.Node(name="node4", group="default", offline=True), + ] + assert not utils.FindDuplicates(map(operator.attrgetter("name"), nodes)) + + (online, perhost) = fn(nodes, "default", nodes) + self.assertEqual(online, ["node2", "node3"]) + self.assertEqual(set(perhost.keys()), set(online)) + + self.assertEqual(perhost, { + "node2": [], + "node3": [], + }) + + if __name__ == "__main__": testutils.GanetiTestProgram() -- GitLab