From 218f4c3de706aca7e4521d7e1975f517cf5ecb9b Mon Sep 17 00:00:00 2001 From: Michael Hanselmann <hansmi@google.com> Date: Thu, 24 Nov 2011 08:43:04 +0100 Subject: [PATCH] LUGroupAssignNodes: Fix node membership corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: This bug only manifests itself in Ganeti 2.5, but since the problematic code also exists in 2.4, I decided to fix it there. If a node was assigned to a new group using βgnt-group assign-nodesβ the node object's group would be changed, but not the duplicate member list in the group object. The latter is an optimization to require fewer locks for other operations. The per-group member list is only kept in memory and not written to disk. Ganeti 2.5 starts to make use of the data kept in the per-group member list and consequently fails when it is out of date. The following commands can be used to reproduce the issue in 2.5 (in 2.4 the issue was confirmed using additional logging): $ gnt-group add foo $ gnt-group assign-nodes foo $(gnt-node list --no-header -o name) $ gnt-cluster verify # Fails with KeyError This patch moves the code modifying node and group objects into βconfig.ConfigWriterβ to do the complete operation under the config lock, and also to avoid making use of side-effects of modifying objects without calling βConfigWriter.Updateβ. A unittest is included. Signed-off-by: Michael Hanselmann <hansmi@google.com> Reviewed-by: Iustin Pop <iustin@google.com> --- lib/cmdlib.py | 8 +-- lib/config.py | 74 +++++++++++++++++++ test/ganeti.config_unittest.py | 126 ++++++++++++++++++++++++++++++++- 3 files changed, 201 insertions(+), 7 deletions(-) diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 4a10432c1..b287f331d 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -10395,13 +10395,9 @@ class LUGroupAssignNodes(NoHooksLU): """Assign nodes to a new group. """ - for node in self.op.nodes: - self.node_data[node].group = self.group_uuid - - # FIXME: Depends on side-effects of modifying the result of - # C{cfg.GetAllNodesInfo} + mods = [(node_name, self.group_uuid) for node_name in self.op.nodes] - self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes. + self.cfg.AssignGroupNodes(mods) @staticmethod def CheckAssignmentForSplitInstances(changes, node_data, instance_data): diff --git a/lib/config.py b/lib/config.py index a7d7b77ed..e77bb3407 100644 --- a/lib/config.py +++ b/lib/config.py @@ -38,6 +38,7 @@ import os import random import logging import time +import itertools from ganeti import errors from ganeti import locking @@ -1517,6 +1518,79 @@ class ConfigWriter: else: nodegroup_obj.members.remove(node.name) + @locking.ssynchronized(_config_lock) + def AssignGroupNodes(self, mods): + """Changes the group of a number of nodes. + + @type mods: list of tuples; (node name, new group UUID) + @param modes: Node membership modifications + + """ + groups = self._config_data.nodegroups + nodes = self._config_data.nodes + + resmod = [] + + # Try to resolve names/UUIDs first + for (node_name, new_group_uuid) in mods: + try: + node = nodes[node_name] + except KeyError: + raise errors.ConfigurationError("Unable to find node '%s'" % node_name) + + if node.group == new_group_uuid: + # Node is being assigned to its current group + logging.debug("Node '%s' was assigned to its current group (%s)", + node_name, node.group) + continue + + # Try to find current group of node + try: + old_group = groups[node.group] + except KeyError: + raise errors.ConfigurationError("Unable to find old group '%s'" % + node.group) + + # Try to find new group for node + try: + new_group = groups[new_group_uuid] + except KeyError: + raise errors.ConfigurationError("Unable to find new group '%s'" % + new_group_uuid) + + assert node.name in old_group.members, \ + ("Inconsistent configuration: node '%s' not listed in members for its" + " old group '%s'" % (node.name, old_group.uuid)) + assert node.name not in new_group.members, \ + ("Inconsistent configuration: node '%s' already listed in members for" + " its new group '%s'" % (node.name, new_group.uuid)) + + resmod.append((node, old_group, new_group)) + + # Apply changes + for (node, old_group, new_group) in resmod: + assert node.uuid != new_group.uuid and old_group.uuid != new_group.uuid, \ + "Assigning to current group is not possible" + + node.group = new_group.uuid + + # Update members of involved groups + if node.name in old_group.members: + old_group.members.remove(node.name) + if node.name not in new_group.members: + new_group.members.append(node.name) + + # Update timestamps and serials (only once per node/group object) + now = time.time() + for obj in frozenset(itertools.chain(*resmod)): # pylint: disable-msg=W0142 + obj.serial_no += 1 + obj.mtime = now + + # Force ssconf update + self._config_data.cluster.serial_no += 1 + + self._WriteConfig() + def _BumpSerialNo(self): """Bump up the serial number of the config. diff --git a/test/ganeti.config_unittest.py b/test/ganeti.config_unittest.py index a21bfa6da..e82872c6b 100755 --- a/test/ganeti.config_unittest.py +++ b/test/ganeti.config_unittest.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -# Copyright (C) 2006, 2007, 2010 Google Inc. +# Copyright (C) 2006, 2007, 2010, 2011 Google Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -28,6 +28,8 @@ import time import tempfile import os.path import socket +import operator +import itertools from ganeti import bootstrap from ganeti import config @@ -36,6 +38,7 @@ from ganeti import errors from ganeti import objects from ganeti import utils from ganeti import netutils +from ganeti import compat from ganeti.config import TemporaryReservationManager @@ -239,6 +242,127 @@ class TestConfigRunner(unittest.TestCase): cfg.AddNodeGroup(group, "my-job", check_uuid=False) # Does not raise. self.assertEqual(uuid, group.uuid) + def testAssignGroupNodes(self): + me = netutils.Hostname() + cfg = self._get_object() + + # Create two groups + grp1 = objects.NodeGroup(name="grp1", members=[], + uuid="2f2fadf7-2a70-4a23-9ab5-2568c252032c") + grp1_serial = 1 + cfg.AddNodeGroup(grp1, "job") + + grp2 = objects.NodeGroup(name="grp2", members=[], + uuid="798d0de3-680f-4a0e-b29a-0f54f693b3f1") + grp2_serial = 1 + cfg.AddNodeGroup(grp2, "job") + self.assertEqual(set(map(operator.attrgetter("name"), + cfg.GetAllNodeGroupsInfo().values())), + set(["grp1", "grp2", constants.INITIAL_NODE_GROUP_NAME])) + + # No-op + cluster_serial = cfg.GetClusterInfo().serial_no + cfg.AssignGroupNodes([]) + cluster_serial += 1 + + # Create two nodes + node1 = objects.Node(name="node1", group=grp1.uuid, ndparams={}) + node1_serial = 1 + node2 = objects.Node(name="node2", group=grp2.uuid, ndparams={}) + node2_serial = 1 + cfg.AddNode(node1, "job") + cfg.AddNode(node2, "job") + cluster_serial += 2 + self.assertEqual(set(cfg.GetNodeList()), set(["node1", "node2", me.name])) + + def _VerifySerials(): + self.assertEqual(cfg.GetClusterInfo().serial_no, cluster_serial) + self.assertEqual(node1.serial_no, node1_serial) + self.assertEqual(node2.serial_no, node2_serial) + self.assertEqual(grp1.serial_no, grp1_serial) + self.assertEqual(grp2.serial_no, grp2_serial) + + _VerifySerials() + + self.assertEqual(set(grp1.members), set(["node1"])) + self.assertEqual(set(grp2.members), set(["node2"])) + + # Check invalid nodes and groups + self.assertRaises(errors.ConfigurationError, cfg.AssignGroupNodes, [ + ("unknown.node.example.com", grp2.uuid), + ]) + self.assertRaises(errors.ConfigurationError, cfg.AssignGroupNodes, [ + (node1.name, "unknown-uuid"), + ]) + + self.assertEqual(node1.group, grp1.uuid) + self.assertEqual(node2.group, grp2.uuid) + self.assertEqual(set(grp1.members), set(["node1"])) + self.assertEqual(set(grp2.members), set(["node2"])) + + # Another no-op + cfg.AssignGroupNodes([]) + cluster_serial += 1 + _VerifySerials() + + # Assign to the same group (should be a no-op) + self.assertEqual(node2.group, grp2.uuid) + cfg.AssignGroupNodes([ + (node2.name, grp2.uuid), + ]) + cluster_serial += 1 + self.assertEqual(node2.group, grp2.uuid) + _VerifySerials() + self.assertEqual(set(grp1.members), set(["node1"])) + self.assertEqual(set(grp2.members), set(["node2"])) + + # Assign node 2 to group 1 + self.assertEqual(node2.group, grp2.uuid) + cfg.AssignGroupNodes([ + (node2.name, grp1.uuid), + ]) + cluster_serial += 1 + node2_serial += 1 + grp1_serial += 1 + grp2_serial += 1 + self.assertEqual(node2.group, grp1.uuid) + _VerifySerials() + self.assertEqual(set(grp1.members), set(["node1", "node2"])) + self.assertFalse(grp2.members) + + # And assign both nodes to group 2 + self.assertEqual(node1.group, grp1.uuid) + self.assertEqual(node2.group, grp1.uuid) + self.assertNotEqual(grp1.uuid, grp2.uuid) + cfg.AssignGroupNodes([ + (node1.name, grp2.uuid), + (node2.name, grp2.uuid), + ]) + cluster_serial += 1 + node1_serial += 1 + node2_serial += 1 + grp1_serial += 1 + grp2_serial += 1 + self.assertEqual(node1.group, grp2.uuid) + self.assertEqual(node2.group, grp2.uuid) + _VerifySerials() + self.assertFalse(grp1.members) + self.assertEqual(set(grp2.members), set(["node1", "node2"])) + + # Destructive tests + orig_group = node2.group + try: + other_uuid = "68b3d087-6ea5-491c-b81f-0a47d90228c5" + assert compat.all(node.group != other_uuid + for node in cfg.GetAllNodesInfo().values()) + node2.group = "68b3d087-6ea5-491c-b81f-0a47d90228c5" + self.assertRaises(errors.ConfigurationError, cfg.AssignGroupNodes, [ + ("node2", grp2.uuid), + ]) + _VerifySerials() + finally: + node2.group = orig_group + class TestTRM(unittest.TestCase): EC_ID = 1 -- GitLab