Commit 54c31fd3 authored by Michael Hanselmann's avatar Michael Hanselmann
Browse files

LUGroupAssignNodes: Fix node membership corruption



Note: This bug only manifests itself in Ganeti 2.5, but since the
problematic code also exists in 2.4, I decided to fix it there.

If a node was assigned to a new group using “gnt-group assign-nodes” the
node object's group would be changed, but not the duplicate member list
in the group object. The latter is an optimization to require fewer
locks for other operations. The per-group member list is only kept in
memory and not written to disk.

Ganeti 2.5 starts to make use of the data kept in the per-group member
list and consequently fails when it is out of date. The following
commands can be used to reproduce the issue in 2.5 (in 2.4 the issue was
confirmed using additional logging):

  $ gnt-group add foo
  $ gnt-group assign-nodes foo $(gnt-node list --no-header -o name)
  $ gnt-cluster verify  # Fails with KeyError

This patch moves the code modifying node and group objects into
“config.ConfigWriter” to do the complete operation under the config
lock, and also to avoid making use of side-effects of modifying objects
without calling “ConfigWriter.Update”. A unittest is included.
Signed-off-by: default avatarMichael Hanselmann <hansmi@google.com>
Reviewed-by: default avatarIustin Pop <iustin@google.com>
(cherry picked from commit 218f4c3d)
parent 9c4f4dd6
......@@ -11992,13 +11992,9 @@ class LUGroupAssignNodes(NoHooksLU):
"""Assign nodes to a new group.
"""
for node in self.op.nodes:
self.node_data[node].group = self.group_uuid
mods = [(node_name, self.group_uuid) for node_name in self.op.nodes]
# FIXME: Depends on side-effects of modifying the result of
# C{cfg.GetAllNodesInfo}
self.cfg.Update(self.group, feedback_fn) # Saves all modified nodes.
self.cfg.AssignGroupNodes(mods)
@staticmethod
def CheckAssignmentForSplitInstances(changes, node_data, instance_data):
......
......@@ -38,6 +38,7 @@ import os
import random
import logging
import time
import itertools
from ganeti import errors
from ganeti import locking
......@@ -1594,6 +1595,79 @@ class ConfigWriter:
else:
nodegroup_obj.members.remove(node.name)
@locking.ssynchronized(_config_lock)
def AssignGroupNodes(self, mods):
"""Changes the group of a number of nodes.
@type mods: list of tuples; (node name, new group UUID)
@param modes: Node membership modifications
"""
groups = self._config_data.nodegroups
nodes = self._config_data.nodes
resmod = []
# Try to resolve names/UUIDs first
for (node_name, new_group_uuid) in mods:
try:
node = nodes[node_name]
except KeyError:
raise errors.ConfigurationError("Unable to find node '%s'" % node_name)
if node.group == new_group_uuid:
# Node is being assigned to its current group
logging.debug("Node '%s' was assigned to its current group (%s)",
node_name, node.group)
continue
# Try to find current group of node
try:
old_group = groups[node.group]
except KeyError:
raise errors.ConfigurationError("Unable to find old group '%s'" %
node.group)
# Try to find new group for node
try:
new_group = groups[new_group_uuid]
except KeyError:
raise errors.ConfigurationError("Unable to find new group '%s'" %
new_group_uuid)
assert node.name in old_group.members, \
("Inconsistent configuration: node '%s' not listed in members for its"
" old group '%s'" % (node.name, old_group.uuid))
assert node.name not in new_group.members, \
("Inconsistent configuration: node '%s' already listed in members for"
" its new group '%s'" % (node.name, new_group.uuid))
resmod.append((node, old_group, new_group))
# Apply changes
for (node, old_group, new_group) in resmod:
assert node.uuid != new_group.uuid and old_group.uuid != new_group.uuid, \
"Assigning to current group is not possible"
node.group = new_group.uuid
# Update members of involved groups
if node.name in old_group.members:
old_group.members.remove(node.name)
if node.name not in new_group.members:
new_group.members.append(node.name)
# Update timestamps and serials (only once per node/group object)
now = time.time()
for obj in frozenset(itertools.chain(*resmod)): # pylint: disable-msg=W0142
obj.serial_no += 1
obj.mtime = now
# Force ssconf update
self._config_data.cluster.serial_no += 1
self._WriteConfig()
def _BumpSerialNo(self):
"""Bump up the serial number of the config.
......
#!/usr/bin/python
#
# Copyright (C) 2006, 2007, 2010 Google Inc.
# Copyright (C) 2006, 2007, 2010, 2011 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
......@@ -28,6 +28,8 @@ import time
import tempfile
import os.path
import socket
import operator
import itertools
from ganeti import bootstrap
from ganeti import config
......@@ -36,6 +38,7 @@ from ganeti import errors
from ganeti import objects
from ganeti import utils
from ganeti import netutils
from ganeti import compat
from ganeti.config import TemporaryReservationManager
......@@ -239,6 +242,127 @@ class TestConfigRunner(unittest.TestCase):
cfg.AddNodeGroup(group, "my-job", check_uuid=False) # Does not raise.
self.assertEqual(uuid, group.uuid)
def testAssignGroupNodes(self):
me = netutils.Hostname()
cfg = self._get_object()
# Create two groups
grp1 = objects.NodeGroup(name="grp1", members=[],
uuid="2f2fadf7-2a70-4a23-9ab5-2568c252032c")
grp1_serial = 1
cfg.AddNodeGroup(grp1, "job")
grp2 = objects.NodeGroup(name="grp2", members=[],
uuid="798d0de3-680f-4a0e-b29a-0f54f693b3f1")
grp2_serial = 1
cfg.AddNodeGroup(grp2, "job")
self.assertEqual(set(map(operator.attrgetter("name"),
cfg.GetAllNodeGroupsInfo().values())),
set(["grp1", "grp2", constants.INITIAL_NODE_GROUP_NAME]))
# No-op
cluster_serial = cfg.GetClusterInfo().serial_no
cfg.AssignGroupNodes([])
cluster_serial += 1
# Create two nodes
node1 = objects.Node(name="node1", group=grp1.uuid, ndparams={})
node1_serial = 1
node2 = objects.Node(name="node2", group=grp2.uuid, ndparams={})
node2_serial = 1
cfg.AddNode(node1, "job")
cfg.AddNode(node2, "job")
cluster_serial += 2
self.assertEqual(set(cfg.GetNodeList()), set(["node1", "node2", me.name]))
def _VerifySerials():
self.assertEqual(cfg.GetClusterInfo().serial_no, cluster_serial)
self.assertEqual(node1.serial_no, node1_serial)
self.assertEqual(node2.serial_no, node2_serial)
self.assertEqual(grp1.serial_no, grp1_serial)
self.assertEqual(grp2.serial_no, grp2_serial)
_VerifySerials()
self.assertEqual(set(grp1.members), set(["node1"]))
self.assertEqual(set(grp2.members), set(["node2"]))
# Check invalid nodes and groups
self.assertRaises(errors.ConfigurationError, cfg.AssignGroupNodes, [
("unknown.node.example.com", grp2.uuid),
])
self.assertRaises(errors.ConfigurationError, cfg.AssignGroupNodes, [
(node1.name, "unknown-uuid"),
])
self.assertEqual(node1.group, grp1.uuid)
self.assertEqual(node2.group, grp2.uuid)
self.assertEqual(set(grp1.members), set(["node1"]))
self.assertEqual(set(grp2.members), set(["node2"]))
# Another no-op
cfg.AssignGroupNodes([])
cluster_serial += 1
_VerifySerials()
# Assign to the same group (should be a no-op)
self.assertEqual(node2.group, grp2.uuid)
cfg.AssignGroupNodes([
(node2.name, grp2.uuid),
])
cluster_serial += 1
self.assertEqual(node2.group, grp2.uuid)
_VerifySerials()
self.assertEqual(set(grp1.members), set(["node1"]))
self.assertEqual(set(grp2.members), set(["node2"]))
# Assign node 2 to group 1
self.assertEqual(node2.group, grp2.uuid)
cfg.AssignGroupNodes([
(node2.name, grp1.uuid),
])
cluster_serial += 1
node2_serial += 1
grp1_serial += 1
grp2_serial += 1
self.assertEqual(node2.group, grp1.uuid)
_VerifySerials()
self.assertEqual(set(grp1.members), set(["node1", "node2"]))
self.assertFalse(grp2.members)
# And assign both nodes to group 2
self.assertEqual(node1.group, grp1.uuid)
self.assertEqual(node2.group, grp1.uuid)
self.assertNotEqual(grp1.uuid, grp2.uuid)
cfg.AssignGroupNodes([
(node1.name, grp2.uuid),
(node2.name, grp2.uuid),
])
cluster_serial += 1
node1_serial += 1
node2_serial += 1
grp1_serial += 1
grp2_serial += 1
self.assertEqual(node1.group, grp2.uuid)
self.assertEqual(node2.group, grp2.uuid)
_VerifySerials()
self.assertFalse(grp1.members)
self.assertEqual(set(grp2.members), set(["node1", "node2"]))
# Destructive tests
orig_group = node2.group
try:
other_uuid = "68b3d087-6ea5-491c-b81f-0a47d90228c5"
assert compat.all(node.group != other_uuid
for node in cfg.GetAllNodesInfo().values())
node2.group = "68b3d087-6ea5-491c-b81f-0a47d90228c5"
self.assertRaises(errors.ConfigurationError, cfg.AssignGroupNodes, [
("node2", grp2.uuid),
])
_VerifySerials()
finally:
node2.group = orig_group
class TestTRM(unittest.TestCase):
EC_ID = 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment