Commit 1c3231aa authored by Thomas Thrainer's avatar Thomas Thrainer
Browse files

Index nodes by their UUID

No longer index nodes by their name but by their UUID in the cluster
config. This change changes large parts of the code, as the following
adjustments were necessary:
 * Change the index key to UUID in the configuration and the
   ConfigWriter, including all methods.
 * Change all cross-references to nodes to use UUIDs.
 * External interfaces (command line interface, IAllocator interface,
   hook scripts, etc.) are kept stable.
 * RPC calls can resolve UUIDs as target node arguments, if the RPC
   runner is based on a ConfigWriter instance. The result dictionary is
   presented in the form the nodes are addressed: by UUID if UUIDs were
   given, or by name if names were given.
 * Node UUIDs are resolved in ExpandNames and then stored in the
   OpCode. This makes it possible to check for node renames if the
   OpCode is reloaded after a cluster restart. This check is currently
   only done for single node parameters.
 * Variable names are renamed to follow the following pattern:
   - Suffix is 'node' or 'nodes': Variable holds Node objects
   - Suffix is 'name' or 'names': Variable holds node names
   - Suffix is 'uuid' or 'uuids': Variable holds node UUIDs
 * Tests are adapted.
Signed-off-by: default avatarThomas Thrainer <>
Reviewed-by: default avatarKlaus Aehlig <>
parent bc0a2284
......@@ -596,6 +596,7 @@ HS_LIB_SRCS = \
src/Ganeti/OpCodes.hs \
src/Ganeti/OpParams.hs \
src/Ganeti/Path.hs \
src/Ganeti/Query/Cluster.hs \
src/Ganeti/Query/Common.hs \
src/Ganeti/Query/Export.hs \
src/Ganeti/Query/Filter.hs \
......@@ -3777,14 +3777,13 @@ def CleanupImportExport(name):
shutil.rmtree(status_dir, ignore_errors=True)
def _FindDisks(nodes_ip, disks):
def _FindDisks(target_node_uuid, nodes_ip, disks):
"""Sets the physical ID on disks and returns the block devices.
# set the correct physical ID
my_name = netutils.Hostname.GetSysName()
for cf in disks:
cf.SetPhysicalID(my_name, nodes_ip)
cf.SetPhysicalID(target_node_uuid, nodes_ip)
bdevs = []
......@@ -3796,11 +3795,11 @@ def _FindDisks(nodes_ip, disks):
return bdevs
def DrbdDisconnectNet(nodes_ip, disks):
def DrbdDisconnectNet(target_node_uuid, nodes_ip, disks):
"""Disconnects the network on a list of drbd devices.
bdevs = _FindDisks(nodes_ip, disks)
bdevs = _FindDisks(target_node_uuid, nodes_ip, disks)
# disconnect disks
for rd in bdevs:
......@@ -3811,11 +3810,12 @@ def DrbdDisconnectNet(nodes_ip, disks):
err, exc=True)
def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster):
def DrbdAttachNet(target_node_uuid, nodes_ip, disks, instance_name,
"""Attaches the network on a list of drbd devices.
bdevs = _FindDisks(nodes_ip, disks)
bdevs = _FindDisks(target_node_uuid, nodes_ip, disks)
if multimaster:
for idx, rd in enumerate(bdevs):
......@@ -3873,7 +3873,7 @@ def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster):
_Fail("Can't change to primary mode: %s", err)
def DrbdWaitSync(nodes_ip, disks):
def DrbdWaitSync(target_node_uuid, nodes_ip, disks):
"""Wait until DRBDs have synchronized.
......@@ -3883,7 +3883,7 @@ def DrbdWaitSync(nodes_ip, disks):
raise utils.RetryAgain()
return stats
bdevs = _FindDisks(nodes_ip, disks)
bdevs = _FindDisks(target_node_uuid, nodes_ip, disks)
min_resync = 100
alldone = True
......@@ -609,7 +609,6 @@ def InitCluster(cluster_name, mac_prefix, # pylint: disable=R0913, R0914
......@@ -688,13 +687,14 @@ def InitConfig(version, cluster_config, master_node_config,
master_node_config.uuid = uuid_generator.Generate([], utils.NewUUID,
cluster_config.master_node = master_node_config.uuid
nodes = { master_node_config,
master_node_config.uuid: master_node_config,
default_nodegroup = objects.NodeGroup(
uuid=uuid_generator.Generate([], utils.NewUUID, _INITCONF_ECID),
nodegroups = {
......@@ -714,7 +714,7 @@ def InitConfig(version, cluster_config, master_node_config,
def FinalizeClusterDestroy(master):
def FinalizeClusterDestroy(master_uuid):
"""Execute the last steps of cluster destroy
This function shuts down all the daemons, completing the destroy
......@@ -725,22 +725,24 @@ def FinalizeClusterDestroy(master):
modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
runner = rpc.BootstrapRunner()
master_name = cfg.GetNodeName(master_uuid)
master_params = cfg.GetMasterNetworkParameters() = master
master_params.uuid = master_uuid
ems = cfg.GetUseExternalMipScript()
result = runner.call_node_deactivate_master_ip(,
master_params, ems)
result = runner.call_node_deactivate_master_ip(master_name, master_params,
msg = result.fail_msg
if msg:
logging.warning("Could not disable the master IP: %s", msg)
result = runner.call_node_stop_master(master)
result = runner.call_node_stop_master(master_name)
msg = result.fail_msg
if msg:
logging.warning("Could not disable the master role: %s", msg)
result = runner.call_node_leave_cluster(master, modify_ssh_setup)
result = runner.call_node_leave_cluster(master_name, modify_ssh_setup)
msg = result.fail_msg
if msg:
logging.warning("Could not shutdown the node daemon and cleanup"
......@@ -788,7 +790,7 @@ def MasterFailover(no_voting=False):
sstore = ssconf.SimpleStore()
old_master, new_master = ssconf.GetMasterAndMyself(sstore)
node_list = sstore.GetNodeList()
node_names = sstore.GetNodeList()
mc_list = sstore.GetMasterCandidates()
if old_master == new_master:
......@@ -807,7 +809,7 @@ def MasterFailover(no_voting=False):
if not no_voting:
vote_list = GatherMasterVotes(node_list)
vote_list = GatherMasterVotes(node_names)
if vote_list:
voted_master = vote_list[0][0]
......@@ -832,8 +834,20 @@ def MasterFailover(no_voting=False):
# configuration data
cfg = config.ConfigWriter(accept_foreign=True)
old_master_node = cfg.GetNodeInfoByName(old_master)
if old_master_node is None:
raise errors.OpPrereqError("Could not find old master node '%s' in"
" cluster configuration." % old_master,
cluster_info = cfg.GetClusterInfo()
cluster_info.master_node = new_master
new_master_node = cfg.GetNodeInfoByName(new_master)
if new_master_node is None:
raise errors.OpPrereqError("Could not find new master node '%s' in"
" cluster configuration." % new_master,
cluster_info.master_node = new_master_node.uuid
# this will also regenerate the ssconf files, since we updated the
# cluster info
cfg.Update(cluster_info, logging.error)
......@@ -851,9 +865,9 @@ def MasterFailover(no_voting=False):
runner = rpc.BootstrapRunner()
master_params = cfg.GetMasterNetworkParameters() = old_master
master_params.uuid = old_master_node.uuid
ems = cfg.GetUseExternalMipScript()
result = runner.call_node_deactivate_master_ip(,
result = runner.call_node_deactivate_master_ip(old_master,
master_params, ems)
msg = result.fail_msg
......@@ -917,7 +931,7 @@ def GetMaster():
return old_master
def GatherMasterVotes(node_list):
def GatherMasterVotes(node_names):
"""Check the agreement on who is the master.
This function will return a list of (node, number of votes), ordered
......@@ -931,8 +945,8 @@ def GatherMasterVotes(node_list):
since we use the same source for configuration information for both
backend and bootstrap, we'll always vote for ourselves.
@type node_list: list
@param node_list: the list of nodes to query for master info; the current
@type node_names: list
@param node_names: the list of nodes to query for master info; the current
node will be removed if it is in the list
@rtype: list
@return: list of (node, votes)
......@@ -940,30 +954,31 @@ def GatherMasterVotes(node_list):
myself = netutils.Hostname.GetSysName()
except ValueError:
if not node_list:
if not node_names:
# no nodes left (eventually after removing myself)
return []
results = rpc.BootstrapRunner().call_master_info(node_list)
results = rpc.BootstrapRunner().call_master_info(node_names)
if not isinstance(results, dict):
# this should not happen (unless internal error in rpc)
logging.critical("Can't complete rpc call, aborting master startup")
return [(None, len(node_list))]
return [(None, len(node_names))]
votes = {}
for node in results:
nres = results[node]
for node_name in results:
nres = results[node_name]
data = nres.payload
msg = nres.fail_msg
fail = False
if msg:
logging.warning("Error contacting node %s: %s", node, msg)
logging.warning("Error contacting node %s: %s", node_name, msg)
fail = True
# for now we accept both length 3, 4 and 5 (data[3] is primary ip version
# and data[4] is the master netmask)
elif not isinstance(data, (tuple, list)) or len(data) < 3:
logging.warning("Invalid data received from node %s: %s", node, data)
logging.warning("Invalid data received from node %s: %s",
node_name, data)
fail = True
if fail:
if None not in votes:
......@@ -275,10 +275,10 @@ def DestroyCluster(opts, args):
return 1
op = opcodes.OpClusterDestroy()
master = SubmitOpCode(op, opts=opts)
master_uuid = SubmitOpCode(op, opts=opts)
# if we reached this, the opcode didn't fail; we can proceed to
# shutdown all the daemons
return 0
......@@ -915,25 +915,29 @@ def _DoConsole(console, show_command, cluster_name, feedback_fn=ToStdout,
return constants.EXIT_SUCCESS
def _FormatLogicalID(dev_type, logical_id, roman):
def _FormatDiskDetails(dev_type, dev, roman):
"""Formats the logical_id of a disk.
if dev_type == constants.LD_DRBD8:
node_a, node_b, port, minor_a, minor_b, key = logical_id
drbd_info = dev["drbd_info"]
data = [
("nodeA", "%s, minor=%s" % (node_a, compat.TryToRoman(minor_a,
("nodeB", "%s, minor=%s" % (node_b, compat.TryToRoman(minor_b,
("port", str(compat.TryToRoman(port, convert=roman))),
("auth key", str(key)),
("nodeA", "%s, minor=%s" %
("nodeB", "%s, minor=%s" %
("port", str(compat.TryToRoman(drbd_info["port"], convert=roman))),
("auth key", str(drbd_info["secret"])),
elif dev_type == constants.LD_LV:
vg_name, lv_name = logical_id
vg_name, lv_name = dev["logical_id"]
data = ["%s/%s" % (vg_name, lv_name)]
data = [str(logical_id)]
data = [str(dev["logical_id"])]
return data
......@@ -1032,7 +1036,7 @@ def _FormatBlockDevInfo(idx, top_level, dev, roman):
data.append(("access mode", dev["mode"]))
if dev["logical_id"] is not None:
l_id = _FormatLogicalID(dev["dev_type"], dev["logical_id"], roman)
l_id = _FormatDiskDetails(dev["dev_type"], dev, roman)
except ValueError:
l_id = [str(dev["logical_id"])]
if len(l_id) == 1:
......@@ -35,7 +35,7 @@ from ganeti import utils
from ganeti.cmdlib.base import QueryBase, NoHooksLU, LogicalUnit
from ganeti.cmdlib.common import GetWantedNodes, ShareAll, CheckNodeOnline, \
from ganeti.cmdlib.instance_storage import StartInstanceDisks, \
from ganeti.cmdlib.instance_utils import GetClusterDomainSecret, \
......@@ -53,7 +53,7 @@ class ExportQuery(QueryBase):
# The following variables interact with _QueryBase._GetNames
if self.names:
self.wanted = GetWantedNodes(lu, self.names)
(self.wanted, _) = GetWantedNodes(lu, self.names)
self.wanted = locking.ALL_SET
......@@ -82,15 +82,15 @@ class ExportQuery(QueryBase):
if level != locking.LEVEL_CLUSTER) or
self.do_locking or self.use_locking)
nodes = self._GetNames(lu, lu.cfg.GetNodeList(), locking.LEVEL_NODE)
node_uuids = self._GetNames(lu, lu.cfg.GetNodeList(), locking.LEVEL_NODE)
result = []
for (node, nres) in lu.rpc.call_export_list(nodes).items():
for (node_uuid, nres) in lu.rpc.call_export_list(node_uuids).items():
if nres.fail_msg:
result.append((node, None))
result.append((node_uuid, None))
result.extend((node, expname) for expname in nres.payload)
result.extend((node_uuid, expname) for expname in nres.payload)
return result
......@@ -154,10 +154,12 @@ class LUBackupPrepare(NoHooksLU):
if self.op.mode == constants.EXPORT_MODE_REMOTE:
salt = utils.GenerateSecret(8)
feedback_fn("Generating X509 certificate on %s" % instance.primary_node)
feedback_fn("Generating X509 certificate on %s" %
result = self.rpc.call_x509_cert_create(instance.primary_node,
result.Raise("Can't create X509 key and certificate on %s" % result.node)
result.Raise("Can't create X509 key and certificate on %s" %
(name, cert_pem) = result.payload
......@@ -203,6 +205,9 @@ class LUBackupExport(LogicalUnit):
# Lock all nodes for local exports
if self.op.mode == constants.EXPORT_MODE_LOCAL:
(self.op.target_node_uuid, self.op.target_node) = \
ExpandNodeUuidAndName(self.cfg, self.op.target_node_uuid,
# FIXME: lock only instance primary and destination node
# Sad but true, for now we have do lock all nodes, as we don't know where
......@@ -248,7 +253,7 @@ class LUBackupExport(LogicalUnit):
nl = [self.cfg.GetMasterNode(), self.instance.primary_node]
if self.op.mode == constants.EXPORT_MODE_LOCAL:
return (nl, nl)
......@@ -272,12 +277,11 @@ class LUBackupExport(LogicalUnit):
" down before", errors.ECODE_STATE)
if self.op.mode == constants.EXPORT_MODE_LOCAL:
self.op.target_node = ExpandNodeName(self.cfg, self.op.target_node)
self.dst_node = self.cfg.GetNodeInfo(self.op.target_node)
self.dst_node = self.cfg.GetNodeInfo(self.op.target_node_uuid)
assert self.dst_node is not None
CheckNodeOnline(self, self.dst_node.uuid)
CheckNodeNotDrained(self, self.dst_node.uuid)
self._cds = None
self.dest_disk_info = None
......@@ -355,24 +359,25 @@ class LUBackupExport(LogicalUnit):
assert self.op.mode != constants.EXPORT_MODE_REMOTE
nodelist = self.cfg.GetNodeList()
node_uuids = self.cfg.GetNodeList()
# on one-node clusters nodelist will be empty after the removal
# if we proceed the backup would be removed because OpBackupQuery
# substitutes an empty list with the full cluster node list.
iname =
if nodelist:
if node_uuids:
feedback_fn("Removing old exports for instance %s" % iname)
exportlist = self.rpc.call_export_list(nodelist)
for node in exportlist:
if exportlist[node].fail_msg:
exportlist = self.rpc.call_export_list(node_uuids)
for node_uuid in exportlist:
if exportlist[node_uuid].fail_msg:
if iname in exportlist[node].payload:
msg = self.rpc.call_export_remove(node, iname).fail_msg
if iname in exportlist[node_uuid].payload:
msg = self.rpc.call_export_remove(node_uuid, iname).fail_msg
if msg:
self.LogWarning("Could not remove older export for instance %s"
" on node %s: %s", iname, node, msg)
" on node %s: %s", iname,
self.cfg.GetNodeName(node_uuid), msg)
def Exec(self, feedback_fn):
"""Export an instance to an image in the cluster.
......@@ -381,22 +386,23 @@ class LUBackupExport(LogicalUnit):
assert self.op.mode in constants.EXPORT_MODES
instance = self.instance
src_node = instance.primary_node
src_node_uuid = instance.primary_node
if self.op.shutdown:
# shutdown the instance, but not the disks
feedback_fn("Shutting down instance %s" %
result = self.rpc.call_instance_shutdown(src_node, instance,
result = self.rpc.call_instance_shutdown(src_node_uuid, instance,
# TODO: Maybe ignore failures if ignore_remove_failures is set
result.Raise("Could not shutdown instance %s on"
" node %s" % (, src_node))
" node %s" % (,
# set the disks ID correctly since call_instance_start needs the
# correct drbd minor to create the symlinks
for disk in instance.disks:
self.cfg.SetDiskID(disk, src_node)
self.cfg.SetDiskID(disk, src_node_uuid)
activate_disks = not instance.disks_active
......@@ -416,7 +422,7 @@ class LUBackupExport(LogicalUnit):
not self.op.remove_instance):
assert not activate_disks
feedback_fn("Starting instance %s" %
result = self.rpc.call_instance_start(src_node,
result = self.rpc.call_instance_start(src_node_uuid,
(instance, None, None), False,
msg = result.fail_msg
......@@ -515,18 +521,20 @@ class LUBackupRemove(NoHooksLU):
locked_nodes = self.owned_locks(locking.LEVEL_NODE)
exportlist = self.rpc.call_export_list(locked_nodes)
found = False
for node in exportlist:
msg = exportlist[node].fail_msg
for node_uuid in exportlist:
msg = exportlist[node_uuid].fail_msg
if msg:
self.LogWarning("Failed to query node %s (continuing): %s", node, msg)
self.LogWarning("Failed to query node %s (continuing): %s",
self.cfg.GetNodeName(node_uuid), msg)
if instance_name in exportlist[node].payload:
if instance_name in exportlist[node_uuid].payload:
found = True
result = self.rpc.call_export_remove(node, instance_name)
result = self.rpc.call_export_remove(node_uuid, instance_name)
msg = result.fail_msg
if msg:
logging.error("Could not remove export for instance %s"
" on node %s: %s", instance_name, node, msg)
" on node %s: %s", instance_name,
self.cfg.GetNodeName(node_uuid), msg)
if fqdn_warn and not found:
feedback_fn("Export not found. If trying to remove an export belonging"
......@@ -181,7 +181,7 @@ class LogicalUnit(object):
# Acquire just two nodes
self.needed_locks = {
locking.LEVEL_NODE: ['', ''],
locking.LEVEL_NODE: ['node1-uuid', 'node2-uuid'],
# Acquire no locks
self.needed_locks = {} # No, you can't leave it to the default value None
......@@ -269,11 +269,14 @@ class LogicalUnit(object):
def BuildHooksNodes(self):
"""Build list of nodes to run LU's hooks.
@rtype: tuple; (list, list)
@return: Tuple containing a list of node names on which the hook
should run before the execution and a list of node names on which the
hook should run after the execution. No nodes should be returned as an
empty list (and not None).
@rtype: tuple; (list, list) or (list, list, list)
@return: Tuple containing a list of node UUIDs on which the hook
should run before the execution and a list of node UUIDs on which the
hook should run after the execution. As it might be possible that the
node UUID is not known at the time this method is invoked, an optional
third list can be added which contains node names on which the hook
should run after the execution (in case of node add, for instance).
No nodes should be returned as an empty list (and not None).
@note: If the C{HPATH} attribute of the LU class is C{None}, this function
will not be called.
......@@ -356,17 +359,17 @@ class LogicalUnit(object):
# For now we'll replace self.needed_locks[locking.LEVEL_NODE], but in the
# future we might want to have different behaviors depending on the value
# of self.recalculate_locks[locking.LEVEL_NODE]
wanted_nodes = []
wanted_node_uuids = []
locked_i = self.owned_locks(locking.LEVEL_INSTANCE)
for _, instance in self.cfg.GetMultiInstanceInfo(locked_i):
if not primary_only:
if self.recalculate_locks[level] == constants.LOCKS_REPLACE:
self.needed_locks[level] = wanted_nodes
self.needed_locks[level] = wanted_node_uuids
elif self.recalculate_locks[level] == constants.LOCKS_APPEND:
raise errors.ProgrammerError("Unknown recalculation mode")
This diff is collapsed.
......@@ -48,17 +48,17 @@ CAN_CHANGE_INSTANCE_OFFLINE = (frozenset(INSTANCE_DOWN) | frozenset([
def _ExpandItemName(fn, name, kind):
def _ExpandItemName(expand_fn, name, kind):
"""Expand an item name.
@param fn: the function to use for expansion
@param expand_fn: the function to use for expansion
@param name: requested item name
@param kind: text description ('Node' or 'Instance')
@return: the resolved (full) name
@return: the result of the expand_fn, if successful
@raise errors.OpPrereqError: if the item is not found
full_name = fn(name)
full_name = expand_fn(name)
if full_name is None:
raise errors.OpPrereqError("%s '%s' not known" % (kind, name),
......@@ -70,9 +70,26 @@ def ExpandInstanceName(cfg, name):
return _ExpandItemName(cfg.ExpandInstanceName, name, "Instance")
def ExpandNodeName(cfg, name):
"""Wrapper over L{_ExpandItemName} for nodes."""
return _ExpandItemName(cfg.ExpandNodeName, name, "Node")
def ExpandNodeUuidAndName(cfg, expected_uuid, name):
"""Expand a short node name into the node UUID and full name.
@type cfg: L{config.ConfigWriter}
@param cfg: The cluster configuration
@type expected_uuid: string
@param expected_uuid: expected UUID for the node (or None if there is no
expectation). If it does not match, a L{errors.OpPrereqError} is
@type name: string
@param name: the short node name
(uuid, full_name) = _ExpandItemName(cfg.ExpandNodeName, name, "Node")
if expected_uuid is not None and uuid != expected_uuid:
raise errors.OpPrereqError(
"The nodes UUID '%s' does not match the expected UUID '%s' for node"
" '%s'. Maybe the node changed since you submitted this job." %
(uuid, expected_uuid, full_name), errors.ECODE_NOTUNIQUE)
return (uuid, full_name)
def ShareAll():
......@@ -106,22 +123,25 @@ def CheckNodeGroupInstances(cfg, group_uuid, owned_instances):
return wanted_instances
def GetWantedNodes(lu, nodes):
def GetWantedNodes(lu, short_node_names):
"""Returns list of checked and expanded node names.
@type lu: L{LogicalUnit}
@param lu: the logical unit on whose behalf we execute
@type nodes: list
@param nodes: list of node names or None for all nodes
@rtype: list
@return: the list of nodes, sorted
@type short_node_names: list
@param short_node_names: list of node names or None for all nodes
@rtype: tuple of lists
@return: tuple with (list of node UUIDs, list of node names)
@raise errors.ProgrammerError: if the nodes parameter is wrong type
if nodes:
return [ExpandNodeName(lu.cfg, name) for name in nodes]