Commit ea9c753d authored by Andrea Spadaccini
Browse files

Merge branch 'devel-2.5'

* devel-2.5:
  cluster-merge: log an info message at node readd
  Bump version to 2.5.0~rc1
  Fix issue when verifying cluster files
  Revert "utils.log: Write error messages to stderr"
  Fix adding nodes after commit 64c7b383


  LUClusterVerifyGroup: Spread SSH checks over more nodes
  Optimise cli.JobExecutor with many pending jobs
Signed-off-by: Andrea Spadaccini <spadaccio@google.com>
Reviewed-by: René Nussbaumer <rn@google.com>
parents 9822b1dd a080bab8
News
====
Version 2.5.0 beta3
-------------------
Version 2.5.0 rc1
-----------------
*(Released Wed, 31 Aug 2011)*
*(Released Tue, 4 Oct 2011)*
Incompatible/important changes and bugfixes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......@@ -129,6 +129,14 @@ Misc
- DRBD metadata volumes are overwritten with zeros during disk creation.
Version 2.5.0 beta3
-------------------
*(Released Wed, 31 Aug 2011)*
This was the third beta release of the 2.5 series.
Version 2.5.0 beta2
-------------------
......
......@@ -2,7 +2,7 @@
m4_define([gnt_version_major], [2])
m4_define([gnt_version_minor], [5])
m4_define([gnt_version_revision], [0])
m4_define([gnt_version_suffix], [~beta3])
m4_define([gnt_version_suffix], [~rc1])
m4_define([gnt_version_full],
m4_format([%d.%d.%d%s],
gnt_version_major, gnt_version_minor,
......
......@@ -522,12 +522,25 @@ def VerifyNode(what, cluster_name):
what[constants.NV_FILELIST])
if constants.NV_NODELIST in what:
result[constants.NV_NODELIST] = tmp = {}
random.shuffle(what[constants.NV_NODELIST])
for node in what[constants.NV_NODELIST]:
(nodes, bynode) = what[constants.NV_NODELIST]
# Add nodes from other groups (different for each node)
try:
nodes.extend(bynode[my_name])
except KeyError:
pass
# Use a random order
random.shuffle(nodes)
# Try to contact all nodes
val = {}
for node in nodes:
success, message = _GetSshRunner(cluster_name).VerifyNodeHostname(node)
if not success:
tmp[node] = message
val[node] = message
result[constants.NV_NODELIST] = val
if constants.NV_NODENETTEST in what:
result[constants.NV_NODENETTEST] = tmp = {}
......
......@@ -260,6 +260,9 @@ _PRIONAME_TO_VALUE = dict(_PRIORITY_NAMES)
QR_UNKNOWN,
QR_INCOMPLETE) = range(3)
#: Maximum batch size for ChooseJob
_CHOOSE_BATCH = 25
class _Argument:
def __init__(self, min=0, max=None): # pylint: disable=W0622
......@@ -3073,7 +3076,8 @@ class JobExecutor(object):
"""
assert self.jobs, "_ChooseJob called with empty job list"
result = self.cl.QueryJobs([i[2] for i in self.jobs], ["status"])
result = self.cl.QueryJobs([i[2] for i in self.jobs[:_CHOOSE_BATCH]],
["status"])
assert result
for job_data, status in zip(self.jobs, result):
......
......@@ -2108,26 +2108,38 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
@param all_nvinfo: RPC results
"""
node_names = frozenset(node.name for node in nodeinfo if not node.offline)
assert master_node in node_names
assert (len(files_all | files_all_opt | files_mc | files_vm) ==
sum(map(len, [files_all, files_all_opt, files_mc, files_vm]))), \
"Found file listed in more than one file list"
# Define functions determining which nodes to consider for a file
file2nodefn = dict([(filename, fn)
for (files, fn) in [(files_all, None),
(files_all_opt, None),
(files_mc, lambda node: (node.master_candidate or
node.name == master_node)),
(files_vm, lambda node: node.vm_capable)]
for filename in files])
files2nodefn = [
(files_all, None),
(files_all_opt, None),
(files_mc, lambda node: (node.master_candidate or
node.name == master_node)),
(files_vm, lambda node: node.vm_capable),
]
# Build mapping from filename to list of nodes which should have the file
nodefiles = {}
for (files, fn) in files2nodefn:
if fn is None:
filenodes = nodeinfo
else:
filenodes = filter(fn, nodeinfo)
nodefiles.update((filename,
frozenset(map(operator.attrgetter("name"), filenodes)))
for filename in files)
assert set(nodefiles) == (files_all | files_all_opt | files_mc | files_vm)
fileinfo = dict((filename, {}) for filename in file2nodefn.keys())
fileinfo = dict((filename, {}) for filename in nodefiles)
ignore_nodes = set()
for node in nodeinfo:
if node.offline:
ignore_nodes.add(node.name)
continue
nresult = all_nvinfo[node.name]
......@@ -2141,13 +2153,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
errorif(test, cls.ENODEFILECHECK, node.name,
"Node did not return file checksum data")
if test:
ignore_nodes.add(node.name)
continue
# Build per-checksum mapping from filename to nodes having it
for (filename, checksum) in node_files.items():
# Check if the file should be considered for a node
fn = file2nodefn[filename]
if fn is None or fn(node):
fileinfo[filename].setdefault(checksum, set()).add(node.name)
assert filename in nodefiles
fileinfo[filename].setdefault(checksum, set()).add(node.name)
for (filename, checksums) in fileinfo.items():
assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
......@@ -2155,23 +2167,33 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
# Nodes having the file
with_file = frozenset(node_name
for nodes in fileinfo[filename].values()
for node_name in nodes)
for node_name in nodes) - ignore_nodes
expected_nodes = nodefiles[filename] - ignore_nodes
# Nodes missing file
missing_file = node_names - with_file
missing_file = expected_nodes - with_file
if filename in files_all_opt:
# All or no nodes
errorif(missing_file and missing_file != node_names,
errorif(missing_file and missing_file != expected_nodes,
cls.ECLUSTERFILECHECK, None,
"File %s is optional, but it must exist on all or no"
" nodes (not found on %s)",
filename, utils.CommaJoin(utils.NiceSort(missing_file)))
else:
# Non-optional files
errorif(missing_file, cls.ECLUSTERFILECHECK, None,
"File %s is missing from node(s) %s", filename,
utils.CommaJoin(utils.NiceSort(missing_file)))
# Warn if a node has a file it shouldn't
unexpected = with_file - expected_nodes
errorif(unexpected,
cls.ECLUSTERFILECHECK, None,
"File %s should not exist on node(s) %s",
filename, utils.CommaJoin(utils.NiceSort(unexpected)))
# See if there are multiple versions of the file
test = len(checksums) > 1
if test:
......@@ -2542,6 +2564,40 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
return instdisk
@staticmethod
def _SshNodeSelector(group_uuid, all_nodes):
"""Create endless iterators for all potential SSH check hosts.
"""
nodes = [node for node in all_nodes
if (node.group != group_uuid and
not node.offline)]
keyfunc = operator.attrgetter("group")
return map(itertools.cycle,
[sorted(map(operator.attrgetter("name"), names))
for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
keyfunc)])
@classmethod
def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
  """Decide which nodes each node of this group should SSH-check.

  Every node in the group contacts all other nodes in its own group, plus
  one node picked from each other group.

  @warning: Known weakness: if one node group is far smaller than the
    others (e.g. a single node), all other nodes end up contacting that
    one node.

  """
  # Online members of this group, sorted: they are both the check sources
  # and the keys of the returned per-host mapping.
  online_nodes = sorted(node.name
                        for node in group_nodes
                        if not node.offline)

  # One cycling iterator per foreign group (see _SshNodeSelector).
  selectors = cls._SshNodeSelector(group_uuid, all_nodes)

  # Advance every foreign group's cycle once per online node, so the
  # checks are spread over the foreign groups' members.
  targets = {}
  for name in online_nodes:
    targets[name] = sorted([i.next() for i in selectors])

  return (online_nodes, targets)
def BuildHooksEnv(self):
"""Build hooks env.
......@@ -2605,25 +2661,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_names))
# We will make nodes contact all nodes in their group, and one node from
# every other group.
# TODO: should it be a *random* node, different every time?
online_nodes = [node.name for node in node_data_list if not node.offline]
other_group_nodes = {}
for name in sorted(self.all_node_info):
node = self.all_node_info[name]
if (node.group not in other_group_nodes
and node.group != self.group_uuid
and not node.offline):
other_group_nodes[node.group] = node.name
node_verify_param = {
constants.NV_FILELIST:
utils.UniqueSequence(filename
for files in filemap
for filename in files),
constants.NV_NODELIST: online_nodes + other_group_nodes.values(),
constants.NV_NODELIST:
self._SelectSshCheckNodes(node_data_list, self.group_uuid,
self.all_node_info.values()),
constants.NV_HYPERVISOR: hypervisors,
constants.NV_HVPARAMS:
_GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
......@@ -5061,7 +5106,7 @@ class LUNodeAdd(LogicalUnit):
node_verify_list = [self.cfg.GetMasterNode()]
node_verify_param = {
constants.NV_NODELIST: [node],
constants.NV_NODELIST: ([node], {}),
# TODO: do a node-net-test as well?
}
......
......@@ -230,7 +230,7 @@ def SetupLogging(logfile, program, debug=0, stderr_logging=False,
if debug:
stderr_handler.setLevel(logging.NOTSET)
else:
stderr_handler.setLevel(logging.ERROR)
stderr_handler.setLevel(logging.CRITICAL)
root_logger.addHandler(stderr_handler)
if syslog in (constants.SYSLOG_YES, constants.SYSLOG_ONLY):
......
......@@ -27,6 +27,7 @@ import unittest
import time
import tempfile
import shutil
import operator
from ganeti import constants
from ganeti import mcpu
......@@ -37,6 +38,8 @@ from ganeti import utils
from ganeti import luxi
from ganeti import ht
from ganeti import objects
from ganeti import compat
from ganeti import rpc
import testutils
import mocks
......@@ -207,5 +210,158 @@ class TestLUGroupAssignNodes(unittest.TestCase):
self.assertEqual(set(["inst3c"]), set(prev))
class TestClusterVerifySsh(unittest.TestCase):
def testMultipleGroups(self):
fn = cmdlib.LUClusterVerifyGroup._SelectSshCheckNodes
mygroupnodes = [
objects.Node(name="node20", group="my", offline=False),
objects.Node(name="node21", group="my", offline=False),
objects.Node(name="node22", group="my", offline=False),
objects.Node(name="node23", group="my", offline=False),
objects.Node(name="node24", group="my", offline=False),
objects.Node(name="node25", group="my", offline=False),
objects.Node(name="node26", group="my", offline=True),
]
nodes = [
objects.Node(name="node1", group="g1", offline=True),
objects.Node(name="node2", group="g1", offline=False),
objects.Node(name="node3", group="g1", offline=False),
objects.Node(name="node4", group="g1", offline=True),
objects.Node(name="node5", group="g1", offline=False),
objects.Node(name="node10", group="xyz", offline=False),
objects.Node(name="node11", group="xyz", offline=False),
objects.Node(name="node40", group="alloff", offline=True),
objects.Node(name="node41", group="alloff", offline=True),
objects.Node(name="node50", group="aaa", offline=False),
] + mygroupnodes
assert not utils.FindDuplicates(map(operator.attrgetter("name"), nodes))
(online, perhost) = fn(mygroupnodes, "my", nodes)
self.assertEqual(online, ["node%s" % i for i in range(20, 26)])
self.assertEqual(set(perhost.keys()), set(online))
self.assertEqual(perhost, {
"node20": ["node10", "node2", "node50"],
"node21": ["node11", "node3", "node50"],
"node22": ["node10", "node5", "node50"],
"node23": ["node11", "node2", "node50"],
"node24": ["node10", "node3", "node50"],
"node25": ["node11", "node5", "node50"],
})
def testSingleGroup(self):
fn = cmdlib.LUClusterVerifyGroup._SelectSshCheckNodes
nodes = [
objects.Node(name="node1", group="default", offline=True),
objects.Node(name="node2", group="default", offline=False),
objects.Node(name="node3", group="default", offline=False),
objects.Node(name="node4", group="default", offline=True),
]
assert not utils.FindDuplicates(map(operator.attrgetter("name"), nodes))
(online, perhost) = fn(nodes, "default", nodes)
self.assertEqual(online, ["node2", "node3"])
self.assertEqual(set(perhost.keys()), set(online))
self.assertEqual(perhost, {
"node2": [],
"node3": [],
})
class TestClusterVerifyFiles(unittest.TestCase):
@staticmethod
def _FakeErrorIf(errors, cond, ecode, item, msg, *args, **kwargs):
assert ((ecode == cmdlib.LUClusterVerifyGroup.ENODEFILECHECK and
ht.TNonEmptyString(item)) or
(ecode == cmdlib.LUClusterVerifyGroup.ECLUSTERFILECHECK and
item is None))
if args:
msg = msg % args
if cond:
errors.append((item, msg))
_VerifyFiles = cmdlib.LUClusterVerifyGroup._VerifyFiles
def test(self):
errors = []
master_name = "master.example.com"
nodeinfo = [
objects.Node(name=master_name, offline=False),
objects.Node(name="node2.example.com", offline=False),
objects.Node(name="node3.example.com", master_candidate=True),
objects.Node(name="node4.example.com", offline=False),
objects.Node(name="nodata.example.com"),
objects.Node(name="offline.example.com", offline=True),
]
cluster = objects.Cluster(modify_etc_hosts=True,
enabled_hypervisors=[constants.HT_XEN_HVM])
files_all = set([
constants.CLUSTER_DOMAIN_SECRET_FILE,
constants.RAPI_CERT_FILE,
])
files_all_opt = set([
constants.RAPI_USERS_FILE,
])
files_mc = set([
constants.CLUSTER_CONF_FILE,
])
files_vm = set()
nvinfo = {
master_name: rpc.RpcResult(data=(True, {
constants.NV_FILELIST: {
constants.CLUSTER_CONF_FILE: "82314f897f38b35f9dab2f7c6b1593e0",
constants.RAPI_CERT_FILE: "babbce8f387bc082228e544a2146fee4",
constants.CLUSTER_DOMAIN_SECRET_FILE: "cds-47b5b3f19202936bb4",
}})),
"node2.example.com": rpc.RpcResult(data=(True, {
constants.NV_FILELIST: {
constants.RAPI_CERT_FILE: "97f0356500e866387f4b84233848cc4a",
}
})),
"node3.example.com": rpc.RpcResult(data=(True, {
constants.NV_FILELIST: {
constants.RAPI_CERT_FILE: "97f0356500e866387f4b84233848cc4a",
constants.CLUSTER_DOMAIN_SECRET_FILE: "cds-47b5b3f19202936bb4",
}
})),
"node4.example.com": rpc.RpcResult(data=(True, {
constants.NV_FILELIST: {
constants.RAPI_CERT_FILE: "97f0356500e866387f4b84233848cc4a",
constants.CLUSTER_CONF_FILE: "conf-a6d4b13e407867f7a7b4f0f232a8f527",
constants.CLUSTER_DOMAIN_SECRET_FILE: "cds-47b5b3f19202936bb4",
constants.RAPI_USERS_FILE: "rapiusers-ea3271e8d810ef3",
}
})),
"nodata.example.com": rpc.RpcResult(data=(True, {})),
"offline.example.com": rpc.RpcResult(offline=True),
}
assert set(nvinfo.keys()) == set(map(operator.attrgetter("name"), nodeinfo))
self._VerifyFiles(compat.partial(self._FakeErrorIf, errors), nodeinfo,
master_name, nvinfo,
(files_all, files_all_opt, files_mc, files_vm))
self.assertEqual(sorted(errors), sorted([
(None, ("File %s found with 2 different checksums (variant 1 on"
" node2.example.com, node3.example.com, node4.example.com;"
" variant 2 on master.example.com)" % constants.RAPI_CERT_FILE)),
(None, ("File %s is missing from node(s) node2.example.com" %
constants.CLUSTER_DOMAIN_SECRET_FILE)),
(None, ("File %s should not exist on node(s) node4.example.com" %
constants.CLUSTER_CONF_FILE)),
(None, ("File %s is missing from node(s) node3.example.com" %
constants.CLUSTER_CONF_FILE)),
(None, ("File %s found with 2 different checksums (variant 1 on"
" master.example.com; variant 2 on node4.example.com)" %
constants.CLUSTER_CONF_FILE)),
(None, ("File %s is optional, but it must exist on all or no nodes (not"
" found on master.example.com, node2.example.com,"
" node3.example.com)" % constants.RAPI_USERS_FILE)),
("nodata.example.com", "Node did not return file checksum data"),
]))
if __name__ == "__main__":
testutils.GanetiTestProgram()
......@@ -648,6 +648,7 @@ class Merger(object):
"""
for data in self.merger_data:
for node in data.nodes:
logging.info("Readding node %s", node)
result = utils.RunCmd(["gnt-node", "add", "--readd",
"--no-ssh-key-check", "--force-join", node])
if result.failed:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment