Commit fcad7225 authored by Michael Hanselmann

Use LU-generated jobs for verifying cluster

This patch moves the logic for verifying the various node groups in a
cluster into the master daemon. Job dependencies are used to ensure that
the cluster configuration, which requires the BGL (big Ganeti lock), is
verified first (illustrated in the sketch below).

With this change it becomes possible to expose whole-cluster verification
through the remote API without additional client-side logic, relying only
on standard features such as LU-generated jobs and job dependencies (see
the client-side sketch at the end of the diff).
Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
parent 57106b74
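
To picture the job layout described above, here is a minimal, self-contained
sketch. The classes are stand-ins for illustration only (the real opcodes live
in ganeti.opcodes and submission goes through ResultWithJobs), and it assumes
Ganeti's convention, used in the LU below, that a negative dependency ID counts
back from the job being submitted within the same set. The point is that every
per-group verification job ends up waiting for the configuration job submitted
first:

# Stand-in opcode classes; not the real ganeti.opcodes API.
class OpVerifyConfig(object):
  def __repr__(self):
    return "OpVerifyConfig()"


class OpVerifyGroup(object):
  def __init__(self, group_name, depends):
    self.group_name = group_name
    # Each dependency is (relative_job_id, required_statuses); a negative
    # relative ID points back to a job submitted earlier in the same set.
    self.depends = depends

  def __repr__(self):
    return "OpVerifyGroup(%r, depends=%r)" % (self.group_name, self.depends)


def BuildVerifyJobs(groups):
  """Mirrors the job list built by LUClusterVerify.Exec (sketch only)."""
  jobs = [[OpVerifyConfig()]]  # job 0: whole-cluster configuration check
  for group in groups:
    # -len(jobs) always resolves back to job 0, no matter how many group
    # jobs were appended before this one.
    jobs.append([OpVerifyGroup(group, depends=[(-len(jobs), [])])])
  return jobs


if __name__ == "__main__":
  for (idx, job) in enumerate(BuildVerifyJobs(["default", "rack2"])):
    print("job %d: %s" % (idx, job))
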
@@ -455,40 +455,35 @@ def VerifyCluster(opts, args):
   @return: the desired exit code
 
   """
-  simulate = opts.simulate_errors
   skip_checks = []
 
-  if opts.nodegroup is None:
-    # Verify cluster config.
-    op = opcodes.OpClusterVerifyConfig(verbose=opts.verbose,
-                                       error_codes=opts.error_codes,
-                                       debug_simulate_errors=simulate)
-    success, all_groups = SubmitOpCode(op, opts=opts)
-  else:
-    success = True
-    all_groups = [opts.nodegroup]
-
   if opts.skip_nplusone_mem:
     skip_checks.append(constants.VERIFY_NPLUSONE_MEM)
 
-  jex = JobExecutor(opts=opts, verbose=False)
+  cl = GetClient()
 
-  for group in all_groups:
-    op = opcodes.OpClusterVerifyGroup(group_name=group,
-                                      skip_checks=skip_checks,
-                                      verbose=opts.verbose,
-                                      error_codes=opts.error_codes,
-                                      debug_simulate_errors=simulate)
-    jex.QueueJob("group " + group, op)
+  op = opcodes.OpClusterVerify(verbose=opts.verbose,
+                               error_codes=opts.error_codes,
+                               debug_simulate_errors=opts.simulate_errors,
+                               skip_checks=skip_checks,
+                               group_name=opts.nodegroup)
+  result = SubmitOpCode(op, cl=cl, opts=opts)
 
-  results = jex.GetResults()
-  success &= compat.all(r[1][0] for r in results)
+  # Keep track of submitted jobs
+  jex = JobExecutor(cl=cl, opts=opts)
 
-  if success:
-    return constants.EXIT_SUCCESS
+  for (status, job_id) in result[constants.JOB_IDS_KEY]:
+    jex.AddJobId(None, status, job_id)
+
+  results = jex.GetResults()
+
+  bad_cnt = len([row for row in results if not row[0]])
+  if bad_cnt == 0:
+    rcode = constants.EXIT_SUCCESS
   else:
-    return constants.EXIT_FAILURE
+    ToStdout("%s job(s) failed while verifying the cluster.", bad_cnt)
+    rcode = constants.EXIT_FAILURE
+
+  return rcode
 
 
 def VerifyDisks(opts, args):
@@ -1510,6 +1510,47 @@ class _VerifyErrors(object):
     self.bad = self.bad or cond
 
 
+class LUClusterVerify(NoHooksLU):
+  """Submits all jobs necessary to verify the cluster.
+
+  """
+  REQ_BGL = False
+
+  def ExpandNames(self):
+    self.needed_locks = {}
+
+  def Exec(self, feedback_fn):
+    jobs = []
+
+    if self.op.group_name:
+      groups = [self.op.group_name]
+      depends_fn = lambda: None
+    else:
+      groups = self.cfg.GetNodeGroupList()
+
+      # Verify global configuration
+      jobs.append([opcodes.OpClusterVerifyConfig()])
+
+      # Always depend on global verification
+      depends_fn = lambda: [(-len(jobs), [])]
+
+    jobs.extend([opcodes.OpClusterVerifyGroup(group_name=group,
+                                              depends=depends_fn())]
+                for group in groups)
+
+    # Fix up all parameters
+    for op in itertools.chain(*jobs): # pylint: disable-msg=W0142
+      op.debug_simulate_errors = self.op.debug_simulate_errors
+      op.verbose = self.op.verbose
+      op.error_codes = self.op.error_codes
+      try:
+        op.skip_checks = self.op.skip_checks
+      except AttributeError:
+        assert not isinstance(op, opcodes.OpClusterVerifyGroup)
+
+    return ResultWithJobs(jobs)
+
+
 class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
   """Verifies the cluster config.
@@ -573,6 +573,19 @@ class OpClusterQuery(OpCode):
   """Query cluster information."""
 
 
+class OpClusterVerify(OpCode):
+  """Submits all jobs necessary to verify the cluster.
+
+  """
+  OP_PARAMS = [
+    _PDebugSimulateErrors,
+    _PErrorCodes,
+    _PSkipChecks,
+    _PVerbose,
+    ("group_name", None, ht.TMaybeString, "Group to verify")
+    ]
+
+
 class OpClusterVerifyConfig(OpCode):
   """Verify the cluster config.
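
For a client driving this over the remote API, the consumption pattern from the
first hunk can be sketched without the real Ganeti libraries. Everything below
is a stand-in (fake submit and poll functions, a local JOB_IDS_KEY constant); it
only illustrates iterating the (submission status, job id) pairs that
OpClusterVerify returns, as VerifyCluster above does with JobExecutor:

# Stand-ins for illustration; a real client uses ganeti.cli.JobExecutor,
# constants.JOB_IDS_KEY and a luxi/RAPI connection instead.
JOB_IDS_KEY = "jobs"


def FakeSubmitClusterVerify():
  # Same shape as the OpClusterVerify result used in the diff above:
  # one (submission_status, job_id) pair per generated job.
  return {JOB_IDS_KEY: [(True, 1234), (True, 1235)]}


def FakePollJob(job_id):
  # Pretend the second job reported verification errors.
  return job_id != 1235


def VerifyClusterSketch():
  result = FakeSubmitClusterVerify()
  bad_cnt = 0
  for (status, job_id) in result[JOB_IDS_KEY]:
    if not status or not FakePollJob(job_id):
      bad_cnt += 1
  if bad_cnt:
    print("%s job(s) failed while verifying the cluster." % bad_cnt)
  return bad_cnt == 0


if __name__ == "__main__":
  print("success: %s" % VerifyClusterSketch())
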