diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 5fbc8d91e63b8a854d60bdbf140edd3fe4224ca5..53193c3158f4e3dc7c0ad1b6e9838c29bbccdb79 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -2973,10 +2973,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): self._ErrorIf(test, self.ENODEHOOKS, node_name, "Communication failure in hooks execution: %s", msg) if res.offline or msg: - # No need to investigate payload if node is offline or gave an error. - # override manually lu_result here as _ErrorIf only - # overrides self.bad - lu_result = 1 + # No need to investigate payload if node is offline or gave + # an error. continue for script, hkr, output in res.payload: test = hkr == constants.HKR_FAIL @@ -2985,7 +2983,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): if test: output = self._HOOKS_INDENT_RE.sub(" ", output) feedback_fn("%s" % output) - lu_result = 0 + lu_result = False return lu_result @@ -3697,6 +3695,9 @@ def _ComputeAncillaryFiles(cluster, redist): if not redist: files_all.update(constants.ALL_CERT_FILES) files_all.update(ssconf.SimpleStore().GetFileList()) + else: + # we need to ship at least the RAPI certificate + files_all.add(constants.RAPI_CERT_FILE) if cluster.modify_etc_hosts: files_all.add(constants.ETC_HOSTS) diff --git a/lib/rapi/baserlib.py b/lib/rapi/baserlib.py index 77dff33877ad725be9577e1e78a32cbda666f068..1daed10e2139b91f3a87b2ba62bea1165429f524 100644 --- a/lib/rapi/baserlib.py +++ b/lib/rapi/baserlib.py @@ -193,10 +193,13 @@ def FillOpcode(opcls, body, static, rename=None): @return: Opcode object """ - CheckType(body, dict, "Body contents") + if body is None: + params = {} + else: + CheckType(body, dict, "Body contents") - # Make copy to be modified - params = body.copy() + # Make copy to be modified + params = body.copy() if rename: for old, new in rename.items(): diff --git a/lib/rapi/rlib2.py b/lib/rapi/rlib2.py index 9e9f2a1f3e6dd0dd86b69dfa3708aeff25e82449..0a958da114beb3e2ee8f5d8ede3fc822d62b942a 100644 --- 
a/lib/rapi/rlib2.py +++ b/lib/rapi/rlib2.py @@ -922,8 +922,6 @@ class R_2_instances_name_shutdown(baserlib.R_Generic): @return: a job id """ - baserlib.CheckType(self.request_body, dict, "Body contents") - no_remember = bool(self._checkIntVariable("no_remember")) op = _ParseShutdownInstanceRequest(self.items[0], self.request_body, bool(self.dryRun()), no_remember) diff --git a/lib/ssh.py b/lib/ssh.py index 1a3c101ae574fad6e11e9f1f41b37fea2d3064f1..5c11be2129395a9975a6aa96c37fa4cad9a415f3 100644 --- a/lib/ssh.py +++ b/lib/ssh.py @@ -229,8 +229,8 @@ class SshRunner: result = utils.RunCmd(command) if result.failed: - logging.error("Copy to node %s failed (%s) error %s," - " command was %s", + logging.error("Copy to node %s failed (%s) error '%s'," + " command was '%s'", node, result.fail_reason, result.output, result.cmd) return not result.failed diff --git a/lib/utils/log.py b/lib/utils/log.py index 281f59045ac8e7e7ae2505e154d55f6fdf4f1d84..ceff2506918e51278757acccc3c45c110e7b9fac 100644 --- a/lib/utils/log.py +++ b/lib/utils/log.py @@ -230,7 +230,7 @@ def SetupLogging(logfile, program, debug=0, stderr_logging=False, if debug: stderr_handler.setLevel(logging.NOTSET) else: - stderr_handler.setLevel(logging.CRITICAL) + stderr_handler.setLevel(logging.ERROR) root_logger.addHandler(stderr_handler) if syslog in (constants.SYSLOG_YES, constants.SYSLOG_ONLY): diff --git a/man/hbal.rst b/man/hbal.rst index 1e054e57624cc4ad858570714efb8796365554fe..49fd9ec399b5bfe010f61ec257831ff0632e3cda 100644 --- a/man/hbal.rst +++ b/man/hbal.rst @@ -362,6 +362,9 @@ The options that can be passed to the program are as follows: jobset will be executed in parallel. The jobsets themselves are executed serially. + The execution of the job series can be interrupted, see below for + signal handling. + -l *N*, --max-length=*N* Restrict the solution to this length. This can be used for example to automate the execution of the balancing. 
@@ -399,25 +402,45 @@ The options that can be passed to the program are as follows: -V, --version Just show the program version and exit. +SIGNAL HANDLING +--------------- + +When executing jobs via LUXI (using the ``-X`` option), normally hbal +will execute all jobs until either one errors out or all the jobs finish +successfully. + +Since balancing can take a long time, it is possible to stop hbal early +in two ways: + +- by sending a ``SIGINT`` (``^C``), hbal will register the termination + request, and will wait until the currently submitted jobs finish, at + which point it will exit (with exit code 1) +- by sending a ``SIGTERM``, hbal will immediately exit (with exit code + 2); it is the responsibility of the user to follow up with Ganeti the + result of the currently-executing jobs + +Note that in any situation, it's perfectly safe to kill hbal, either via +the above signals or via any other signal (e.g. ``SIGQUIT``, +``SIGKILL``), since the jobs themselves are processed by Ganeti whereas +hbal (after submission) only watches their progression. In this case, +the user will again have to query Ganeti for job results. + EXIT STATUS ----------- -The exit status of the command will be zero, unless for some reason -the algorithm fatally failed (e.g. wrong node or instance data), or -(in case of job execution) any job has failed. +The exit status of the command will be zero, unless for some reason the +algorithm fatally failed (e.g. wrong node or instance data), or (in case +of job execution) either one of the jobs has failed or the balancing was +interrupted early. BUGS ---- -The program does not check its input data for consistency, and aborts -with cryptic errors messages in this case. +The program does not check all its input data for consistency, and +sometimes aborts with cryptic error messages when given invalid data. The algorithm is not perfect. 
-The output format is not easily scriptable, and the program should -feed moves directly into Ganeti (either via RAPI or via a gnt-debug -input file). - EXAMPLE ------- diff --git a/qa/ganeti-qa.py b/qa/ganeti-qa.py index fc7abbcfa9b3acc202059f2ab2bb30dd46983d55..8b0c33889f36922cbf7db315afad918bf523811e 100755 --- a/qa/ganeti-qa.py +++ b/qa/ganeti-qa.py @@ -225,6 +225,12 @@ def RunCommonInstanceTests(instance): qa_rapi.TestRapiStoppedInstanceConsole, instance) RunTestIf("instance-shutdown", qa_instance.TestInstanceStartup, instance) + # Test shutdown/start via RAPI + RunTestIf(["instance-shutdown", "rapi"], + qa_rapi.TestRapiInstanceShutdown, instance) + RunTestIf(["instance-shutdown", "rapi"], + qa_rapi.TestRapiInstanceStartup, instance) + RunTestIf("instance-list", qa_instance.TestInstanceList) RunTestIf("instance-info", qa_instance.TestInstanceInfo, instance) diff --git a/qa/qa_rapi.py b/qa/qa_rapi.py index ece6e645277af5d0d2c2d8f272ba01d84543d1ed..02218463c349dacf27c39e0052ef1d4c9a18e83e 100644 --- a/qa/qa_rapi.py +++ b/qa/qa_rapi.py @@ -598,6 +598,16 @@ def TestRapiInstanceFailover(instance): _WaitForRapiJob(_rapi_client.FailoverInstance(instance["name"])) +def TestRapiInstanceShutdown(instance): + """Test stopping an instance via RAPI""" + _WaitForRapiJob(_rapi_client.ShutdownInstance(instance["name"])) + + +def TestRapiInstanceStartup(instance): + """Test starting an instance via RAPI""" + _WaitForRapiJob(_rapi_client.StartupInstance(instance["name"])) + + def TestRapiInstanceRename(rename_source, rename_target): """Test renaming instance via RAPI""" _WaitForRapiJob(_rapi_client.RenameInstance(rename_source, rename_target)) diff --git a/tools/ganeti-listrunner b/tools/ganeti-listrunner index 13ab024271934431ffecc7c8c2b175d84855be36..566d9860aab4c28fc78e7406df1cbb8768d781b2 100755 --- a/tools/ganeti-listrunner +++ b/tools/ganeti-listrunner @@ -376,8 +376,10 @@ def HostWorker(logdir, username, password, use_agent, hostname, print " %s: uploading files" % 
hostname upload_dir = UploadFiles(connection, executable, filelist, logfile) - command = ("cd %s && ./%s %s" % - (upload_dir, os.path.basename(executable), exec_args)) + command = ("cd %s && ./%s" % + (upload_dir, os.path.basename(executable))) + if exec_args: + command += " %s" % exec_args print " %s: executing remote command" % hostname cmd_result = RunRemoteCommand(connection, command, logfile) if cmd_result is True: