diff --git a/lib/backend.py b/lib/backend.py index 78357af92125f05d16bd7148008ccc7916a9e00d..7964736ca74e20fa3ea7d0bd307a3724333abfd4 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -440,6 +440,7 @@ def GetNodeInfo(vgname, hypervisor_type): - memory_dom0 is the memory allocated for domain0 in MiB - memory_free is the currently available (free) ram in MiB - memory_total is the total number of ram in MiB + - hv_version: the hypervisor version, if available """ outputarray = {} diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 2b2409b362bc234916d2991a7476a27b6ccd69b9..3ebc753e491d4027c3dd7598333a07b95ffbae0e 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -2973,10 +2973,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): self._ErrorIf(test, self.ENODEHOOKS, node_name, "Communication failure in hooks execution: %s", msg) if res.offline or msg: - # No need to investigate payload if node is offline or gave an error. - # override manually lu_result here as _ErrorIf only - # overrides self.bad - lu_result = 1 + # No need to investigate payload if node is offline or gave + # an error. continue for script, hkr, output in res.payload: test = hkr == constants.HKR_FAIL @@ -2985,7 +2983,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): if test: output = self._HOOKS_INDENT_RE.sub(" ", output) feedback_fn("%s" % output) - lu_result = 0 + lu_result = False return lu_result @@ -3697,6 +3695,9 @@ def _ComputeAncillaryFiles(cluster, redist): if not redist: files_all.update(constants.ALL_CERT_FILES) files_all.update(ssconf.SimpleStore().GetFileList()) + else: + # we need to ship at least the RAPI certificate + files_all.add(constants.RAPI_CERT_FILE) if cluster.modify_etc_hosts: files_all.add(constants.ETC_HOSTS) @@ -7414,6 +7415,21 @@ class TLMigrateInstance(Tasklet): target_node = self.target_node source_node = self.source_node + # Check for hypervisor version mismatch and warn the user. 
+ nodeinfo = self.rpc.call_node_info([source_node, target_node], + None, self.instance.hypervisor) + src_info = nodeinfo[source_node] + dst_info = nodeinfo[target_node] + + if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and + (constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)): + src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION] + dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION] + if src_version != dst_version: + self.feedback_fn("* warning: hypervisor version mismatch between" + " source (%s) and target (%s) node" % + (src_version, dst_version)) + self.feedback_fn("* checking disk consistency between source and target") for dev in instance.disks: if not _CheckDiskConsistency(self.lu, dev, target_node, False): diff --git a/lib/constants.py b/lib/constants.py index 9dc202b0f0155b70d93538fdf6088f6c0873d112..2b92284c57c5b1333a90f173fe6df3d4b76b1633 100644 --- a/lib/constants.py +++ b/lib/constants.py @@ -818,6 +818,9 @@ HV_MIGRATION_FAILED_STATUSES = frozenset([ # KVM-specific statuses HV_KVM_MIGRATION_VALID_STATUSES = HV_MIGRATION_VALID_STATUSES +# Node info keys +HV_NODEINFO_KEY_VERSION = "hv_version" + # Backend parameter names BE_MEMORY = "memory" BE_VCPUS = "vcpus" diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py index eb0e2444b8d236638e0f7c594ae26c355154ea6b..577608afdde1e8aca72c7c798c2a3e89c33173f7 100644 --- a/lib/hypervisor/hv_kvm.py +++ b/lib/hypervisor/hv_kvm.py @@ -1705,15 +1705,18 @@ class KVMHypervisor(hv_base.BaseHypervisor): def GetNodeInfo(self): """Return information about the node. - This is just a wrapper over the base GetLinuxNodeInfo method. 
- @return: a dict with the following keys (values in MiB): - memory_total: the total memory size on the node - memory_free: the available memory on the node for instances - memory_dom0: the memory used by the node itself, if available + - hv_version: the hypervisor version in the form (major, minor, + revision) """ - return self.GetLinuxNodeInfo() + result = self.GetLinuxNodeInfo() + _, v_major, v_min, v_rev = self._GetKVMVersion() + result[constants.HV_NODEINFO_KEY_VERSION] = (v_major, v_min, v_rev) + return result @classmethod def GetInstanceConsole(cls, instance, hvparams, beparams): diff --git a/lib/hypervisor/hv_xen.py b/lib/hypervisor/hv_xen.py index 3e452c412a8e7f2daf3f2d7bc90ad04590752ac5..f8113a8109e099851739a3ee5b37d8b5432a0af6 100644 --- a/lib/hypervisor/hv_xen.py +++ b/lib/hypervisor/hv_xen.py @@ -289,6 +289,7 @@ class XenHypervisor(hv_base.BaseHypervisor): - nr_cpus: total number of CPUs - nr_nodes: in a NUMA system, the number of domains - nr_sockets: the number of physical CPU sockets in the node + - hv_version: the hypervisor version in the form (major, minor) """ # note: in xen 3, memory has changed to total_memory @@ -301,6 +302,7 @@ class XenHypervisor(hv_base.BaseHypervisor): xmoutput = result.stdout.splitlines() result = {} cores_per_socket = threads_per_core = nr_cpus = None + xen_major, xen_minor = None, None for line in xmoutput: splitfields = line.split(":", 1) @@ -319,6 +321,10 @@ class XenHypervisor(hv_base.BaseHypervisor): cores_per_socket = int(val) elif key == "threads_per_core": threads_per_core = int(val) + elif key == "xen_major": + xen_major = int(val) + elif key == "xen_minor": + xen_minor = int(val) if (cores_per_socket is not None and threads_per_core is not None and nr_cpus is not None): @@ -328,6 +334,9 @@ class XenHypervisor(hv_base.BaseHypervisor): if dom0_info is not None: result["memory_dom0"] = dom0_info[2] + if not (xen_major is None or xen_minor is None): + result[constants.HV_NODEINFO_KEY_VERSION] = (xen_major, 
xen_minor) + return result @classmethod diff --git a/lib/ssh.py b/lib/ssh.py index 1a3c101ae574fad6e11e9f1f41b37fea2d3064f1..5c11be2129395a9975a6aa96c37fa4cad9a415f3 100644 --- a/lib/ssh.py +++ b/lib/ssh.py @@ -229,8 +229,8 @@ class SshRunner: result = utils.RunCmd(command) if result.failed: - logging.error("Copy to node %s failed (%s) error %s," - " command was %s", + logging.error("Copy to node %s failed (%s) error '%s'," + " command was '%s'", node, result.fail_reason, result.output, result.cmd) return not result.failed diff --git a/lib/utils/log.py b/lib/utils/log.py index 281f59045ac8e7e7ae2505e154d55f6fdf4f1d84..ceff2506918e51278757acccc3c45c110e7b9fac 100644 --- a/lib/utils/log.py +++ b/lib/utils/log.py @@ -230,7 +230,7 @@ def SetupLogging(logfile, program, debug=0, stderr_logging=False, if debug: stderr_handler.setLevel(logging.NOTSET) else: - stderr_handler.setLevel(logging.CRITICAL) + stderr_handler.setLevel(logging.ERROR) root_logger.addHandler(stderr_handler) if syslog in (constants.SYSLOG_YES, constants.SYSLOG_ONLY): diff --git a/man/hbal.rst b/man/hbal.rst index 1a98621cc0ff3a5b5fae82fab830f4a8cec16990..2142a2ff9c73bd3aade5261d2bde63682e1e7616 100644 --- a/man/hbal.rst +++ b/man/hbal.rst @@ -359,6 +359,9 @@ The options that can be passed to the program are as follows: jobset will be executed in parallel. The jobsets themselves are executed serially. + The execution of the job series can be interrupted, see below for + signal handling. + -l *N*, --max-length=*N* Restrict the solution to this length. This can be used for example to automate the execution of the balancing. @@ -396,25 +399,45 @@ The options that can be passed to the program are as follows: -V, --version Just show the program version and exit. +SIGNAL HANDLING +--------------- + +When executing jobs via LUXI (using the ``-X`` option), normally hbal +will execute all jobs until either one errors out or all the jobs finish +successfully. 
+ +Since balancing can take a long time, it is possible to stop hbal early +in two ways: + +- by sending a ``SIGINT`` (``^C``), hbal will register the termination + request, and will wait until the currently submitted jobs finish, at + which point it will exit (with exit code 1) +- by sending a ``SIGTERM``, hbal will immediately exit (with exit code + 2); it is the responsibility of the user to follow up with Ganeti on the + result of the currently-executing jobs + +Note that in any situation, it's perfectly safe to kill hbal, either via +the above signals or via any other signal (e.g. ``SIGQUIT``, +``SIGKILL``), since the jobs themselves are processed by Ganeti whereas +hbal (after submission) only watches their progression. In this case, +the user will again have to query Ganeti for job results. + EXIT STATUS ----------- -The exit status of the command will be zero, unless for some reason -the algorithm fatally failed (e.g. wrong node or instance data), or -(in case of job execution) any job has failed. +The exit status of the command will be zero, unless for some reason the +algorithm fatally failed (e.g. wrong node or instance data), or (in case +of job execution) either one of the jobs has failed or the balancing was +interrupted early. BUGS ---- -The program does not check its input data for consistency, and aborts -with cryptic errors messages in this case. +The program does not check all its input data for consistency, and +sometimes aborts with cryptic error messages when given invalid data. The algorithm is not perfect. -The output format is not easily scriptable, and the program should -feed moves directly into Ganeti (either via RAPI or via a gnt-debug -input file). 
- EXAMPLE ------- diff --git a/qa/ganeti-qa.py b/qa/ganeti-qa.py index fc7abbcfa9b3acc202059f2ab2bb30dd46983d55..8b0c33889f36922cbf7db315afad918bf523811e 100755 --- a/qa/ganeti-qa.py +++ b/qa/ganeti-qa.py @@ -225,6 +225,12 @@ def RunCommonInstanceTests(instance): qa_rapi.TestRapiStoppedInstanceConsole, instance) RunTestIf("instance-shutdown", qa_instance.TestInstanceStartup, instance) + # Test shutdown/start via RAPI + RunTestIf(["instance-shutdown", "rapi"], + qa_rapi.TestRapiInstanceShutdown, instance) + RunTestIf(["instance-shutdown", "rapi"], + qa_rapi.TestRapiInstanceStartup, instance) + RunTestIf("instance-list", qa_instance.TestInstanceList) RunTestIf("instance-info", qa_instance.TestInstanceInfo, instance) diff --git a/qa/qa_rapi.py b/qa/qa_rapi.py index ece6e645277af5d0d2c2d8f272ba01d84543d1ed..02218463c349dacf27c39e0052ef1d4c9a18e83e 100644 --- a/qa/qa_rapi.py +++ b/qa/qa_rapi.py @@ -598,6 +598,16 @@ def TestRapiInstanceFailover(instance): _WaitForRapiJob(_rapi_client.FailoverInstance(instance["name"])) +def TestRapiInstanceShutdown(instance): + """Test stopping an instance via RAPI""" + _WaitForRapiJob(_rapi_client.ShutdownInstance(instance["name"])) + + +def TestRapiInstanceStartup(instance): + """Test starting an instance via RAPI""" + _WaitForRapiJob(_rapi_client.StartupInstance(instance["name"])) + + def TestRapiInstanceRename(rename_source, rename_target): """Test renaming instance via RAPI""" _WaitForRapiJob(_rapi_client.RenameInstance(rename_source, rename_target)) diff --git a/tools/cluster-merge b/tools/cluster-merge index 7897e81b18709308b1f25acd75c741316a9680a9..7fd2b0ed670d642bb64c3797151218e7a9a37df9 100755 --- a/tools/cluster-merge +++ b/tools/cluster-merge @@ -308,7 +308,7 @@ class Merger(object): """ for data in self.merger_data: result = self._RunCmd(data.master_node, - "gnt-cluster deactivate-master-ip") + "gnt-cluster deactivate-master-ip --yes") if result.failed: raise errors.RemoteError("Unable to remove master IP on %s." 
diff --git a/tools/ganeti-listrunner b/tools/ganeti-listrunner index 13ab024271934431ffecc7c8c2b175d84855be36..566d9860aab4c28fc78e7406df1cbb8768d781b2 100755 --- a/tools/ganeti-listrunner +++ b/tools/ganeti-listrunner @@ -376,8 +376,10 @@ def HostWorker(logdir, username, password, use_agent, hostname, print " %s: uploading files" % hostname upload_dir = UploadFiles(connection, executable, filelist, logfile) - command = ("cd %s && ./%s %s" % - (upload_dir, os.path.basename(executable), exec_args)) + command = ("cd %s && ./%s" % + (upload_dir, os.path.basename(executable))) + if exec_args: + command += " %s" % exec_args print " %s: executing remote command" % hostname cmd_result = RunRemoteCommand(connection, command, logfile) if cmd_result is True: