Add an optional shutdown timeout to the hypervisor StopInstance methods,
treat a RAPI "401 Unauthorized" answer as "RAPI is responding" in the
watcher, and let the QA configuration be modified by an optional JSON Patch
file.

Review notes for this revision of the patch:
  * list.extend() returns None, so the chroot and LXC backends now build the
    command as "timeout_cmd + [...]" instead of passing the result of
    timeout_cmd.extend([...]) to utils.RunCmd.
  * The StopInstance assertions now check "timeout is None or not force";
    the previous "force is not None" test could never fail.
  * XenHypervisor._ShutdownInstance gains the "timeout" parameter that its
    body and its caller already use.
  * The Xen unit test no longer indexes into a possibly empty command list.
  * Indentation and blank lines were reconstructed from the hunk line
    counts; the "index" hashes are unchanged from the original patch.

diff --git a/lib/backend.py b/lib/backend.py
index 685c09d942360c31ab30836ffd96d39064b71b23..2fb52983e17693866da184e97afa5e58be554700 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1706,7 +1706,7 @@ def InstanceShutdown(instance, timeout, reason, store_reason=True):
         return
 
       try:
-        hyper.StopInstance(instance, retry=self.tried_once)
+        hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
         if store_reason:
           _StoreInstReasonTrail(instance.name, reason)
       except errors.HypervisorError, err:
diff --git a/lib/hypervisor/hv_base.py b/lib/hypervisor/hv_base.py
index 64ca463e9c202e955028396ee6008b6080749625..fe69ab7981b9e33235f0529b229f256cd515720a 100644
--- a/lib/hypervisor/hv_base.py
+++ b/lib/hypervisor/hv_base.py
@@ -173,7 +173,8 @@ class BaseHypervisor(object):
     """Start an instance."""
     raise NotImplementedError
 
-  def StopInstance(self, instance, force=False, retry=False, name=None):
+  def StopInstance(self, instance, force=False, retry=False, name=None,
+                   timeout=None):
     """Stop an instance
 
     @type instance: L{objects.Instance}
@@ -186,6 +187,10 @@ class BaseHypervisor(object):
-    @param name: if this parameter is passed, the the instance object
+    @param name: if this parameter is passed, the instance object
         should not be used (will be passed as None), and the shutdown
         must be done by name only
+    @type timeout: int or None
+    @param timeout: if the parameter is not None, a soft shutdown operation will
+        be killed after the specified number of seconds. A hard (forced)
+        shutdown cannot have a timeout
 
     """
     raise NotImplementedError
diff --git a/lib/hypervisor/hv_chroot.py b/lib/hypervisor/hv_chroot.py
index bfdedbac4ab60cafa25dca39965594a9c7e175da..ea50233d90253d3d41caae6b5056f81cec88b06d 100644
--- a/lib/hypervisor/hv_chroot.py
+++ b/lib/hypervisor/hv_chroot.py
@@ -171,7 +171,8 @@ class ChrootManager(hv_base.BaseHypervisor):
       raise HypervisorError("Can't run the chroot start script: %s" %
                             result.output)
 
-  def StopInstance(self, instance, force=False, retry=False, name=None):
+  def StopInstance(self, instance, force=False, retry=False, name=None,
+                   timeout=None):
     """Stop an instance.
 
     This method has complicated cleanup tests, as we must:
@@ -180,6 +181,8 @@ class ChrootManager(hv_base.BaseHypervisor):
       - finally unmount the instance dir
 
     """
+    assert(timeout is None or not force)
+
     if name is None:
       name = instance.name
 
@@ -187,9 +190,14 @@ class ChrootManager(hv_base.BaseHypervisor):
     if not os.path.exists(root_dir) or not self._IsDirLive(root_dir):
       return
 
+    timeout_cmd = []
+    if timeout is not None:
+      timeout_cmd.extend(["timeout", str(timeout)])
+
     # Run the chroot stop script only once
     if not retry and not force:
-      result = utils.RunCmd(["chroot", root_dir, "/ganeti-chroot", "stop"])
+      result = utils.RunCmd(timeout_cmd + ["chroot", root_dir,
+                                           "/ganeti-chroot", "stop"])
       if result.failed:
         raise HypervisorError("Can't run the chroot stop script: %s" %
                               result.output)
diff --git a/lib/hypervisor/hv_fake.py b/lib/hypervisor/hv_fake.py
index b5abf7fdd07577754467ebf31e7d7eb1b9b26ddc..7123787e6b599ad00d843ddb03ba61ed7627b6bc 100644
--- a/lib/hypervisor/hv_fake.py
+++ b/lib/hypervisor/hv_fake.py
@@ -174,13 +174,16 @@ class FakeHypervisor(hv_base.BaseHypervisor):
       raise errors.HypervisorError("Failed to start instance %s: %s" %
                                    (instance.name, err))
 
-  def StopInstance(self, instance, force=False, retry=False, name=None):
+  def StopInstance(self, instance, force=False, retry=False, name=None,
+                   timeout=None):
     """Stop an instance.
 
     For the fake hypervisor, this just removes the file in the base dir,
     if it exist, otherwise we raise an exception.
 
     """
+    assert(timeout is None or not force)
+
     if name is None:
       name = instance.name
     if not self._IsAlive(name):
diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py
index 4a4c60d3c68ea1a8086213d7734eb84bf43b53b6..7af296ef1f017058659fe336f5c3dc3cb1491ece 100644
--- a/lib/hypervisor/hv_kvm.py
+++ b/lib/hypervisor/hv_kvm.py
@@ -1703,10 +1703,15 @@ class KVMHypervisor(hv_base.BaseHypervisor):
     self._SaveKVMRuntime(instance, kvm_runtime)
     self._ExecuteKVMRuntime(instance, kvm_runtime, kvmhelp)
 
-  def _CallMonitorCommand(self, instance_name, command):
+  def _CallMonitorCommand(self, instance_name, command, timeout=None):
     """Invoke a command on the instance monitor.
 
     """
+    if timeout is not None:
+      timeout_cmd = "timeout %s" % (timeout, )
+    else:
+      timeout_cmd = ""
+
     # TODO: Replace monitor calls with QMP once KVM >= 0.14 is the minimum
     # version. The monitor protocol is designed for human consumption, whereas
     # QMP is made for programmatic usage. In the worst case QMP can also
@@ -1714,10 +1719,12 @@ class KVMHypervisor(hv_base.BaseHypervisor):
     # 500ms and likely more: socat can't detect the end of the reply and waits
     # for 500ms of no data received before exiting (500 ms is the default for
     # the "-t" parameter).
-    socat = ("echo %s | %s STDIO UNIX-CONNECT:%s" %
+    socat = ("echo %s | %s %s STDIO UNIX-CONNECT:%s" %
              (utils.ShellQuote(command),
+              timeout_cmd,
               constants.SOCAT_PATH,
               utils.ShellQuote(self._InstanceMonitor(instance_name))))
+
     result = utils.RunCmd(socat)
     if result.failed:
       msg = ("Failed to send command '%s' to instance '%s', reason '%s',"
@@ -1794,10 +1801,13 @@ class KVMHypervisor(hv_base.BaseHypervisor):
     else:
       return "pc"
 
-  def StopInstance(self, instance, force=False, retry=False, name=None):
+  def StopInstance(self, instance, force=False, retry=False, name=None,
+                   timeout=None):
     """Stop an instance.
 
     """
+    assert(timeout is None or not force)
+
     if name is not None and not force:
       raise errors.HypervisorError("Cannot shutdown cleanly by name only")
     if name is None:
@@ -1810,7 +1820,7 @@ class KVMHypervisor(hv_base.BaseHypervisor):
       if force or not acpi:
         utils.KillProcess(pid)
       else:
-        self._CallMonitorCommand(name, "system_powerdown")
+        self._CallMonitorCommand(name, "system_powerdown", timeout)
 
   def CleanupInstance(self, instance_name):
     """Cleanup after a stopped instance
diff --git a/lib/hypervisor/hv_lxc.py b/lib/hypervisor/hv_lxc.py
index e3f3d1e403b9d1a9b9bb7b38f459d8a58eaaf2f4..7d965ce1d5ddbb18417b7b67711f8e78f0a1282a 100644
--- a/lib/hypervisor/hv_lxc.py
+++ b/lib/hypervisor/hv_lxc.py
@@ -329,7 +329,8 @@ class LXCHypervisor(hv_base.BaseHypervisor):
       raise HypervisorError("Running the lxc-start script failed: %s" %
                             result.output)
 
-  def StopInstance(self, instance, force=False, retry=False, name=None):
+  def StopInstance(self, instance, force=False, retry=False, name=None,
+                   timeout=None):
     """Stop an instance.
 
     This method has complicated cleanup tests, as we must:
@@ -338,9 +339,15 @@ class LXCHypervisor(hv_base.BaseHypervisor):
       - finally unmount the instance dir
 
     """
+    assert(timeout is None or not force)
+
     if name is None:
       name = instance.name
 
+    timeout_cmd = []
+    if timeout is not None:
+      timeout_cmd.extend(["timeout", str(timeout)])
+
     root_dir = self._InstanceDir(name)
     if not os.path.exists(root_dir):
       return
@@ -353,7 +360,7 @@ class LXCHypervisor(hv_base.BaseHypervisor):
           raise HypervisorError("Running 'poweroff' on the instance"
                                 " failed: %s" % result.output)
       time.sleep(2)
-      result = utils.RunCmd(["lxc-stop", "-n", name])
+      result = utils.RunCmd(timeout_cmd + ["lxc-stop", "-n", name])
       if result.failed:
         logging.warning("Error while doing lxc-stop for %s: %s", name,
                         result.output)
@@ -362,12 +369,12 @@ class LXCHypervisor(hv_base.BaseHypervisor):
       return
 
     for mpath in self._GetMountSubdirs(root_dir):
-      result = utils.RunCmd(["umount", mpath])
+      result = utils.RunCmd(timeout_cmd + ["umount", mpath])
       if result.failed:
         logging.warning("Error while umounting subpath %s for instance %s: %s",
                         mpath, name, result.output)
 
-    result = utils.RunCmd(["umount", root_dir])
+    result = utils.RunCmd(timeout_cmd + ["umount", root_dir])
     if result.failed and force:
       msg = ("Processes still alive in the chroot: %s" %
              utils.RunCmd("fuser -vm %s" % root_dir).output)
diff --git a/lib/hypervisor/hv_xen.py b/lib/hypervisor/hv_xen.py
index 95ed49f278f7c0bf96f33f4de3e9508e34fbc26e..443efc6c123fd65636dd18b0b226596d7d89a46b 100644
--- a/lib/hypervisor/hv_xen.py
+++ b/lib/hypervisor/hv_xen.py
@@ -383,15 +383,23 @@ class XenHypervisor(hv_base.BaseHypervisor):
     return cmd
 
-  def _RunXen(self, args, hvparams):
+  def _RunXen(self, args, hvparams, timeout=None):
     """Wrapper around L{utils.process.RunCmd} to run Xen command.
 
     @type hvparams: dict of strings
     @param hvparams: dictionary of hypervisor params
+    @type timeout: int or None
+    @param timeout: if a timeout (in seconds) is specified, the command will be
+                    terminated after that number of seconds.
     @see: L{utils.process.RunCmd}
 
     """
-    cmd = [self._GetCommand(hvparams)]
+    cmd = []
+
+    if timeout is not None:
+      cmd.extend(["timeout", str(timeout)])
+
+    cmd.extend([self._GetCommand(hvparams)])
     cmd.extend(args)
 
     return self._run_cmd_fn(cmd)
 
@@ -604,26 +612,34 @@ class XenHypervisor(hv_base.BaseHypervisor):
                                    (instance.name, result.fail_reason, result.output, stashed_config))
 
-  def StopInstance(self, instance, force=False, retry=False, name=None):
+  def StopInstance(self, instance, force=False, retry=False, name=None,
+                   timeout=None):
     """Stop an instance.
 
+    A soft shutdown can be interrupted. A hard shutdown tries forever.
+
     """
+    assert(timeout is None or not force)
+
     if name is None:
       name = instance.name
 
-    return self._StopInstance(name, force, instance.hvparams)
+    return self._StopInstance(name, force, instance.hvparams, timeout)
 
-  def _ShutdownInstance(self, name, hvparams):
+  def _ShutdownInstance(self, name, hvparams, timeout=None):
     """Shutdown an instance if the instance is running.
 
+    The '-w' flag waits for shutdown to complete which avoids the need
+    to poll in the case where we want to destroy the domain
+    immediately after shutdown.
+
     @type name: string
     @param name: name of the instance to stop
     @type hvparams: dict of string
     @param hvparams: hypervisor parameters of the instance
-
-    The '-w' flag waits for shutdown to complete which avoids the need
-    to poll in the case where we want to destroy the domain
-    immediately after shutdown.
+    @type timeout: int or None
+    @param timeout: a timeout after which the shutdown command should be killed,
+                    or None for no timeout
 
     """
     instance_info = self.GetInstanceInfo(name, hvparams=hvparams)
 
@@ -632,7 +648,7 @@ class XenHypervisor(hv_base.BaseHypervisor):
       logging.info("Failed to shutdown instance %s, not running", name)
       return None
 
-    return self._RunXen(["shutdown", "-w", name], hvparams)
+    return self._RunXen(["shutdown", "-w", name], hvparams, timeout)
 
   def _DestroyInstance(self, name, hvparams):
     """Destroy an instance if the instance if the instance exists.
@@ -651,7 +667,7 @@ class XenHypervisor(hv_base.BaseHypervisor):
 
     return self._RunXen(["destroy", name], hvparams)
 
-  def _StopInstance(self, name, force, hvparams):
+  def _StopInstance(self, name, force, hvparams, timeout):
     """Stop an instance.
 
     @type name: string
@@ -663,11 +679,15 @@ class XenHypervisor(hv_base.BaseHypervisor):
     @type hvparams: dict of string
     @param hvparams: hypervisor parameters of the instance
 
+    @type timeout: int or None
+    @param timeout: a timeout after which the shutdown command should be killed,
+                    or None for no timeout
+
     """
     if force:
       result = self._DestroyInstance(name, hvparams)
     else:
-      self._ShutdownInstance(name, hvparams)
+      self._ShutdownInstance(name, hvparams, timeout)
       result = self._DestroyInstance(name, hvparams)
 
     if result is not None and result.failed and \
diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py
index 1f7ec6556a002591be7b74295c521321a52d4399..4c2d019310dcf4b60863d7befbb167b761e05c50 100644
--- a/lib/watcher/__init__.py
+++ b/lib/watcher/__init__.py
@@ -329,6 +329,10 @@ def IsRapiResponding(hostname):
   Connects to RAPI port of hostname and does a simple test. At this time, the
   test is GetVersion.
 
+  If RAPI responds with error code "401 Unauthorized", the test is successful,
+  because the aim of this function is to assess whether RAPI is responding, not
+  if it is accessible.
+
   @type hostname: string
   @param hostname: hostname of the node to connect to.
   @rtype: bool
@@ -344,8 +348,12 @@ def IsRapiResponding(hostname):
     logging.warning("RAPI certificate error: %s", err)
     return False
   except rapi.client.GanetiApiError, err:
-    logging.warning("RAPI error: %s", err)
-    return False
+    if err.code == 401:
+      # Unauthorized, but RAPI is alive and responding
+      return True
+    else:
+      logging.warning("RAPI error: %s", err)
+      return False
   else:
     logging.debug("Reported RAPI version %s", master_version)
     return master_version == constants.RAPI_VERSION
diff --git a/qa/qa-patch.json b/qa/qa-patch.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe51488c7066f6687ef680d6bfaa4f7768ef205c
--- /dev/null
+++ b/qa/qa-patch.json
@@ -0,0 +1 @@
+[]
diff --git a/qa/qa_config.py b/qa/qa_config.py
index 97ccaa77b507fb247097668edf5790050e168310..a6f3097c2645d34e33dc1dca3926bd04f6ac5cdd 100644
--- a/qa/qa_config.py
+++ b/qa/qa_config.py
@@ -40,6 +40,10 @@ _VCLUSTER_MASTER_KEY = "vcluster-master"
 _VCLUSTER_BASEDIR_KEY = "vcluster-basedir"
 _ENABLED_DISK_TEMPLATES_KEY = "enabled-disk-templates"
 
+# The path of an optional JSON Patch file (as per RFC6902) that modifies QA's
+# configuration.
+_PATCH_JSON = os.path.join(os.path.dirname(__file__), "qa-patch.json")
+
 #: QA configuration (L{_QaConfig})
 _config = None
 
@@ -261,6 +265,20 @@ class _QaConfig(object):
     """
     data = serializer.LoadJson(utils.ReadFile(filename))
 
+    # Patch the document using JSON Patch (RFC6902) in file _PATCH_JSON, if
+    # available
+    try:
+      patch = serializer.LoadJson(utils.ReadFile(_PATCH_JSON))
+      if patch:
+        mod = __import__("jsonpatch", fromlist=[])
+        data = mod.apply_patch(data, patch)
+    except IOError:
+      pass
+    except ImportError:
+      raise qa_error.Error("If you want to use the QA JSON patching feature,"
+                           " you need to install Python modules"
+                           " 'jsonpatch' and 'jsonpointer'.")
+
     result = cls(dict(map(_ConvertResources,
                           data.items()))) # pylint: disable=E1103
     result.Validate()
diff --git a/test/py/ganeti.hypervisor.hv_xen_unittest.py b/test/py/ganeti.hypervisor.hv_xen_unittest.py
index b3d0e06314e06bbfb1184497c5173731120f162c..767eae83f7a3a49e0cfd6cd9188ef5d3cb8e8f6f 100755
--- a/test/py/ganeti.hypervisor.hv_xen_unittest.py
+++ b/test/py/ganeti.hypervisor.hv_xen_unittest.py
@@ -730,7 +730,14 @@ class _TestXenHypervisor(object):
       extra = inst.hvparams[constants.HV_KERNEL_ARGS]
       self.assertTrue(("extra = '%s'" % extra) in lines)
 
-  def _StopInstanceCommand(self, instance_name, force, fail, cmd):
+  def _StopInstanceCommand(self, instance_name, force, fail, full_cmd):
+    # Remove the timeout (and its number of seconds) if it's there
+    if full_cmd[:1] == ["timeout"]:
+      cmd = full_cmd[2:]
+    else:
+      cmd = full_cmd
+
+    # Test the actual command
     if (cmd == [self.CMD, "list"]):
       output = "Name ID Mem VCPUs State Time(s)\n" \
         "Domain-0 0 1023 1 r----- 142691.0\n" \
@@ -767,7 +774,8 @@ class _TestXenHypervisor(object):
 
         if fail:
           try:
-            hv._StopInstance(name, force, None)
+            hv._StopInstance(name, force, None,
+                             constants.DEFAULT_SHUTDOWN_TIMEOUT)
           except errors.HypervisorError, err:
             self.assertTrue(str(err).startswith("listing instances failed"),
                             msg=str(err))
@@ -777,7 +785,8 @@ class _TestXenHypervisor(object):
                            msg=("Configuration was removed when stopping"
                                 " instance failed"))
         else:
-          hv._StopInstance(name, force, None)
+          hv._StopInstance(name, force, None,
+                           constants.DEFAULT_SHUTDOWN_TIMEOUT)
           self.assertFalse(os.path.exists(cfgfile))
 
   def _MigrateNonRunningInstCmd(self, cmd):