From 874f61488693d79981723dcac778b5cef771c25b Mon Sep 17 00:00:00 2001 From: Michele Tartara Date: Fri, 7 Mar 2014 10:50:33 +0100 Subject: [PATCH] Enable a timeout for instance shutdown Add the timeout parameter to the StopInstance function of the hypervisor base class and to all its implementations. Also, change the tests as required by this change. Signed-off-by: Michele Tartara Reviewed-by: Klaus Aehlig --- lib/backend.py | 2 +- lib/hypervisor/hv_base.py | 7 +++++- lib/hypervisor/hv_chroot.py | 12 ++++++++-- lib/hypervisor/hv_fake.py | 5 +++- lib/hypervisor/hv_kvm.py | 11 ++++++--- lib/hypervisor/hv_lxc.py | 15 ++++++++---- lib/hypervisor/hv_xen.py | 24 +++++++++++++++----- test/py/ganeti.hypervisor.hv_xen_unittest.py | 13 ++++++++--- 8 files changed, 68 insertions(+), 21 deletions(-) diff --git a/lib/backend.py b/lib/backend.py index 98a29bede..03394edd7 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1394,7 +1394,7 @@ def InstanceShutdown(instance, timeout, reason, store_reason=True): return try: - hyper.StopInstance(instance, retry=self.tried_once) + hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout) if store_reason: _StoreInstReasonTrail(instance.name, reason) except errors.HypervisorError, err: diff --git a/lib/hypervisor/hv_base.py b/lib/hypervisor/hv_base.py index 3f620f226..4fd746e34 100644 --- a/lib/hypervisor/hv_base.py +++ b/lib/hypervisor/hv_base.py @@ -173,7 +173,8 @@ class BaseHypervisor(object): """Start an instance.""" raise NotImplementedError - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance @type instance: L{objects.Instance} @@ -186,6 +187,10 @@ class BaseHypervisor(object): @param name: if this parameter is passed, the the instance object should not be used (will be passed as None), and the shutdown must be done by name only + @type timeout: int or None + @param timeout: if the parameter is not None, a soft shutdown operation will + be killed after the specified number of seconds. A hard (forced) + shutdown cannot have a timeout """ raise NotImplementedError diff --git a/lib/hypervisor/hv_chroot.py b/lib/hypervisor/hv_chroot.py index 5d88447e1..3c77fc997 100644 --- a/lib/hypervisor/hv_chroot.py +++ b/lib/hypervisor/hv_chroot.py @@ -167,7 +167,8 @@ class ChrootManager(hv_base.BaseHypervisor): raise HypervisorError("Can't run the chroot start script: %s" % result.output) - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. This method has complicated cleanup tests, as we must: @@ -176,6 +177,8 @@ class ChrootManager(hv_base.BaseHypervisor): - finally unmount the instance dir """ + assert(timeout is None or force is not None) + if name is None: name = instance.name @@ -183,9 +186,14 @@ class ChrootManager(hv_base.BaseHypervisor): if not os.path.exists(root_dir) or not self._IsDirLive(root_dir): return + timeout_cmd = [] + if timeout is not None: + timeout_cmd.extend(["timeout", str(timeout)]) + # Run the chroot stop script only once if not retry and not force: - result = utils.RunCmd(["chroot", root_dir, "/ganeti-chroot", "stop"]) + result = utils.RunCmd(timeout_cmd.extend(["chroot", root_dir, + "/ganeti-chroot", "stop"])) if result.failed: raise HypervisorError("Can't run the chroot stop script: %s" % result.output) diff --git a/lib/hypervisor/hv_fake.py b/lib/hypervisor/hv_fake.py index c99e8225c..9f3946cf8 100644 --- a/lib/hypervisor/hv_fake.py +++ b/lib/hypervisor/hv_fake.py @@ -165,13 +165,16 @@ class FakeHypervisor(hv_base.BaseHypervisor): raise errors.HypervisorError("Failed to start instance %s: %s" % (instance.name, err)) - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. For the fake hypervisor, this just removes the file in the base dir, if it exist, otherwise we raise an exception. """ + assert(timeout is None or force is not None) + if name is None: name = instance.name if not self._IsAlive(name): diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py index 4a65c5839..5bba5d24f 100644 --- a/lib/hypervisor/hv_kvm.py +++ b/lib/hypervisor/hv_kvm.py @@ -1696,10 +1696,12 @@ class KVMHypervisor(hv_base.BaseHypervisor): # 500ms and likely more: socat can't detect the end of the reply and waits # for 500ms of no data received before exiting (500 ms is the default for # the "-t" parameter). - socat = ("echo %s | %s STDIO UNIX-CONNECT:%s" % + socat = ("echo %s | %s %s STDIO UNIX-CONNECT:%s" % (utils.ShellQuote(command), + timeout_cmd, constants.SOCAT_PATH, utils.ShellQuote(self._InstanceMonitor(instance_name)))) + result = utils.RunCmd(socat) if result.failed: msg = ("Failed to send command '%s' to instance '%s', reason '%s'," @@ -1776,10 +1778,13 @@ class KVMHypervisor(hv_base.BaseHypervisor): else: return "pc" - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. """ + assert(timeout is None or force is not None) + if name is not None and not force: raise errors.HypervisorError("Cannot shutdown cleanly by name only") if name is None: @@ -1792,7 +1797,7 @@ class KVMHypervisor(hv_base.BaseHypervisor): if force or not acpi: utils.KillProcess(pid) else: - self._CallMonitorCommand(name, "system_powerdown") + self._CallMonitorCommand(name, "system_powerdown", timeout) def CleanupInstance(self, instance_name): """Cleanup after a stopped instance diff --git a/lib/hypervisor/hv_lxc.py b/lib/hypervisor/hv_lxc.py index b7054f555..68029e924 100644 --- a/lib/hypervisor/hv_lxc.py +++ b/lib/hypervisor/hv_lxc.py @@ -325,7 +325,8 @@ class LXCHypervisor(hv_base.BaseHypervisor): raise HypervisorError("Running the lxc-start script failed: %s" % result.output) - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. This method has complicated cleanup tests, as we must: @@ -334,9 +335,15 @@ class LXCHypervisor(hv_base.BaseHypervisor): - finally unmount the instance dir """ + assert(timeout is None or force is not None) + if name is None: name = instance.name + timeout_cmd = [] + if timeout is not None: + timeout_cmd.extend(["timeout", str(timeout)]) + root_dir = self._InstanceDir(name) if not os.path.exists(root_dir): return @@ -349,7 +356,7 @@ class LXCHypervisor(hv_base.BaseHypervisor): raise HypervisorError("Running 'poweroff' on the instance" " failed: %s" % result.output) time.sleep(2) - result = utils.RunCmd(["lxc-stop", "-n", name]) + result = utils.RunCmd(timeout_cmd.extend(["lxc-stop", "-n", name])) if result.failed: logging.warning("Error while doing lxc-stop for %s: %s", name, result.output) @@ -358,12 +365,12 @@ class LXCHypervisor(hv_base.BaseHypervisor): return for mpath in self._GetMountSubdirs(root_dir): - result = utils.RunCmd(["umount", mpath]) + result = utils.RunCmd(timeout_cmd.extend(["umount", mpath])) if result.failed: logging.warning("Error while umounting subpath %s for instance %s: %s", mpath, name, result.output) - result = utils.RunCmd(["umount", root_dir]) + result = utils.RunCmd(timeout_cmd.extend(["umount", root_dir])) if result.failed and force: msg = ("Processes still alive in the chroot: %s" % utils.RunCmd("fuser -vm %s" % root_dir).output) diff --git a/lib/hypervisor/hv_xen.py b/lib/hypervisor/hv_xen.py index 88f71d175..90e9ecb4c 100644 --- a/lib/hypervisor/hv_xen.py +++ b/lib/hypervisor/hv_xen.py @@ -519,16 +519,21 @@ class XenHypervisor(hv_base.BaseHypervisor): (instance.name, result.fail_reason, result.output, stashed_config)) - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. + A soft shutdown can be interrupted. A hard shutdown tries forever. + """ + assert(timeout is None or force is not None) + if name is None: name = instance.name - return self._StopInstance(name, force) + return self._StopInstance(name, force, timeout) - def _ShutdownInstance(self, name): + def _ShutdownInstance(self, name, timeout): """Shutdown an instance if the instance is running. The '-w' flag waits for shutdown to complete which avoids the need @@ -537,6 +542,9 @@ class XenHypervisor(hv_base.BaseHypervisor): @type name: string @param name: name of the instance to stop + @type timeout: int or None + @param timeout: a timeout after which the shutdown command should be killed, + or None for no timeout """ instance_info = self.GetInstanceInfo(name) @@ -545,7 +553,7 @@ class XenHypervisor(hv_base.BaseHypervisor): logging.info("Failed to shutdown instance %s, not running", name) return None - return self._RunXen(["shutdown", "-w", name]) + return self._RunXen(["shutdown", "-w", name], timeout) def _DestroyInstance(self, name): """Destroy an instance if the instance if the instance exists. @@ -562,7 +570,7 @@ class XenHypervisor(hv_base.BaseHypervisor): return self._RunXen(["destroy", name]) - def _StopInstance(self, name, force): + def _StopInstance(self, name, force, timeout): """Stop an instance. @type name: string @@ -571,11 +579,15 @@ class XenHypervisor(hv_base.BaseHypervisor): @type force: boolean @param force: whether to do a "hard" stop (destroy) + @type timeout: int or None + @param timeout: a timeout after which the shutdown command should be killed, + or None for no timeout + """ if force: result = self._DestroyInstance(name) else: - self._ShutdownInstance(name) + self._ShutdownInstance(name, timeout) result = self._DestroyInstance(name) if result is not None and result.failed and \ diff --git a/test/py/ganeti.hypervisor.hv_xen_unittest.py b/test/py/ganeti.hypervisor.hv_xen_unittest.py index 15e4391a2..0b94ceb6f 100755 --- a/test/py/ganeti.hypervisor.hv_xen_unittest.py +++ b/test/py/ganeti.hypervisor.hv_xen_unittest.py @@ -555,7 +555,14 @@ class _TestXenHypervisor(object): extra = inst.hvparams[constants.HV_KERNEL_ARGS] self.assertTrue(("extra = '%s'" % extra) in lines) - def _StopInstanceCommand(self, instance_name, force, fail, cmd): + def _StopInstanceCommand(self, instance_name, force, fail, full_cmd): + # Remove the timeout (and its number of seconds) if it's there + if full_cmd[:1][0] == "timeout": + cmd = full_cmd[2:] + else: + cmd = full_cmd + + # Test the actual command if (cmd == [self.CMD, "list"]): output = "Name ID Mem VCPUs State Time(s)\n" \ "Domain-0 0 1023 1 r----- 142691.0\n" \ @@ -592,7 +599,7 @@ class _TestXenHypervisor(object): if fail: try: - hv._StopInstance(name, force) + hv._StopInstance(name, force, constants.DEFAULT_SHUTDOWN_TIMEOUT) except errors.HypervisorError, err: self.assertTrue(str(err).startswith("xm list failed"), msg=str(err)) @@ -602,7 +609,7 @@ class _TestXenHypervisor(object): msg=("Configuration was removed when stopping" " instance failed")) else: - hv._StopInstance(name, force) + hv._StopInstance(name, force, constants.DEFAULT_SHUTDOWN_TIMEOUT) self.assertFalse(os.path.exists(cfgfile)) def _MigrateNonRunningInstCmd(self, cmd): -- GitLab