diff --git a/lib/backend.py b/lib/backend.py index 98a29bedec53b33c2e332d24df77e198ed050453..03394edd7bfd595f0c1a209a625abab1fcd650ed 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1394,7 +1394,7 @@ def InstanceShutdown(instance, timeout, reason, store_reason=True): return try: - hyper.StopInstance(instance, retry=self.tried_once) + hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout) if store_reason: _StoreInstReasonTrail(instance.name, reason) except errors.HypervisorError, err: diff --git a/lib/hypervisor/hv_base.py b/lib/hypervisor/hv_base.py index 3f620f226e29fa57125050551d2a3e977be40e66..4fd746e34566fc8c1406bb17e98a1fd8af67ee70 100644 --- a/lib/hypervisor/hv_base.py +++ b/lib/hypervisor/hv_base.py @@ -173,7 +173,8 @@ class BaseHypervisor(object): """Start an instance.""" raise NotImplementedError - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance @type instance: L{objects.Instance} @@ -186,6 +187,10 @@ class BaseHypervisor(object): @param name: if this parameter is passed, the the instance object should not be used (will be passed as None), and the shutdown must be done by name only + @type timeout: int or None + @param timeout: if the parameter is not None, a soft shutdown operation will + be killed after the specified number of seconds. A hard (forced) + shutdown cannot have a timeout """ raise NotImplementedError diff --git a/lib/hypervisor/hv_chroot.py b/lib/hypervisor/hv_chroot.py index 5d88447e1b769450a312f044e408e5246a8eb39b..3c77fc997f763a378ca97c4c4345120ff5672bfc 100644 --- a/lib/hypervisor/hv_chroot.py +++ b/lib/hypervisor/hv_chroot.py @@ -167,7 +167,8 @@ class ChrootManager(hv_base.BaseHypervisor): raise HypervisorError("Can't run the chroot start script: %s" % result.output) - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. This method has complicated cleanup tests, as we must: @@ -176,6 +177,8 @@ class ChrootManager(hv_base.BaseHypervisor): - finally unmount the instance dir """ + assert(timeout is None or force is not None) + if name is None: name = instance.name @@ -183,9 +186,14 @@ class ChrootManager(hv_base.BaseHypervisor): if not os.path.exists(root_dir) or not self._IsDirLive(root_dir): return + timeout_cmd = [] + if timeout is not None: + timeout_cmd.extend(["timeout", str(timeout)]) + # Run the chroot stop script only once if not retry and not force: - result = utils.RunCmd(["chroot", root_dir, "/ganeti-chroot", "stop"]) + result = utils.RunCmd(timeout_cmd.extend(["chroot", root_dir, + "/ganeti-chroot", "stop"])) if result.failed: raise HypervisorError("Can't run the chroot stop script: %s" % result.output) diff --git a/lib/hypervisor/hv_fake.py b/lib/hypervisor/hv_fake.py index c99e8225c3ccf3a6ad44acbede6f2d4f7eeba3e6..9f3946cf83835c6f7cf3681cc4580843ef03c948 100644 --- a/lib/hypervisor/hv_fake.py +++ b/lib/hypervisor/hv_fake.py @@ -165,13 +165,16 @@ class FakeHypervisor(hv_base.BaseHypervisor): raise errors.HypervisorError("Failed to start instance %s: %s" % (instance.name, err)) - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. For the fake hypervisor, this just removes the file in the base dir, if it exist, otherwise we raise an exception. """ + assert(timeout is None or force is not None) + if name is None: name = instance.name if not self._IsAlive(name): diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py index 4a65c5839e22d915f1e5e87e38d9dea7b486ea48..5bba5d24f7dc2a3bdf0a0521a9abecc165fbc17e 100644 --- a/lib/hypervisor/hv_kvm.py +++ b/lib/hypervisor/hv_kvm.py @@ -1696,10 +1696,12 @@ class KVMHypervisor(hv_base.BaseHypervisor): # 500ms and likely more: socat can't detect the end of the reply and waits # for 500ms of no data received before exiting (500 ms is the default for # the "-t" parameter). - socat = ("echo %s | %s STDIO UNIX-CONNECT:%s" % + socat = ("echo %s | %s %s STDIO UNIX-CONNECT:%s" % (utils.ShellQuote(command), + timeout_cmd, constants.SOCAT_PATH, utils.ShellQuote(self._InstanceMonitor(instance_name)))) + result = utils.RunCmd(socat) if result.failed: msg = ("Failed to send command '%s' to instance '%s', reason '%s'," @@ -1776,10 +1778,13 @@ class KVMHypervisor(hv_base.BaseHypervisor): else: return "pc" - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. """ + assert(timeout is None or force is not None) + if name is not None and not force: raise errors.HypervisorError("Cannot shutdown cleanly by name only") if name is None: @@ -1792,7 +1797,7 @@ class KVMHypervisor(hv_base.BaseHypervisor): if force or not acpi: utils.KillProcess(pid) else: - self._CallMonitorCommand(name, "system_powerdown") + self._CallMonitorCommand(name, "system_powerdown", timeout) def CleanupInstance(self, instance_name): """Cleanup after a stopped instance diff --git a/lib/hypervisor/hv_lxc.py b/lib/hypervisor/hv_lxc.py index b7054f555bc04d14ec7ce928053dfb2305de9037..68029e924a69dbcab22ccfd1f2adaff7238652ce 100644 --- a/lib/hypervisor/hv_lxc.py +++ b/lib/hypervisor/hv_lxc.py @@ -325,7 +325,8 @@ class LXCHypervisor(hv_base.BaseHypervisor): raise HypervisorError("Running the lxc-start script failed: %s" % result.output) - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. This method has complicated cleanup tests, as we must: @@ -334,9 +335,15 @@ class LXCHypervisor(hv_base.BaseHypervisor): - finally unmount the instance dir """ + assert(timeout is None or force is not None) + if name is None: name = instance.name + timeout_cmd = [] + if timeout is not None: + timeout_cmd.extend(["timeout", str(timeout)]) + root_dir = self._InstanceDir(name) if not os.path.exists(root_dir): return @@ -349,7 +356,7 @@ class LXCHypervisor(hv_base.BaseHypervisor): raise HypervisorError("Running 'poweroff' on the instance" " failed: %s" % result.output) time.sleep(2) - result = utils.RunCmd(["lxc-stop", "-n", name]) + result = utils.RunCmd(timeout_cmd.extend(["lxc-stop", "-n", name])) if result.failed: logging.warning("Error while doing lxc-stop for %s: %s", name, result.output) @@ -358,12 +365,12 @@ class LXCHypervisor(hv_base.BaseHypervisor): return for mpath in self._GetMountSubdirs(root_dir): - result = utils.RunCmd(["umount", mpath]) + result = utils.RunCmd(timeout_cmd.extend(["umount", mpath])) if result.failed: logging.warning("Error while umounting subpath %s for instance %s: %s", mpath, name, result.output) - result = utils.RunCmd(["umount", root_dir]) + result = utils.RunCmd(timeout_cmd.extend(["umount", root_dir])) if result.failed and force: msg = ("Processes still alive in the chroot: %s" % utils.RunCmd("fuser -vm %s" % root_dir).output) diff --git a/lib/hypervisor/hv_xen.py b/lib/hypervisor/hv_xen.py index 88f71d175ba5584016eea7b68cc6ef3fded02d69..90e9ecb4cf92ffcb9bad7a2ff6b9c2625b09ce8a 100644 --- a/lib/hypervisor/hv_xen.py +++ b/lib/hypervisor/hv_xen.py @@ -519,16 +519,21 @@ class XenHypervisor(hv_base.BaseHypervisor): (instance.name, result.fail_reason, result.output, stashed_config)) - def StopInstance(self, instance, force=False, retry=False, name=None): + def StopInstance(self, instance, force=False, retry=False, name=None, + timeout=None): """Stop an instance. + A soft shutdown can be interrupted. A hard shutdown tries forever. + """ + assert(timeout is None or force is not None) + if name is None: name = instance.name - return self._StopInstance(name, force) + return self._StopInstance(name, force, timeout) - def _ShutdownInstance(self, name): + def _ShutdownInstance(self, name, timeout): """Shutdown an instance if the instance is running. The '-w' flag waits for shutdown to complete which avoids the need @@ -537,6 +542,9 @@ class XenHypervisor(hv_base.BaseHypervisor): @type name: string @param name: name of the instance to stop + @type timeout: int or None + @param timeout: a timeout after which the shutdown command should be killed, + or None for no timeout """ instance_info = self.GetInstanceInfo(name) @@ -545,7 +553,7 @@ class XenHypervisor(hv_base.BaseHypervisor): logging.info("Failed to shutdown instance %s, not running", name) return None - return self._RunXen(["shutdown", "-w", name]) + return self._RunXen(["shutdown", "-w", name], timeout) def _DestroyInstance(self, name): """Destroy an instance if the instance if the instance exists. @@ -562,7 +570,7 @@ class XenHypervisor(hv_base.BaseHypervisor): return self._RunXen(["destroy", name]) - def _StopInstance(self, name, force): + def _StopInstance(self, name, force, timeout): """Stop an instance. @type name: string @@ -571,11 +579,15 @@ class XenHypervisor(hv_base.BaseHypervisor): @type force: boolean @param force: whether to do a "hard" stop (destroy) + @type timeout: int or None + @param timeout: a timeout after which the shutdown command should be killed, + or None for no timeout + """ if force: result = self._DestroyInstance(name) else: - self._ShutdownInstance(name) + self._ShutdownInstance(name, timeout) result = self._DestroyInstance(name) if result is not None and result.failed and \ diff --git a/test/py/ganeti.hypervisor.hv_xen_unittest.py b/test/py/ganeti.hypervisor.hv_xen_unittest.py index 15e4391a2540e9067d3f646ac8870a86f90ce268..0b94ceb6fd170dc533266692321688c68b933e03 100755 --- a/test/py/ganeti.hypervisor.hv_xen_unittest.py +++ b/test/py/ganeti.hypervisor.hv_xen_unittest.py @@ -555,7 +555,14 @@ class _TestXenHypervisor(object): extra = inst.hvparams[constants.HV_KERNEL_ARGS] self.assertTrue(("extra = '%s'" % extra) in lines) - def _StopInstanceCommand(self, instance_name, force, fail, cmd): + def _StopInstanceCommand(self, instance_name, force, fail, full_cmd): + # Remove the timeout (and its number of seconds) if it's there + if full_cmd[:1][0] == "timeout": + cmd = full_cmd[2:] + else: + cmd = full_cmd + + # Test the actual command if (cmd == [self.CMD, "list"]): output = "Name ID Mem VCPUs State Time(s)\n" \ "Domain-0 0 1023 1 r----- 142691.0\n" \ @@ -592,7 +599,7 @@ class _TestXenHypervisor(object): if fail: try: - hv._StopInstance(name, force) + hv._StopInstance(name, force, constants.DEFAULT_SHUTDOWN_TIMEOUT) except errors.HypervisorError, err: self.assertTrue(str(err).startswith("xm list failed"), msg=str(err)) @@ -602,7 +609,7 @@ class _TestXenHypervisor(object): msg=("Configuration was removed when stopping" " instance failed")) else: - hv._StopInstance(name, force) + hv._StopInstance(name, force, constants.DEFAULT_SHUTDOWN_TIMEOUT) self.assertFalse(os.path.exists(cfgfile)) def _MigrateNonRunningInstCmd(self, cmd):