Commit 874f6148 authored by Michele Tartara's avatar Michele Tartara

Enable a timeout for instance shutdown

Add the timeout parameter to the StopInstance function of the hypervisor base
class and to all its implementations.

Also, change the tests as required by this change.
Signed-off-by: default avatarMichele Tartara <mtartara@google.com>
Reviewed-by: default avatarKlaus Aehlig <aehlig@google.com>
parent 2a2d087a
...@@ -1394,7 +1394,7 @@ def InstanceShutdown(instance, timeout, reason, store_reason=True): ...@@ -1394,7 +1394,7 @@ def InstanceShutdown(instance, timeout, reason, store_reason=True):
return return
try: try:
hyper.StopInstance(instance, retry=self.tried_once) hyper.StopInstance(instance, retry=self.tried_once, timeout=timeout)
if store_reason: if store_reason:
_StoreInstReasonTrail(instance.name, reason) _StoreInstReasonTrail(instance.name, reason)
except errors.HypervisorError, err: except errors.HypervisorError, err:
......
...@@ -173,7 +173,8 @@ class BaseHypervisor(object): ...@@ -173,7 +173,8 @@ class BaseHypervisor(object):
"""Start an instance.""" """Start an instance."""
raise NotImplementedError raise NotImplementedError
def StopInstance(self, instance, force=False, retry=False, name=None): def StopInstance(self, instance, force=False, retry=False, name=None,
timeout=None):
"""Stop an instance """Stop an instance
@type instance: L{objects.Instance} @type instance: L{objects.Instance}
...@@ -186,6 +187,10 @@ class BaseHypervisor(object): ...@@ -186,6 +187,10 @@ class BaseHypervisor(object):
@param name: if this parameter is passed, the the instance object @param name: if this parameter is passed, the the instance object
should not be used (will be passed as None), and the shutdown should not be used (will be passed as None), and the shutdown
must be done by name only must be done by name only
@type timeout: int or None
@param timeout: if the parameter is not None, a soft shutdown operation will
be killed after the specified number of seconds. A hard (forced)
shutdown cannot have a timeout
""" """
raise NotImplementedError raise NotImplementedError
......
...@@ -167,7 +167,8 @@ class ChrootManager(hv_base.BaseHypervisor): ...@@ -167,7 +167,8 @@ class ChrootManager(hv_base.BaseHypervisor):
raise HypervisorError("Can't run the chroot start script: %s" % raise HypervisorError("Can't run the chroot start script: %s" %
result.output) result.output)
def StopInstance(self, instance, force=False, retry=False, name=None): def StopInstance(self, instance, force=False, retry=False, name=None,
timeout=None):
"""Stop an instance. """Stop an instance.
This method has complicated cleanup tests, as we must: This method has complicated cleanup tests, as we must:
...@@ -176,6 +177,8 @@ class ChrootManager(hv_base.BaseHypervisor): ...@@ -176,6 +177,8 @@ class ChrootManager(hv_base.BaseHypervisor):
- finally unmount the instance dir - finally unmount the instance dir
""" """
assert(timeout is None or force is not None)
if name is None: if name is None:
name = instance.name name = instance.name
...@@ -183,9 +186,14 @@ class ChrootManager(hv_base.BaseHypervisor): ...@@ -183,9 +186,14 @@ class ChrootManager(hv_base.BaseHypervisor):
if not os.path.exists(root_dir) or not self._IsDirLive(root_dir): if not os.path.exists(root_dir) or not self._IsDirLive(root_dir):
return return
timeout_cmd = []
if timeout is not None:
timeout_cmd.extend(["timeout", str(timeout)])
# Run the chroot stop script only once # Run the chroot stop script only once
if not retry and not force: if not retry and not force:
result = utils.RunCmd(["chroot", root_dir, "/ganeti-chroot", "stop"]) result = utils.RunCmd(timeout_cmd.extend(["chroot", root_dir,
"/ganeti-chroot", "stop"]))
if result.failed: if result.failed:
raise HypervisorError("Can't run the chroot stop script: %s" % raise HypervisorError("Can't run the chroot stop script: %s" %
result.output) result.output)
......
...@@ -165,13 +165,16 @@ class FakeHypervisor(hv_base.BaseHypervisor): ...@@ -165,13 +165,16 @@ class FakeHypervisor(hv_base.BaseHypervisor):
raise errors.HypervisorError("Failed to start instance %s: %s" % raise errors.HypervisorError("Failed to start instance %s: %s" %
(instance.name, err)) (instance.name, err))
def StopInstance(self, instance, force=False, retry=False, name=None): def StopInstance(self, instance, force=False, retry=False, name=None,
timeout=None):
"""Stop an instance. """Stop an instance.
For the fake hypervisor, this just removes the file in the base For the fake hypervisor, this just removes the file in the base
dir, if it exist, otherwise we raise an exception. dir, if it exist, otherwise we raise an exception.
""" """
assert(timeout is None or force is not None)
if name is None: if name is None:
name = instance.name name = instance.name
if not self._IsAlive(name): if not self._IsAlive(name):
......
...@@ -1696,10 +1696,12 @@ class KVMHypervisor(hv_base.BaseHypervisor): ...@@ -1696,10 +1696,12 @@ class KVMHypervisor(hv_base.BaseHypervisor):
# 500ms and likely more: socat can't detect the end of the reply and waits # 500ms and likely more: socat can't detect the end of the reply and waits
# for 500ms of no data received before exiting (500 ms is the default for # for 500ms of no data received before exiting (500 ms is the default for
# the "-t" parameter). # the "-t" parameter).
socat = ("echo %s | %s STDIO UNIX-CONNECT:%s" % socat = ("echo %s | %s %s STDIO UNIX-CONNECT:%s" %
(utils.ShellQuote(command), (utils.ShellQuote(command),
timeout_cmd,
constants.SOCAT_PATH, constants.SOCAT_PATH,
utils.ShellQuote(self._InstanceMonitor(instance_name)))) utils.ShellQuote(self._InstanceMonitor(instance_name))))
result = utils.RunCmd(socat) result = utils.RunCmd(socat)
if result.failed: if result.failed:
msg = ("Failed to send command '%s' to instance '%s', reason '%s'," msg = ("Failed to send command '%s' to instance '%s', reason '%s',"
...@@ -1776,10 +1778,13 @@ class KVMHypervisor(hv_base.BaseHypervisor): ...@@ -1776,10 +1778,13 @@ class KVMHypervisor(hv_base.BaseHypervisor):
else: else:
return "pc" return "pc"
def StopInstance(self, instance, force=False, retry=False, name=None): def StopInstance(self, instance, force=False, retry=False, name=None,
timeout=None):
"""Stop an instance. """Stop an instance.
""" """
assert(timeout is None or force is not None)
if name is not None and not force: if name is not None and not force:
raise errors.HypervisorError("Cannot shutdown cleanly by name only") raise errors.HypervisorError("Cannot shutdown cleanly by name only")
if name is None: if name is None:
...@@ -1792,7 +1797,7 @@ class KVMHypervisor(hv_base.BaseHypervisor): ...@@ -1792,7 +1797,7 @@ class KVMHypervisor(hv_base.BaseHypervisor):
if force or not acpi: if force or not acpi:
utils.KillProcess(pid) utils.KillProcess(pid)
else: else:
self._CallMonitorCommand(name, "system_powerdown") self._CallMonitorCommand(name, "system_powerdown", timeout)
def CleanupInstance(self, instance_name): def CleanupInstance(self, instance_name):
"""Cleanup after a stopped instance """Cleanup after a stopped instance
......
...@@ -325,7 +325,8 @@ class LXCHypervisor(hv_base.BaseHypervisor): ...@@ -325,7 +325,8 @@ class LXCHypervisor(hv_base.BaseHypervisor):
raise HypervisorError("Running the lxc-start script failed: %s" % raise HypervisorError("Running the lxc-start script failed: %s" %
result.output) result.output)
def StopInstance(self, instance, force=False, retry=False, name=None): def StopInstance(self, instance, force=False, retry=False, name=None,
timeout=None):
"""Stop an instance. """Stop an instance.
This method has complicated cleanup tests, as we must: This method has complicated cleanup tests, as we must:
...@@ -334,9 +335,15 @@ class LXCHypervisor(hv_base.BaseHypervisor): ...@@ -334,9 +335,15 @@ class LXCHypervisor(hv_base.BaseHypervisor):
- finally unmount the instance dir - finally unmount the instance dir
""" """
assert(timeout is None or force is not None)
if name is None: if name is None:
name = instance.name name = instance.name
timeout_cmd = []
if timeout is not None:
timeout_cmd.extend(["timeout", str(timeout)])
root_dir = self._InstanceDir(name) root_dir = self._InstanceDir(name)
if not os.path.exists(root_dir): if not os.path.exists(root_dir):
return return
...@@ -349,7 +356,7 @@ class LXCHypervisor(hv_base.BaseHypervisor): ...@@ -349,7 +356,7 @@ class LXCHypervisor(hv_base.BaseHypervisor):
raise HypervisorError("Running 'poweroff' on the instance" raise HypervisorError("Running 'poweroff' on the instance"
" failed: %s" % result.output) " failed: %s" % result.output)
time.sleep(2) time.sleep(2)
result = utils.RunCmd(["lxc-stop", "-n", name]) result = utils.RunCmd(timeout_cmd.extend(["lxc-stop", "-n", name]))
if result.failed: if result.failed:
logging.warning("Error while doing lxc-stop for %s: %s", name, logging.warning("Error while doing lxc-stop for %s: %s", name,
result.output) result.output)
...@@ -358,12 +365,12 @@ class LXCHypervisor(hv_base.BaseHypervisor): ...@@ -358,12 +365,12 @@ class LXCHypervisor(hv_base.BaseHypervisor):
return return
for mpath in self._GetMountSubdirs(root_dir): for mpath in self._GetMountSubdirs(root_dir):
result = utils.RunCmd(["umount", mpath]) result = utils.RunCmd(timeout_cmd.extend(["umount", mpath]))
if result.failed: if result.failed:
logging.warning("Error while umounting subpath %s for instance %s: %s", logging.warning("Error while umounting subpath %s for instance %s: %s",
mpath, name, result.output) mpath, name, result.output)
result = utils.RunCmd(["umount", root_dir]) result = utils.RunCmd(timeout_cmd.extend(["umount", root_dir]))
if result.failed and force: if result.failed and force:
msg = ("Processes still alive in the chroot: %s" % msg = ("Processes still alive in the chroot: %s" %
utils.RunCmd("fuser -vm %s" % root_dir).output) utils.RunCmd("fuser -vm %s" % root_dir).output)
......
...@@ -519,16 +519,21 @@ class XenHypervisor(hv_base.BaseHypervisor): ...@@ -519,16 +519,21 @@ class XenHypervisor(hv_base.BaseHypervisor):
(instance.name, result.fail_reason, (instance.name, result.fail_reason,
result.output, stashed_config)) result.output, stashed_config))
def StopInstance(self, instance, force=False, retry=False, name=None): def StopInstance(self, instance, force=False, retry=False, name=None,
timeout=None):
"""Stop an instance. """Stop an instance.
A soft shutdown can be interrupted. A hard shutdown tries forever.
""" """
assert(timeout is None or force is not None)
if name is None: if name is None:
name = instance.name name = instance.name
return self._StopInstance(name, force) return self._StopInstance(name, force, timeout)
def _ShutdownInstance(self, name): def _ShutdownInstance(self, name, timeout):
"""Shutdown an instance if the instance is running. """Shutdown an instance if the instance is running.
The '-w' flag waits for shutdown to complete which avoids the need The '-w' flag waits for shutdown to complete which avoids the need
...@@ -537,6 +542,9 @@ class XenHypervisor(hv_base.BaseHypervisor): ...@@ -537,6 +542,9 @@ class XenHypervisor(hv_base.BaseHypervisor):
@type name: string @type name: string
@param name: name of the instance to stop @param name: name of the instance to stop
@type timeout: int or None
@param timeout: a timeout after which the shutdown command should be killed,
or None for no timeout
""" """
instance_info = self.GetInstanceInfo(name) instance_info = self.GetInstanceInfo(name)
...@@ -545,7 +553,7 @@ class XenHypervisor(hv_base.BaseHypervisor): ...@@ -545,7 +553,7 @@ class XenHypervisor(hv_base.BaseHypervisor):
logging.info("Failed to shutdown instance %s, not running", name) logging.info("Failed to shutdown instance %s, not running", name)
return None return None
return self._RunXen(["shutdown", "-w", name]) return self._RunXen(["shutdown", "-w", name], timeout)
def _DestroyInstance(self, name): def _DestroyInstance(self, name):
"""Destroy an instance if the instance if the instance exists. """Destroy an instance if the instance if the instance exists.
...@@ -562,7 +570,7 @@ class XenHypervisor(hv_base.BaseHypervisor): ...@@ -562,7 +570,7 @@ class XenHypervisor(hv_base.BaseHypervisor):
return self._RunXen(["destroy", name]) return self._RunXen(["destroy", name])
def _StopInstance(self, name, force): def _StopInstance(self, name, force, timeout):
"""Stop an instance. """Stop an instance.
@type name: string @type name: string
...@@ -571,11 +579,15 @@ class XenHypervisor(hv_base.BaseHypervisor): ...@@ -571,11 +579,15 @@ class XenHypervisor(hv_base.BaseHypervisor):
@type force: boolean @type force: boolean
@param force: whether to do a "hard" stop (destroy) @param force: whether to do a "hard" stop (destroy)
@type timeout: int or None
@param timeout: a timeout after which the shutdown command should be killed,
or None for no timeout
""" """
if force: if force:
result = self._DestroyInstance(name) result = self._DestroyInstance(name)
else: else:
self._ShutdownInstance(name) self._ShutdownInstance(name, timeout)
result = self._DestroyInstance(name) result = self._DestroyInstance(name)
if result is not None and result.failed and \ if result is not None and result.failed and \
......
...@@ -555,7 +555,14 @@ class _TestXenHypervisor(object): ...@@ -555,7 +555,14 @@ class _TestXenHypervisor(object):
extra = inst.hvparams[constants.HV_KERNEL_ARGS] extra = inst.hvparams[constants.HV_KERNEL_ARGS]
self.assertTrue(("extra = '%s'" % extra) in lines) self.assertTrue(("extra = '%s'" % extra) in lines)
def _StopInstanceCommand(self, instance_name, force, fail, cmd): def _StopInstanceCommand(self, instance_name, force, fail, full_cmd):
# Remove the timeout (and its number of seconds) if it's there
if full_cmd[:1][0] == "timeout":
cmd = full_cmd[2:]
else:
cmd = full_cmd
# Test the actual command
if (cmd == [self.CMD, "list"]): if (cmd == [self.CMD, "list"]):
output = "Name ID Mem VCPUs State Time(s)\n" \ output = "Name ID Mem VCPUs State Time(s)\n" \
"Domain-0 0 1023 1 r----- 142691.0\n" \ "Domain-0 0 1023 1 r----- 142691.0\n" \
...@@ -592,7 +599,7 @@ class _TestXenHypervisor(object): ...@@ -592,7 +599,7 @@ class _TestXenHypervisor(object):
if fail: if fail:
try: try:
hv._StopInstance(name, force) hv._StopInstance(name, force, constants.DEFAULT_SHUTDOWN_TIMEOUT)
except errors.HypervisorError, err: except errors.HypervisorError, err:
self.assertTrue(str(err).startswith("xm list failed"), self.assertTrue(str(err).startswith("xm list failed"),
msg=str(err)) msg=str(err))
...@@ -602,7 +609,7 @@ class _TestXenHypervisor(object): ...@@ -602,7 +609,7 @@ class _TestXenHypervisor(object):
msg=("Configuration was removed when stopping" msg=("Configuration was removed when stopping"
" instance failed")) " instance failed"))
else: else:
hv._StopInstance(name, force) hv._StopInstance(name, force, constants.DEFAULT_SHUTDOWN_TIMEOUT)
self.assertFalse(os.path.exists(cfgfile)) self.assertFalse(os.path.exists(cfgfile))
def _MigrateNonRunningInstCmd(self, cmd): def _MigrateNonRunningInstCmd(self, cmd):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment