diff --git a/daemons/import-export b/daemons/import-export index d137b57e97836ae47d63c6a81437c008dbc2bc28..0700339c86cf267665ebf2de98587bc62d915ca9 100755 --- a/daemons/import-export +++ b/daemons/import-export @@ -53,9 +53,6 @@ MAX_RECENT_OUTPUT_LINES = 20 #: Don't update status file more than once every 5 seconds (unless forced) MIN_UPDATE_INTERVAL = 5.0 -#: Give child process up to 5 seconds to exit after sending a signal -CHILD_LINGER_TIMEOUT = 5.0 - #: How long to wait for a connection to be established DEFAULT_CONNECT_TIMEOUT = 60 @@ -276,7 +273,7 @@ def ProcessChildIO(child, socat_stderr_read_fd, dd_stderr_read_fd, logging.info("Child process didn't establish connection in time") child.Kill(signal.SIGTERM) exit_timeout = \ - utils.RunningTimeout(CHILD_LINGER_TIMEOUT, True) + utils.RunningTimeout(constants.CHILD_LINGER_TIMEOUT, True) # Next block will calculate timeout else: # Not yet connected, check again in a second @@ -323,9 +320,9 @@ def ProcessChildIO(child, socat_stderr_read_fd, dd_stderr_read_fd, " to exit", exit_timeout.Remaining()) else: logging.info("Giving child process %0.2f seconds to exit", - CHILD_LINGER_TIMEOUT) + constants.CHILD_LINGER_TIMEOUT) exit_timeout = \ - utils.RunningTimeout(CHILD_LINGER_TIMEOUT, True) + utils.RunningTimeout(constants.CHILD_LINGER_TIMEOUT, True) else: poller.unregister(fd) del fdmap[fd] diff --git a/lib/constants.py b/lib/constants.py index 334fae3e2075f503996e215a8fd6e5928ed728ee..2db5bba656706f0956cc3f6e66df9a563e986a92 100644 --- a/lib/constants.py +++ b/lib/constants.py @@ -423,6 +423,8 @@ RIE_CERT_VALIDITY = 24 * 60 * 60 # Remote import/export connect timeout for socat RIE_CONNECT_TIMEOUT = 60 +#: Give child process up to 5 seconds to exit after sending a signal +CHILD_LINGER_TIMEOUT = 5.0 DISK_TEMPLATES = frozenset([DT_DISKLESS, DT_PLAIN, DT_DRBD8, DT_FILE]) diff --git a/lib/utils.py b/lib/utils.py index 7e4d968dd5fb0fcb78158cd1cd00cc72c53ba1f0..71134bc6b7d69b619c03ce78f28026cd48d04c3e 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -96,6 +96,10 @@ _MCL_FUTURE = 2 #: MAC checker regexp _MAC_CHECK = re.compile("^([0-9a-f]{2}:){5}[0-9a-f]{2}$", re.I) +(_TIMEOUT_NONE, + _TIMEOUT_TERM, + _TIMEOUT_KILL) = range(3) + class RunResult(object): """Holds the result of running external programs. @@ -120,7 +124,8 @@ class RunResult(object): "failed", "fail_reason", "cmd"] - def __init__(self, exit_code, signal_, stdout, stderr, cmd): + def __init__(self, exit_code, signal_, stdout, stderr, cmd, timeout_action, + timeout): self.cmd = cmd self.exit_code = exit_code self.signal = signal_ @@ -128,12 +133,23 @@ class RunResult(object): self.stderr = stderr self.failed = (signal_ is not None or exit_code != 0) + fail_msgs = [] if self.signal is not None: - self.fail_reason = "terminated by signal %s" % self.signal + fail_msgs.append("terminated by signal %s" % self.signal) elif self.exit_code is not None: - self.fail_reason = "exited with exit code %s" % self.exit_code + fail_msgs.append("exited with exit code %s" % self.exit_code) else: - self.fail_reason = "unable to determine termination reason" + fail_msgs.append("unable to determine termination reason") + + if timeout_action == _TIMEOUT_TERM: + fail_msgs.append("terminated after timeout of %.2f seconds" % timeout) + elif timeout_action == _TIMEOUT_KILL: + fail_msgs.append(("force termination after timeout of %.2f seconds" + " and linger for another %.2f seconds") % + (timeout, constants.CHILD_LINGER_TIMEOUT)) + + if fail_msgs and self.failed: + self.fail_reason = CommaJoin(fail_msgs) if self.failed: logging.debug("Command '%s' failed (%s); output: %s", @@ -165,7 +181,7 @@ def _BuildCmdEnvironment(env, reset): def RunCmd(cmd, env=None, output=None, cwd="/", reset_env=False, - interactive=False): + interactive=False, timeout=None): """Execute a (shell) command. The command should not read from its standard input, as it will be @@ -187,6 +203,9 @@ def RunCmd(cmd, env=None, output=None, cwd="/", reset_env=False, @type interactive: boolean @param interactive: weather we pipe stdin, stdout and stderr (default behaviour) or run the command interactive + @type timeout: int + @param timeout: If not None, timeout in seconds until child process gets + killed @rtype: L{RunResult} @return: RunResult instance @raise errors.ProgrammerError: if we call this when forks are disabled @@ -216,8 +235,10 @@ def RunCmd(cmd, env=None, output=None, cwd="/", reset_env=False, try: if output is None: - out, err, status = _RunCmdPipe(cmd, cmd_env, shell, cwd, interactive) + out, err, status, timeout_action = _RunCmdPipe(cmd, cmd_env, shell, cwd, + interactive, timeout) else: + timeout_action = _TIMEOUT_NONE status = _RunCmdFile(cmd, cmd_env, shell, output, cwd) out = err = "" except OSError, err: @@ -234,7 +255,7 @@ def RunCmd(cmd, env=None, output=None, cwd="/", reset_env=False, exitcode = None signal_ = -status - return RunResult(exitcode, signal_, out, err, strcmd) + return RunResult(exitcode, signal_, out, err, strcmd, timeout_action, timeout) def SetupDaemonEnv(cwd="/", umask=077): @@ -460,7 +481,28 @@ def WriteErrorToFD(fd, err): RetryOnSignal(os.write, fd, err) -def _RunCmdPipe(cmd, env, via_shell, cwd, interactive): +def _CheckIfAlive(child): + """Raises L{RetryAgain} if child is still alive. + + @raises RetryAgain: If child is still alive + + """ + if child.poll() is None: + raise RetryAgain() + + +def _WaitForProcess(child, timeout): + """Waits for the child to terminate or until we reach timeout. + + """ + try: + Retry(_CheckIfAlive, (1.0, 1.2, 5.0), max(0, timeout), args=[child]) + except RetryTimeout: + pass + + +def _RunCmdPipe(cmd, env, via_shell, cwd, interactive, timeout, + _linger_timeout=constants.CHILD_LINGER_TIMEOUT): """Run a command and return its output. @type cmd: string or list @@ -473,6 +515,8 @@ def _RunCmdPipe(cmd, env, via_shell, cwd, interactive): @param cwd: the working directory for the program @type interactive: boolean @param interactive: Run command interactive (without piping) + @type timeout: int + @param timeout: Timeout after the programm gets terminated @rtype: tuple @return: (out, err, status) @@ -495,6 +539,21 @@ def _RunCmdPipe(cmd, env, via_shell, cwd, interactive): out = StringIO() err = StringIO() + + linger_timeout = None + + if timeout is None: + poll_timeout = None + else: + poll_timeout = RunningTimeout(timeout, True).Remaining + + msg_timeout = ("Command %s (%d) run into execution timeout, terminating" % + (cmd, child.pid)) + msg_linger = ("Command %s (%d) run into linger timeout, killing" % + (cmd, child.pid)) + + timeout_action = _TIMEOUT_NONE + if not interactive: child.stdin.close() poller.register(child.stdout, select.POLLIN) @@ -507,7 +566,26 @@ def _RunCmdPipe(cmd, env, via_shell, cwd, interactive): SetNonblockFlag(fd, True) while fdmap: - pollresult = RetryOnSignal(poller.poll) + if poll_timeout: + current_timeout = poll_timeout() + if current_timeout < 0: + if linger_timeout is None: + logging.warning(msg_timeout) + if child.poll() is None: + timeout_action = _TIMEOUT_TERM + IgnoreProcessNotFound(os.kill, child.pid, signal.SIGTERM) + linger_timeout = RunningTimeout(_linger_timeout, True).Remaining + lt = linger_timeout() + if lt < 0: + break + + pt = max(0, lt) + else: + pt = current_timeout + else: + pt = None + + pollresult = RetryOnSignal(poller.poll, pt) for fd, event in pollresult: if event & select.POLLIN or event & select.POLLPRI: @@ -523,11 +601,35 @@ def _RunCmdPipe(cmd, env, via_shell, cwd, interactive): poller.unregister(fd) del fdmap[fd] + if timeout is not None: + assert callable(poll_timeout) + + # We have no I/O left but it might still run + if child.poll() is None: + _WaitForProcess(child, poll_timeout()) + + # Terminate if still alive after timeout + if child.poll() is None: + if linger_timeout is None: + logging.warning(msg_timeout) + timeout_action = _TIMEOUT_TERM + IgnoreProcessNotFound(os.kill, child.pid, signal.SIGTERM) + lt = _linger_timeout + else: + lt = linger_timeout() + _WaitForProcess(child, lt) + + # Okay, still alive after timeout and linger timeout? Kill it! + if child.poll() is None: + timeout_action = _TIMEOUT_KILL + logging.warning(msg_linger) + IgnoreProcessNotFound(os.kill, child.pid, signal.SIGKILL) + out = out.getvalue() err = err.getvalue() status = child.wait() - return out, err, status + return out, err, status, timeout_action def _RunCmdFile(cmd, env, via_shell, output, cwd): diff --git a/test/ganeti.utils_unittest.py b/test/ganeti.utils_unittest.py index 7c7e99c92201e2708ee25d6744e66c37b838b4f1..f03c0b993b55a37db4be90caaff4e8d6eaab22e9 100755 --- a/test/ganeti.utils_unittest.py +++ b/test/ganeti.utils_unittest.py @@ -237,6 +237,12 @@ class TestRunCmd(testutils.GanetiTestCase): testutils.GanetiTestCase.setUp(self) self.magic = time.ctime() + " ganeti test" self.fname = self._CreateTempFile() + self.fifo_tmpdir = tempfile.mkdtemp() + self.fifo_file = os.path.join(self.fifo_tmpdir, "ganeti_test_fifo") + os.mkfifo(self.fifo_file) + + def tearDown(self): + shutil.rmtree(self.fifo_tmpdir) def testOk(self): """Test successful exit code""" @@ -284,6 +290,38 @@ class TestRunCmd(testutils.GanetiTestCase): self.assertEqual(result.signal, 15) self.assertEqual(result.output, "") + def testTimeoutClean(self): + cmd = "trap 'exit 0' TERM; read < %s" % self.fifo_file + result = RunCmd(["/bin/sh", "-c", cmd], timeout=0.2) + self.assertEqual(result.exit_code, 0) + + def testTimeoutCleanInteractive(self): + cmd = "trap 'exit 0' TERM; read" + result = RunCmd(["/bin/sh", "-c", cmd], interactive=True, timeout=0.2) + self.assertEqual(result.exit_code, 0) + + def testTimeoutNonClean(self): + for exit_code in (1, 10, 17, 29): + cmd = "trap 'exit %i' TERM; read" % exit_code + result = RunCmd(["/bin/sh", "-c", cmd], interactive=True, timeout=0.2) + self.assert_(result.failed) + self.assertEqual(result.exit_code, exit_code) + + def testTimeoutKill(self): + cmd = "trap '' TERM; read < %s" % self.fifo_file + timeout = 0.2 + strcmd = utils.ShellQuoteArgs(["/bin/sh", "-c", cmd]) + out, err, status, ta = utils._RunCmdPipe(strcmd, {}, True, "/", False, + timeout, _linger_timeout=0.2) + self.assert_(status < 0) + self.assertEqual(-status, signal.SIGKILL) + + def testTimeoutOutputAfterTerm(self): + cmd = "trap 'echo sigtermed; exit 1' TERM; read < %s" % self.fifo_file + result = RunCmd(["/bin/sh", "-c", cmd], timeout=0.2) + self.assert_(result.failed) + self.assertEqual(result.stdout, "sigtermed\n") + def testListRun(self): """Test list runs""" result = RunCmd(["true"])