From 9e49dfc5d4a219f55f46f56916f8c777823d125a Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <hansmi@google.com>
Date: Fri, 8 Oct 2010 16:03:17 +0200
Subject: [PATCH] jqueue: Fix bug when cancelling jobs

If a job was cancelled while it was waiting for locks, an assertion
would've failed. This patch fixes the problem and provides a unit test
to check for this situation.

Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
---
 lib/jqueue.py                  |  9 ++++++-
 test/ganeti.jqueue_unittest.py | 44 ++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/lib/jqueue.py b/lib/jqueue.py
index 654c68143..1a0c20d2e 100644
--- a/lib/jqueue.py
+++ b/lib/jqueue.py
@@ -921,7 +921,14 @@ class _JobProcessor(object):
     except mcpu.LockAcquireTimeout:
       assert timeout is not None, "Received timeout for blocking acquire"
       logging.debug("Couldn't acquire locks in %0.6fs", timeout)
-      assert op.status == constants.OP_STATUS_WAITLOCK
+
+      assert op.status in (constants.OP_STATUS_WAITLOCK,
+                           constants.OP_STATUS_CANCELING)
+
+      # Was job cancelled while we were waiting for the lock?
+      if op.status == constants.OP_STATUS_CANCELING:
+        return (constants.OP_STATUS_CANCELING, None)
+
       return (constants.OP_STATUS_QUEUED, None)
     except CancelJob:
       logging.exception("%s: Canceling job", opctx.log_prefix)
diff --git a/test/ganeti.jqueue_unittest.py b/test/ganeti.jqueue_unittest.py
index dddf7c9f4..0939e08a0 100755
--- a/test/ganeti.jqueue_unittest.py
+++ b/test/ganeti.jqueue_unittest.py
@@ -690,6 +690,50 @@ class TestJobProcessor(unittest.TestCase, _JobProcessorTestUtils):
                      [[constants.OP_STATUS_CANCELED for _ in job.ops],
                       ["Job canceled by request" for _ in job.ops]])
 
+  def testCancelWhileWaitlockWithTimeout(self):
+    queue = _FakeQueueForProc()
+
+    ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
+           for i in range(5)]
+
+    # Create job
+    job_id = 24314
+    job = self._CreateJob(queue, job_id, ops)
+
+    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
+
+    def _BeforeStart(timeout, priority):
+      self.assertFalse(queue.IsAcquired())
+      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)
+
+      # Mark as cancelled
+      (success, _) = job.Cancel()
+      self.assert_(success)
+
+      self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELING
+                              for op in job.ops))
+
+      # Fake an acquire attempt timing out
+      raise mcpu.LockAcquireTimeout()
+
+    def _AfterStart(op, cbs):
+      self.fail("Should not reach this")
+
+    opexec = _FakeExecOpCodeForProc(_BeforeStart, _AfterStart)
+
+    self.assert_(jqueue._JobProcessor(queue, opexec, job)())
+
+    # Check result
+    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
+    self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
+    self.assert_(job.start_timestamp)
+    self.assert_(job.end_timestamp)
+    self.assertFalse(compat.all(op.start_timestamp and op.end_timestamp
+                                for op in job.ops))
+    self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
+                     [[constants.OP_STATUS_CANCELED for _ in job.ops],
+                      ["Job canceled by request" for _ in job.ops]])
+
   def testCancelWhileRunning(self):
     # Tests canceling a job with finished opcodes and more, unprocessed ones
     queue = _FakeQueueForProc()
-- 
GitLab