From 9e49dfc5d4a219f55f46f56916f8c777823d125a Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <hansmi@google.com>
Date: Fri, 8 Oct 2010 16:03:17 +0200
Subject: [PATCH] jqueue: Fix bug when cancelling jobs

If a job was cancelled while it was waiting for locks, an assertion
would've failed. This patch fixes the problem and provides a unit
test to check for this situation.

Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
---
 lib/jqueue.py                  |  9 ++++++-
 test/ganeti.jqueue_unittest.py | 44 ++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/lib/jqueue.py b/lib/jqueue.py
index 654c68143..1a0c20d2e 100644
--- a/lib/jqueue.py
+++ b/lib/jqueue.py
@@ -921,7 +921,14 @@ class _JobProcessor(object):
     except mcpu.LockAcquireTimeout:
       assert timeout is not None, "Received timeout for blocking acquire"
       logging.debug("Couldn't acquire locks in %0.6fs", timeout)
-      assert op.status == constants.OP_STATUS_WAITLOCK
+
+      assert op.status in (constants.OP_STATUS_WAITLOCK,
+                           constants.OP_STATUS_CANCELING)
+
+      # Was job cancelled while we were waiting for the lock?
+      if op.status == constants.OP_STATUS_CANCELING:
+        return (constants.OP_STATUS_CANCELING, None)
+
       return (constants.OP_STATUS_QUEUED, None)
     except CancelJob:
       logging.exception("%s: Canceling job", opctx.log_prefix)
diff --git a/test/ganeti.jqueue_unittest.py b/test/ganeti.jqueue_unittest.py
index dddf7c9f4..0939e08a0 100755
--- a/test/ganeti.jqueue_unittest.py
+++ b/test/ganeti.jqueue_unittest.py
@@ -690,6 +690,50 @@ class TestJobProcessor(unittest.TestCase, _JobProcessorTestUtils):
                      [[constants.OP_STATUS_CANCELED for _ in job.ops],
                       ["Job canceled by request" for _ in job.ops]])
 
+  def testCancelWhileWaitlockWithTimeout(self):
+    queue = _FakeQueueForProc()
+
+    ops = [opcodes.OpTestDummy(result="Res%s" % i, fail=False)
+           for i in range(5)]
+
+    # Create job
+    job_id = 24314
+    job = self._CreateJob(queue, job_id, ops)
+
+    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_QUEUED)
+
+    def _BeforeStart(timeout, priority):
+      self.assertFalse(queue.IsAcquired())
+      self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_WAITLOCK)
+
+      # Mark as cancelled
+      (success, _) = job.Cancel()
+      self.assert_(success)
+
+      self.assert_(compat.all(op.status == constants.OP_STATUS_CANCELING
+                              for op in job.ops))
+
+      # Fake an acquire attempt timing out
+      raise mcpu.LockAcquireTimeout()
+
+    def _AfterStart(op, cbs):
+      self.fail("Should not reach this")
+
+    opexec = _FakeExecOpCodeForProc(_BeforeStart, _AfterStart)
+
+    self.assert_(jqueue._JobProcessor(queue, opexec, job)())
+
+    # Check result
+    self.assertEqual(job.CalcStatus(), constants.JOB_STATUS_CANCELED)
+    self.assertEqual(job.GetInfo(["status"]), [constants.JOB_STATUS_CANCELED])
+    self.assert_(job.start_timestamp)
+    self.assert_(job.end_timestamp)
+    self.assertFalse(compat.all(op.start_timestamp and op.end_timestamp
+                                for op in job.ops))
+    self.assertEqual(job.GetInfo(["opstatus", "opresult"]),
+                     [[constants.OP_STATUS_CANCELED for _ in job.ops],
+                      ["Job canceled by request" for _ in job.ops]])
+
   def testCancelWhileRunning(self):
     # Tests canceling a job with finished opcodes and more, unprocessed ones
     queue = _FakeQueueForProc()
-- 
GitLab