From 91c622a86f5f0d5682d23b878bd31664c698669c Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Wed, 28 Jul 2010 16:08:29 -0400
Subject: [PATCH] Improve handling of lost jobs

Currently, if the cli.JobExecutor class is being used, and one of the
jobs is being archived before it can check its result, it will raise a
stacktrace as _ChooseJob is not prepared to handle this case.

This patch makes JobExecutor work better with lost jobs (it still reports
them as 'failed', but it doesn't break and returns a proper error
message), and modifies the generic FormatError to report the JobLost
exception properly, instead of as "Unhandled Ganeti Exception".

Since JobExecutor is hard to test properly, I only tested this manually,
via a fake invocation.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>
---
 lib/cli.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/lib/cli.py b/lib/cli.py
index ebff8f853..4e6da3913 100644
--- a/lib/cli.py
+++ b/lib/cli.py
@@ -1688,6 +1688,8 @@ def FormatError(err):
   elif isinstance(err, luxi.ProtocolError):
     obuf.write("Unhandled protocol error while talking to the master daemon:\n"
                "%s" % msg)
+  elif isinstance(err, errors.JobLost):
+    obuf.write("Error checking job status: %s" % msg)
   elif isinstance(err, errors.GenericError):
     obuf.write("Unhandled Ganeti error: %s" % msg)
   elif isinstance(err, JobSubmittedException):
@@ -2326,12 +2328,13 @@ class JobExecutor(object):
     assert result
 
     for job_data, status in zip(self.jobs, result):
-      if status[0] in (constants.JOB_STATUS_QUEUED,
-                    constants.JOB_STATUS_WAITLOCK,
-                    constants.JOB_STATUS_CANCELING):
-        # job is still waiting
+      if (isinstance(status, list) and status and
+          status[0] in (constants.JOB_STATUS_QUEUED,
+                        constants.JOB_STATUS_WAITLOCK,
+                        constants.JOB_STATUS_CANCELING)):
+        # job is still present and waiting
         continue
-      # good candidate found
+      # good candidate found (either running job or lost job)
       self.jobs.remove(job_data)
       return job_data
 
@@ -2367,6 +2370,11 @@ class JobExecutor(object):
       try:
         job_result = PollJob(jid, cl=self.cl, feedback_fn=self.feedback_fn)
         success = True
+      except errors.JobLost, err:
+        _, job_result = FormatError(err)
+        ToStderr("Job %s for %s has been archived, cannot check its result",
+                 jid, name)
+        success = False
       except (errors.GenericError, luxi.ProtocolError), err:
         _, job_result = FormatError(err)
         success = False
-- 
GitLab