From 91c622a86f5f0d5682d23b878bd31664c698669c Mon Sep 17 00:00:00 2001 From: Iustin Pop <iustin@google.com> Date: Wed, 28 Jul 2010 16:08:29 -0400 Subject: [PATCH] Improve handling of lost jobs Currently, if the cli.JobExecutor class is being used, and one of the jobs is archived before its result can be checked, it will fail with a stack trace as _ChooseJob is not prepared to handle this case. This patch makes JobExecutor work better with lost jobs (it still reports them as 'failed', but it doesn't break and returns a proper error message), and modifies the generic FormatError to report the JobLost exception properly, instead of as "Unhandled Ganeti error". Since JobExecutor is hard to test properly, I only tested this manually, via a fake invocation. Signed-off-by: Iustin Pop <iustin@google.com> Reviewed-by: Guido Trotter <ultrotter@google.com> --- lib/cli.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/cli.py b/lib/cli.py index ebff8f853..4e6da3913 100644 --- a/lib/cli.py +++ b/lib/cli.py @@ -1688,6 +1688,8 @@ def FormatError(err): elif isinstance(err, luxi.ProtocolError): obuf.write("Unhandled protocol error while talking to the master daemon:\n" "%s" % msg) + elif isinstance(err, errors.JobLost): + obuf.write("Error checking job status: %s" % msg) elif isinstance(err, errors.GenericError): obuf.write("Unhandled Ganeti error: %s" % msg) elif isinstance(err, JobSubmittedException): @@ -2326,12 +2328,13 @@ class JobExecutor(object): assert result for job_data, status in zip(self.jobs, result): - if status[0] in (constants.JOB_STATUS_QUEUED, - constants.JOB_STATUS_WAITLOCK, - constants.JOB_STATUS_CANCELING): - # job is still waiting + if (isinstance(status, list) and status and + status[0] in (constants.JOB_STATUS_QUEUED, + constants.JOB_STATUS_WAITLOCK, + constants.JOB_STATUS_CANCELING)): + # job is still present and waiting continue - # good candidate found + # good candidate found (either running job or lost job) 
self.jobs.remove(job_data) return job_data @@ -2367,6 +2370,11 @@ class JobExecutor(object): try: job_result = PollJob(jid, cl=self.cl, feedback_fn=self.feedback_fn) success = True + except errors.JobLost, err: + _, job_result = FormatError(err) + ToStderr("Job %s for %s has been archived, cannot check its result", + jid, name) + success = False except (errors.GenericError, luxi.ProtocolError), err: _, job_result = FormatError(err) success = False -- GitLab