Commit 91c622a8 authored by Iustin Pop
Browse files

Improve handling of lost jobs



Currently, if the cli.JobExecutor class is being used and one of the
jobs is archived before its result can be checked, the executor fails
with a stack trace, as _ChooseJob is not prepared to handle this case.

This patch makes JobExecutor work better with lost jobs (it still reports
them as 'failed', but it doesn't break and returns a proper error
message), and modifies the generic FormatError to report the JobLost
exception properly, instead of as "Unhandled Ganeti error".

Since JobExecutor is hard to test properly, I only tested this manually,
via a fake invocation.
Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>
parent 5a1c22fe
......@@ -1688,6 +1688,8 @@ def FormatError(err):
elif isinstance(err, luxi.ProtocolError):
obuf.write("Unhandled protocol error while talking to the master daemon:\n"
"%s" % msg)
elif isinstance(err, errors.JobLost):
obuf.write("Error checking job status: %s" % msg)
elif isinstance(err, errors.GenericError):
obuf.write("Unhandled Ganeti error: %s" % msg)
elif isinstance(err, JobSubmittedException):
......@@ -2326,12 +2328,13 @@ class JobExecutor(object):
assert result
for job_data, status in zip(self.jobs, result):
if status[0] in (constants.JOB_STATUS_QUEUED,
constants.JOB_STATUS_WAITLOCK,
constants.JOB_STATUS_CANCELING):
# job is still waiting
if (isinstance(status, list) and status and
status[0] in (constants.JOB_STATUS_QUEUED,
constants.JOB_STATUS_WAITLOCK,
constants.JOB_STATUS_CANCELING)):
# job is still present and waiting
continue
# good candidate found
# good candidate found (either running job or lost job)
self.jobs.remove(job_data)
return job_data
......@@ -2367,6 +2370,11 @@ class JobExecutor(object):
try:
job_result = PollJob(jid, cl=self.cl, feedback_fn=self.feedback_fn)
success = True
except errors.JobLost, err:
_, job_result = FormatError(err)
ToStderr("Job %s for %s has been archived, cannot check its result",
jid, name)
success = False
except (errors.GenericError, luxi.ProtocolError), err:
_, job_result = FormatError(err)
success = False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment