Commit 91c622a8 authored by Iustin Pop
Browse files

Improve handling of lost jobs



Currently, if the cli.JobExecutor class is being used, and one of the
jobs is archived before JobExecutor can check its result, it will fail
with a stack trace, as _ChooseJob is not prepared to handle this case.

This patch makes JobExecutor work better with lost jobs (it still reports
them as 'failed', but it doesn't break and returns a proper error
message), and modifies the generic FormatError to report the JobLost
exception properly, instead of as "Unhandled Ganeti error".

Since JobExecutor is hard to test properly, I only tested this manually,
via a fake invocation.
Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>
parent 5a1c22fe
...@@ -1688,6 +1688,8 @@ def FormatError(err): ...@@ -1688,6 +1688,8 @@ def FormatError(err):
elif isinstance(err, luxi.ProtocolError): elif isinstance(err, luxi.ProtocolError):
obuf.write("Unhandled protocol error while talking to the master daemon:\n" obuf.write("Unhandled protocol error while talking to the master daemon:\n"
"%s" % msg) "%s" % msg)
elif isinstance(err, errors.JobLost):
obuf.write("Error checking job status: %s" % msg)
elif isinstance(err, errors.GenericError): elif isinstance(err, errors.GenericError):
obuf.write("Unhandled Ganeti error: %s" % msg) obuf.write("Unhandled Ganeti error: %s" % msg)
elif isinstance(err, JobSubmittedException): elif isinstance(err, JobSubmittedException):
...@@ -2326,12 +2328,13 @@ class JobExecutor(object): ...@@ -2326,12 +2328,13 @@ class JobExecutor(object):
assert result assert result
for job_data, status in zip(self.jobs, result): for job_data, status in zip(self.jobs, result):
if status[0] in (constants.JOB_STATUS_QUEUED, if (isinstance(status, list) and status and
constants.JOB_STATUS_WAITLOCK, status[0] in (constants.JOB_STATUS_QUEUED,
constants.JOB_STATUS_CANCELING): constants.JOB_STATUS_WAITLOCK,
# job is still waiting constants.JOB_STATUS_CANCELING)):
# job is still present and waiting
continue continue
# good candidate found # good candidate found (either running job or lost job)
self.jobs.remove(job_data) self.jobs.remove(job_data)
return job_data return job_data
...@@ -2367,6 +2370,11 @@ class JobExecutor(object): ...@@ -2367,6 +2370,11 @@ class JobExecutor(object):
try: try:
job_result = PollJob(jid, cl=self.cl, feedback_fn=self.feedback_fn) job_result = PollJob(jid, cl=self.cl, feedback_fn=self.feedback_fn)
success = True success = True
except errors.JobLost, err:
_, job_result = FormatError(err)
ToStderr("Job %s for %s has been archived, cannot check its result",
jid, name)
success = False
except (errors.GenericError, luxi.ProtocolError), err: except (errors.GenericError, luxi.ProtocolError), err:
_, job_result = FormatError(err) _, job_result = FormatError(err)
success = False success = False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment