From c4e388a5f15b7579d3b09a9ba6bdc9c36ee5eb1a Mon Sep 17 00:00:00 2001 From: Guido Trotter <ultrotter@google.com> Date: Fri, 22 Jan 2010 12:37:24 +0100 Subject: [PATCH] KVM: be more resilient on broken migration answers Before, when doing kvm live migrations we used to accept an "unknown status" but reject anything that didn't match our regexp. Since we've seen "info migrate" return a completely empty answer, we'll be more tolerant of completely unknown results (while still logging them) and at the same time we'll limit the number of them which we're willing to accept in a row. Signed-off-by: Guido Trotter <ultrotter@google.com> Reviewed-by: Iustin Pop <iustin@google.com> --- lib/hypervisor/hv_kvm.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py index 92a2854e3..d474627bb 100644 --- a/lib/hypervisor/hv_kvm.py +++ b/lib/hypervisor/hv_kvm.py @@ -80,6 +80,8 @@ class KVMHypervisor(hv_base.BaseHypervisor): _MIGRATION_STATUS_RE = re.compile('Migration\s+status:\s+(\w+)', re.M | re.I) + _MIGRATION_INFO_MAX_BAD_ANSWERS = 5 + _MIGRATION_INFO_RETRY_DELAY = 2 _KVM_NETWORK_SCRIPT = constants.SYSCONFDIR + "/ganeti/kvm-vif-bridge" @@ -675,26 +677,37 @@ class KVMHypervisor(hv_base.BaseHypervisor): info_command = 'info migrate' done = False + broken_answers = 0 while not done: result = self._CallMonitorCommand(instance_name, info_command) match = self._MIGRATION_STATUS_RE.search(result.stdout) if not match: - raise errors.HypervisorError("Unknown 'info migrate' result: %s" % - result.stdout) + broken_answers += 1 + if not result.stdout: + logging.info("KVM: empty 'info migrate' result") + else: + logging.warning("KVM: unknown 'info migrate' result: %s" % + result.stdout) + time.sleep(self._MIGRATION_INFO_RETRY_DELAY) else: status = match.group(1) if status == 'completed': done = True elif status == 'active': - time.sleep(2) + # reset the broken answers count + broken_answers = 0 + 
time.sleep(self._MIGRATION_INFO_RETRY_DELAY) elif status == 'failed' or status == 'cancelled': if not live: self._CallMonitorCommand(instance_name, 'cont') raise errors.HypervisorError("Migration %s at the kvm level" % status) else: - logging.info("KVM: unknown migration status '%s'", status) - time.sleep(2) + logging.warning("KVM: unknown migration status '%s'", status) + broken_answers += 1 + time.sleep(self._MIGRATION_INFO_RETRY_DELAY) + if broken_answers >= self._MIGRATION_INFO_MAX_BAD_ANSWERS: + raise errors.HypervisorError("Too many 'info migrate' broken answers") utils.KillProcess(pid) self._RemoveInstanceRuntimeFiles(pidfile, instance_name) -- GitLab