From c4e388a5f15b7579d3b09a9ba6bdc9c36ee5eb1a Mon Sep 17 00:00:00 2001
From: Guido Trotter <ultrotter@google.com>
Date: Fri, 22 Jan 2010 12:37:24 +0100
Subject: [PATCH] KVM: be more resilient on broken migration answers

Previously, when doing KVM live migrations, we used to accept an
"unknown status" but reject anything that didn't match our regexp.
Since we've seen "info migrate" return a completely empty answer, we
now tolerate completely unknown results (while still logging them) and,
at the same time, limit the number of them we're willing to accept in
a row.

Signed-off-by: Guido Trotter <ultrotter@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
---
 lib/hypervisor/hv_kvm.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py
index 92a2854e3..d474627bb 100644
--- a/lib/hypervisor/hv_kvm.py
+++ b/lib/hypervisor/hv_kvm.py
@@ -80,6 +80,8 @@ class KVMHypervisor(hv_base.BaseHypervisor):
 
   _MIGRATION_STATUS_RE = re.compile('Migration\s+status:\s+(\w+)',
                                     re.M | re.I)
+  _MIGRATION_INFO_MAX_BAD_ANSWERS = 5
+  _MIGRATION_INFO_RETRY_DELAY = 2
 
   _KVM_NETWORK_SCRIPT = constants.SYSCONFDIR + "/ganeti/kvm-vif-bridge"
 
@@ -675,26 +677,37 @@ class KVMHypervisor(hv_base.BaseHypervisor):
 
     info_command = 'info migrate'
     done = False
+    broken_answers = 0
     while not done:
       result = self._CallMonitorCommand(instance_name, info_command)
       match = self._MIGRATION_STATUS_RE.search(result.stdout)
       if not match:
-        raise errors.HypervisorError("Unknown 'info migrate' result: %s" %
-                                     result.stdout)
+        broken_answers += 1
+        if not result.stdout:
+          logging.info("KVM: empty 'info migrate' result")
+        else:
+          logging.warning("KVM: unknown 'info migrate' result: %s" %
+                          result.stdout)
+        time.sleep(self._MIGRATION_INFO_RETRY_DELAY)
       else:
         status = match.group(1)
         if status == 'completed':
           done = True
         elif status == 'active':
-          time.sleep(2)
+          # reset the broken answers count
+          broken_answers = 0
+          time.sleep(self._MIGRATION_INFO_RETRY_DELAY)
         elif status == 'failed' or status == 'cancelled':
           if not live:
             self._CallMonitorCommand(instance_name, 'cont')
           raise errors.HypervisorError("Migration %s at the kvm level" %
                                        status)
         else:
-          logging.info("KVM: unknown migration status '%s'", status)
-          time.sleep(2)
+          logging.warning("KVM: unknown migration status '%s'", status)
+          broken_answers += 1
+          time.sleep(self._MIGRATION_INFO_RETRY_DELAY)
+      if broken_answers >= self._MIGRATION_INFO_MAX_BAD_ANSWERS:
+        raise errors.HypervisorError("Too many 'info migrate' broken answers")
 
     utils.KillProcess(pid)
     self._RemoveInstanceRuntimeFiles(pidfile, instance_name)
-- 
GitLab