From fbafd7a864cc1c47587f6c4746589d07847b61ae Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Wed, 3 Jun 2009 14:20:03 +0200
Subject: [PATCH] Wait for a while in failed resyncs

This patch is an attempt at fixing some very rare occurrences of messages like:
  - "There are some degraded disks for this instance", or:
  - "Cannot resync disks on node node3.example.com: [True, 100]"

What I believe happens is that drbd has finished syncing, but not all
fields are updated in 'Connected' state; maybe it's in WFBitmap[ST], or
in some other transient state we don't handle well.

The patch will change the _WaitForSync method to recheck up to a
hardcoded number of times if we're finished syncing but we're degraded
(using the same condition as the 'break' clause of the loop).

The downside of this change is that a genuinely degraded state (due to
network or disk failure) will cause an extra delay before the wait
aborts. If that delay is a concern, I'm happy to choose other values.

A better, long-term fix is to handle more DRBD states correctly (see the
bdev.DRBD8Status class).

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>
---
 lib/cmdlib.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index e491163ac..87f145234 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -1608,6 +1608,7 @@ def _WaitForSync(lu, instance, oneshot=False, unlock=False):
     lu.cfg.SetDiskID(dev, node)
 
   retries = 0
+  degr_retries = 10 # in seconds, as we sleep 1 second each time
   while True:
     max_time = 0
     done = True
@@ -1640,6 +1641,16 @@ def _WaitForSync(lu, instance, oneshot=False, unlock=False):
           rem_time = "no time estimate"
         lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
                         (instance.disks[i].iv_name, perc_done, rem_time))
+
+    # if we're done but degraded, let's do a few small retries, to
+    # make sure we see a stable and not transient situation; therefore
+    # we force restart of the loop
+    if (done or oneshot) and cumul_degraded and degr_retries > 0:
+      logging.info("Degraded disks found, %d retries left", degr_retries)
+      degr_retries -= 1
+      time.sleep(1)
+      continue
+
     if done or oneshot:
       break
 
-- 
GitLab