From db8667b7c0f55ab1cdecebaa56cc79dcc0ce9933 Mon Sep 17 00:00:00 2001 From: Iustin Pop <iustin@google.com> Date: Tue, 3 Nov 2009 16:14:07 +0100 Subject: [PATCH] Workaround fake failures in drbd+live migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch is an attempt to fix the ugly issue during migration: Cannot resync disks on node …: [True, 100] If my understanding is correct, sometimes we poll the /proc/drbd file at an inopportune moment, while it's being updated, or while the DRBD device is changing state, and we see an unexpected state. Based on the assumption that this is just a transient state, rather than aborting directly, we change the backend.DrbdWaitSync() function to retry the operation a few times, giving DRBD a chance to settle down at the end of the resync. Signed-off-by: Iustin Pop <iustin@google.com> Reviewed-by: Guido Trotter <ultrotter@google.com> --- lib/backend.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/backend.py b/lib/backend.py index 2860564f8..50abfb89f 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -2547,14 +2547,25 @@ def DrbdWaitSync(nodes_ip, disks): """Wait until DRBDs have synchronized.
""" + def _helper(rd): + stats = rd.GetProcStatus() + if not (stats.is_connected or stats.is_in_resync): + raise utils.RetryAgain() + return stats + bdevs = _FindDisks(nodes_ip, disks) min_resync = 100 alldone = True for rd in bdevs: - stats = rd.GetProcStatus() - if not (stats.is_connected or stats.is_in_resync): - _Fail("DRBD device %s is not in sync: stats=%s", rd, stats) + try: + # poll each second for 15 seconds + stats = utils.Retry(_helper, 1, 15, args=[rd]) + except utils.RetryTimeout: + stats = rd.GetProcStatus() + # last check + if not (stats.is_connected or stats.is_in_resync): + _Fail("DRBD device %s is not in sync: stats=%s", rd, stats) alldone = alldone and (not stats.is_in_resync) if stats.sync_percent is not None: min_resync = min(min_resync, stats.sync_percent) -- GitLab