From 8201b996c90056a382185412bbfebd2a49a6e5ed Mon Sep 17 00:00:00 2001 From: Iustin Pop <iustin@google.com> Date: Thu, 14 Oct 2010 11:40:37 +0200 Subject: [PATCH] Rework QA interaction with the watcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interaction with cron-launched watcher is a well-known failure mode of QA: ---- 2010-10-14 06:54:55.464839 time=0:00:56.764827 Test tools/move-instance For the following tests it's recommended to turn off the ganeti-watcher cronjob. ---- 2010-10-14 06:54:55.465255 start Test automatic restart of instance by ganeti-watcher … Error: Domain 'instance1' does not exist. Command: ssh -oEscapeChar=none -oBatchMode=yes -l root -t -oStrictHostKeyChecking=yes -oClearAllForwardings=yes -oForwardAgent=yes node2 'ganeti-watcher -d' 2010-10-13 23:55:04,479: pid=1659 ganeti-watcher:626 ERROR Can't acquire lock on state file /var/lib/ganeti/watcher.data: File already locked ---- 2010-10-14 06:55:04.513948 time=0:00:09.048693 Test automatic restart of instance by ganeti-watcher In order to fix this, we disable the watcher during these tests, and re-enable it afterwards. To protect against watcher being disabled, we enable it unconditionally at the start of the QA (we do want it enabled, in order to see the interaction between the watcher and many creation/disk replace jobs, etc.). Note: even after this patch, if a cron-watcher was started and is still running during the test, we'll have locking issues. I think for now this is OK, we'll have to see how often that happens. 
Signed-off-by: Iustin Pop <iustin@google.com> Reviewed-by: Michael Hanselmann <hansmi@google.com> --- qa/ganeti-qa.py | 10 ++++++++-- qa/qa_daemon.py | 42 ++++++++++++++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/qa/ganeti-qa.py b/qa/ganeti-qa.py index 5c5023fa5..851937cfa 100755 --- a/qa/ganeti-qa.py +++ b/qa/ganeti-qa.py @@ -105,6 +105,10 @@ def SetupCluster(rapi_user, rapi_secret): else: # consider the nodes are already there qa_node.MarkNodeAddedAll() + + # enable the watcher (unconditionally) + RunTest(qa_daemon.TestResumeWatcher) + if qa_config.TestEnabled('node-info'): RunTest(qa_node.TestNodeInfo) @@ -274,8 +278,8 @@ def RunDaemonTests(instance, pnode): consecutive_failures = \ qa_config.TestEnabled('instance-consecutive-failures') + RunTest(qa_daemon.TestPauseWatcher) if automatic_restart or consecutive_failures: - qa_daemon.PrintCronWarning() if automatic_restart: RunTest(qa_daemon.TestInstanceAutomaticRestart, pnode, instance) @@ -283,6 +287,8 @@ def RunDaemonTests(instance, pnode): if consecutive_failures: RunTest(qa_daemon.TestInstanceConsecutiveFailures, pnode, instance) + RunTest(qa_daemon.TestResumeWatcher) + def RunHardwareFailureTests(instance, pnode, snode): """Test cluster internal hardware failure recovery. @@ -349,7 +355,7 @@ def main(): SetupCluster(rapi_user, rapi_secret) # Load RAPI certificate - qa_rapi.Setup(rapi_user, rapi_secret) + #qa_rapi.Setup(rapi_user, rapi_secret) RunClusterTests() RunOsTests() diff --git a/qa/qa_daemon.py b/qa/qa_daemon.py index 4817bfef0..dd7fadbbb 100644 --- a/qa/qa_daemon.py +++ b/qa/qa_daemon.py @@ -1,7 +1,7 @@ # # -# Copyright (C) 2007 Google Inc. +# Copyright (C) 2007, 2008, 2009, 2010 Google Inc. 
# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -32,7 +32,7 @@ import qa_config import qa_utils import qa_error -from qa_utils import AssertEqual, StartSSH +from qa_utils import AssertEqual, AssertMatch, StartSSH, GetCommandOutput def _InstanceRunning(node, name): @@ -89,19 +89,41 @@ def _RunWatcherDaemon(): """ master = qa_config.GetMasterNode() - cmd = ['ganeti-watcher', '-d'] - AssertEqual(StartSSH(master['primary'], + cmd = ["ganeti-watcher", "-d", "--ignore-pause"] + AssertEqual(StartSSH(master["primary"], utils.ShellQuoteArgs(cmd)).wait(), 0) -def PrintCronWarning(): - """Shows a warning about the cron job. +def TestPauseWatcher(): + """Tests and pauses the watcher. """ - msg = ("For the following tests it's recommended to turn off the" - " ganeti-watcher cronjob.") - print - print qa_utils.FormatWarning(msg) + master = qa_config.GetMasterNode() + + cmd = ["gnt-cluster", "watcher", "pause", "4h"] + AssertEqual(StartSSH(master["primary"], + utils.ShellQuoteArgs(cmd)).wait(), 0) + + cmd = ["gnt-cluster", "watcher", "info"] + output = GetCommandOutput(master["primary"], + utils.ShellQuoteArgs(cmd)) + AssertMatch(output, r"^.*\bis paused\b.*") + + +def TestResumeWatcher(): + """Tests and unpauses the watcher. + + """ + master = qa_config.GetMasterNode() + + cmd = ["gnt-cluster", "watcher", "continue"] + AssertEqual(StartSSH(master["primary"], + utils.ShellQuoteArgs(cmd)).wait(), 0) + + cmd = ["gnt-cluster", "watcher", "info"] + output = GetCommandOutput(master["primary"], + utils.ShellQuoteArgs(cmd)) + AssertMatch(output, r"^.*\bis not paused\b.*") def TestInstanceAutomaticRestart(node, instance): -- GitLab