Skip to content
Snippets Groups Projects
Commit 8201b996 authored by Iustin Pop
Browse files

Rework QA interaction with the watcher


The interaction with the cron-launched watcher is a well-known failure mode of QA:

---- 2010-10-14 06:54:55.464839 time=0:00:56.764827 Test tools/move-instance

For the following tests it's recommended to turn off the ganeti-watcher cronjob.

---- 2010-10-14 06:54:55.465255 start Test automatic restart of instance by ganeti-watcher
…
Error: Domain 'instance1' does not exist.
Command: ssh -oEscapeChar=none -oBatchMode=yes -l root -t -oStrictHostKeyChecking=yes
  -oClearAllForwardings=yes -oForwardAgent=yes node2 'ganeti-watcher -d'
2010-10-13 23:55:04,479:  pid=1659 ganeti-watcher:626
 ERROR Can't acquire lock on state file /var/lib/ganeti/watcher.data: File already locked
---- 2010-10-14 06:55:04.513948 time=0:00:09.048693 Test automatic restart of instance by ganeti-watcher

To fix this, we disable the watcher during these tests and re-enable it
afterwards. To protect against the watcher already being disabled, we
enable it unconditionally at the start of the QA run (we do want it enabled,
in order to see the interaction between the watcher and the many
creation/disk-replace jobs, etc.).

Note: even after this patch, if a cron-watcher was started and is still
running during the test, we'll have locking issues. I think for now this
is OK, we'll have to see how often that happens.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Michael Hanselmann <hansmi@google.com>
parent 46c8a6ab
No related branches found
No related tags found
No related merge requests found
...@@ -105,6 +105,10 @@ def SetupCluster(rapi_user, rapi_secret): ...@@ -105,6 +105,10 @@ def SetupCluster(rapi_user, rapi_secret):
else: else:
# consider the nodes are already there # consider the nodes are already there
qa_node.MarkNodeAddedAll() qa_node.MarkNodeAddedAll()
# enable the watcher (unconditionally)
RunTest(qa_daemon.TestResumeWatcher)
if qa_config.TestEnabled('node-info'): if qa_config.TestEnabled('node-info'):
RunTest(qa_node.TestNodeInfo) RunTest(qa_node.TestNodeInfo)
...@@ -274,8 +278,8 @@ def RunDaemonTests(instance, pnode): ...@@ -274,8 +278,8 @@ def RunDaemonTests(instance, pnode):
consecutive_failures = \ consecutive_failures = \
qa_config.TestEnabled('instance-consecutive-failures') qa_config.TestEnabled('instance-consecutive-failures')
RunTest(qa_daemon.TestPauseWatcher)
if automatic_restart or consecutive_failures: if automatic_restart or consecutive_failures:
qa_daemon.PrintCronWarning()
if automatic_restart: if automatic_restart:
RunTest(qa_daemon.TestInstanceAutomaticRestart, pnode, instance) RunTest(qa_daemon.TestInstanceAutomaticRestart, pnode, instance)
...@@ -283,6 +287,8 @@ def RunDaemonTests(instance, pnode): ...@@ -283,6 +287,8 @@ def RunDaemonTests(instance, pnode):
if consecutive_failures: if consecutive_failures:
RunTest(qa_daemon.TestInstanceConsecutiveFailures, pnode, instance) RunTest(qa_daemon.TestInstanceConsecutiveFailures, pnode, instance)
RunTest(qa_daemon.TestResumeWatcher)
def RunHardwareFailureTests(instance, pnode, snode): def RunHardwareFailureTests(instance, pnode, snode):
"""Test cluster internal hardware failure recovery. """Test cluster internal hardware failure recovery.
...@@ -349,7 +355,7 @@ def main(): ...@@ -349,7 +355,7 @@ def main():
SetupCluster(rapi_user, rapi_secret) SetupCluster(rapi_user, rapi_secret)
# Load RAPI certificate # Load RAPI certificate
qa_rapi.Setup(rapi_user, rapi_secret) #qa_rapi.Setup(rapi_user, rapi_secret)
RunClusterTests() RunClusterTests()
RunOsTests() RunOsTests()
......
# #
# #
# Copyright (C) 2007 Google Inc. # Copyright (C) 2007, 2008, 2009, 2010 Google Inc.
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
...@@ -32,7 +32,7 @@ import qa_config ...@@ -32,7 +32,7 @@ import qa_config
import qa_utils import qa_utils
import qa_error import qa_error
from qa_utils import AssertEqual, StartSSH from qa_utils import AssertEqual, AssertMatch, StartSSH, GetCommandOutput
def _InstanceRunning(node, name): def _InstanceRunning(node, name):
...@@ -89,19 +89,41 @@ def _RunWatcherDaemon(): ...@@ -89,19 +89,41 @@ def _RunWatcherDaemon():
""" """
master = qa_config.GetMasterNode() master = qa_config.GetMasterNode()
cmd = ['ganeti-watcher', '-d'] cmd = ["ganeti-watcher", "-d", "--ignore-pause"]
AssertEqual(StartSSH(master['primary'], AssertEqual(StartSSH(master["primary"],
utils.ShellQuoteArgs(cmd)).wait(), 0) utils.ShellQuoteArgs(cmd)).wait(), 0)
def PrintCronWarning(): def TestPauseWatcher():
"""Shows a warning about the cron job. """Tests and pauses the watcher.
""" """
msg = ("For the following tests it's recommended to turn off the" master = qa_config.GetMasterNode()
" ganeti-watcher cronjob.")
print cmd = ["gnt-cluster", "watcher", "pause", "4h"]
print qa_utils.FormatWarning(msg) AssertEqual(StartSSH(master["primary"],
utils.ShellQuoteArgs(cmd)).wait(), 0)
cmd = ["gnt-cluster", "watcher", "info"]
output = GetCommandOutput(master["primary"],
utils.ShellQuoteArgs(cmd))
AssertMatch(output, r"^.*\bis paused\b.*")
def TestResumeWatcher():
"""Tests and unpauses the watcher.
"""
master = qa_config.GetMasterNode()
cmd = ["gnt-cluster", "watcher", "continue"]
AssertEqual(StartSSH(master["primary"],
utils.ShellQuoteArgs(cmd)).wait(), 0)
cmd = ["gnt-cluster", "watcher", "info"]
output = GetCommandOutput(master["primary"],
utils.ShellQuoteArgs(cmd))
AssertMatch(output, r"^.*\bis not paused\b.*")
def TestInstanceAutomaticRestart(node, instance): def TestInstanceAutomaticRestart(node, instance):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment