Skip to content
Snippets Groups Projects
Commit e8ae0c20 authored by Michael Hanselmann's avatar Michael Hanselmann
Browse files

Implement disk failure QA test.

Testing failure on the primary node is currently disabled due to drbd problems.

Reviewed-by: iustinp
parent 579d4337
No related branches found
No related tags found
No related merge requests found
...@@ -228,6 +228,12 @@ def main(): ...@@ -228,6 +228,12 @@ def main():
if qa_config.TestEnabled('node-volumes'): if qa_config.TestEnabled('node-volumes'):
RunTest(qa_node.TestNodeVolumes) RunTest(qa_node.TestNodeVolumes)
if qa_config.TestEnabled('instance-disk-failure'):
RunTest(qa_instance.TestInstanceMasterDiskFailure,
instance, node, node2)
RunTest(qa_instance.TestInstanceSecondaryDiskFailure,
instance, node, node2)
RunTest(qa_instance.TestInstanceRemove, instance) RunTest(qa_instance.TestInstanceRemove, instance)
del instance del instance
finally: finally:
......
...@@ -57,6 +57,11 @@ tests: ...@@ -57,6 +57,11 @@ tests:
instance-reinstall: True instance-reinstall: True
instance-shutdown: True instance-shutdown: True
# Make sure the disk(s) required by Dom0 are not part of the volume group
# used for instances. Otherwise the whole system may stop working until
# restarted.
instance-disk-failure: False
# This test takes up to 6 minutes to complete # This test takes up to 6 minutes to complete
instance-automatic-restart: False instance-automatic-restart: False
......
...@@ -20,13 +20,21 @@ ...@@ -20,13 +20,21 @@
""" """
import re
import time
from ganeti import utils from ganeti import utils
from ganeti import constants from ganeti import constants
import qa_config import qa_config
import qa_utils import qa_utils
import qa_error
from qa_utils import AssertEqual, AssertNotEqual, StartSSH
from qa_utils import AssertEqual, StartSSH
def _GetDiskStatePath(disk):
return "/sys/block/%s/device/state" % disk
def _GetGenericAddParameters(): def _GetGenericAddParameters():
...@@ -172,3 +180,118 @@ def TestBackupList(expnode): ...@@ -172,3 +180,118 @@ def TestBackupList(expnode):
cmd = ['gnt-backup', 'list', '--nodes=%s' % expnode['primary']] cmd = ['gnt-backup', 'list', '--nodes=%s' % expnode['primary']]
AssertEqual(StartSSH(master['primary'], AssertEqual(StartSSH(master['primary'],
utils.ShellQuoteArgs(cmd)).wait(), 0) utils.ShellQuoteArgs(cmd)).wait(), 0)
def _TestInstanceDiskFailure(instance, node, node2, onmaster):
    """Simulate a physical disk failure below an instance and recover.

    The physical disks backing the instance on the selected node are set
    to "offline" through their sysfs state file, one block of every
    activated instance disk is read and written back a few times, and the
    disks are then set to "running" again. Afterwards the instance is
    restarted, its disks are activated and the cluster is verified.

    Parameters:
      instance: instance dict; its 'name' entry is passed to gnt-instance
      node: node dict; the dd commands are run on node['primary']
      node2: node dict of the other node
      onmaster: if true the disks on `node` are halted, otherwise the
        disks on `node2`

    Raises qa_error.Error if a physical volume name has an unexpected
    format, if no physical disks are found for the selected node, or if
    any of the executed commands fails (via AssertEqual).
    """
    master = qa_config.GetMasterNode()
    sq = utils.ShellQuoteArgs

    instance_full = qa_utils.ResolveInstanceName(instance)
    node_full = qa_utils.ResolveNodeName(node)
    node2_full = qa_utils.ResolveNodeName(node2)

    # Resolve once which node gets its disks halted
    if onmaster:
        target = node
        target_full = node_full
        target_desc = "master"
    else:
        target = node2
        target_full = node2_full
        target_desc = "secondary"

    cmd = ['gnt-node', 'volumes', '--separator=|', '--no-headers',
           '--output=node,phys,instance',
           node['primary'], node2['primary']]
    output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))

    # Get physical disk names (partition number stripped), grouped by node
    re_disk = re.compile(r'^/dev/([a-z]+)\d+$')
    node2disk = {}
    for line in output.splitlines():
        (node_name, phys, inst) = line.split('|')
        if inst != instance_full:
            continue

        disks = node2disk.setdefault(node_name, [])

        m = re_disk.match(phys)
        if not m:
            # Report the offending volume; the original referenced an
            # unbound name here and died with NameError instead
            raise qa_error.Error("Unknown disk name format: %s" % phys)

        name = m.group(1)
        if name not in disks:
            disks.append(name)

    if target_full not in node2disk:
        raise qa_error.Error("Couldn't find physical disks used on "
                             "%s node" % target_desc)

    # Check whether nodes have ability to stop disks
    for node_name, disks in node2disk.items():
        cmds = [sq(["test", "-f", _GetDiskStatePath(disk)])
                for disk in disks]
        AssertEqual(StartSSH(node_name, ' && '.join(cmds)).wait(), 0)

    # Get device paths (third ':'-separated field of each output line)
    cmd = ['gnt-instance', 'activate-disks', instance['name']]
    output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))
    devpath = []
    for line in output.splitlines():
        (_, _, tmpdevpath) = line.split(':')
        devpath.append(tmpdevpath)

    halted_disks = list(node2disk[target_full])
    halt_cmds = [sq(["echo", "offline"]) + " >%s" % _GetDiskStatePath(name)
                 for name in halted_disks]

    try:
        # Deactivate disks on the selected node. This happens inside the
        # try block so that the disks are switched back on even when only
        # some of the commands took effect before a failure.
        AssertEqual(StartSSH(target['primary'],
                             '; '.join(halt_cmds)).wait(), 0)

        # Write something to the disks and give some time to notice the
        # problem
        cmds = [sq(["dd", "count=1", "bs=512", "conv=notrunc",
                    "if=%s" % disk, "of=%s" % disk])
                for disk in devpath]
        for _ in (0, 1, 2):
            AssertEqual(StartSSH(node['primary'],
                                 ' && '.join(cmds)).wait(), 0)
            time.sleep(3)

        # For manual checks
        cmd = ['gnt-instance', 'info', instance['name']]
        AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)

    finally:
        # Activate disks again
        cmds = [sq(["echo", "running"]) + " >%s" % _GetDiskStatePath(name)
                for name in halted_disks]
        AssertEqual(StartSSH(target['primary'], '; '.join(cmds)).wait(), 0)

    # Restart instance
    cmd = ['gnt-instance', 'shutdown', instance['name']]
    AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)

    cmd = ['gnt-instance', 'startup', '--force', instance['name']]
    AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)

    # Make sure disks are up again
    cmd = ['gnt-instance', 'activate-disks', instance['name']]
    AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)

    cmd = ['gnt-cluster', 'verify']
    AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
def TestInstanceMasterDiskFailure(instance, node, node2):
    """Placeholder for the disk failure test on the primary node.

    The test itself is disabled: this function only prints a notice and
    does not use its arguments.
    """
    notice = ("Disk failure on primary node cannot be "
              "tested due to potential crashes.")
    qa_utils.PrintError(notice)

    # Running the real test can cause crashes, so it stays disabled until
    # that is fixed. To re-enable, delegate to
    # _TestInstanceDiskFailure(instance, node, node2, True) and return its
    # result.
def TestInstanceSecondaryDiskFailure(instance, node, node2):
    """Run the disk failure test against the secondary node.

    Delegates to _TestInstanceDiskFailure with `onmaster` disabled, so the
    disks found on `node2` are the ones being halted.
    """
    result = _TestInstanceDiskFailure(instance, node, node2, onmaster=False)
    return result
...@@ -61,12 +61,20 @@ def _SetupColours(): ...@@ -61,12 +61,20 @@ def _SetupColours():
_SetupColours() _SetupColours()
def AssertEqual(first, second, msg=None):
    """Raises an error when values aren't equal.

    `msg`, when given and non-empty, replaces the default error text,
    which shows the repr of both values.
    """
    if not first == second:
        raise qa_error.Error(msg or '%r == %r' % (first, second))
def AssertNotEqual(first, second):
    """Raises an error when values are equal.

    The error text shows the repr of both values.
    """
    if not first != second:
        raise qa_error.Error('%r != %r' % (first, second))
def GetSSHCommand(node, cmd, strict=True): def GetSSHCommand(node, cmd, strict=True):
...@@ -158,7 +166,7 @@ def ResolveInstanceName(instance): ...@@ -158,7 +166,7 @@ def ResolveInstanceName(instance):
"""Gets the full name of an instance. """Gets the full name of an instance.
""" """
return _ResolveName(['gnt-instance', 'info', instance['info']], return _ResolveName(['gnt-instance', 'info', instance['name']],
'Instance name') 'Instance name')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment