From e8ae0c200611cae990f0ab9a524a0adc0fd57b1a Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <hansmi@google.com>
Date: Thu, 18 Oct 2007 13:02:03 +0000
Subject: [PATCH] Implement disk failure QA test.

Testing failure on the primary node is currently disabled due to drbd
problems.

Reviewed-by: iustinp
---
Note: line breaks and leading whitespace of this patch were reconstructed
(two-space indentation). If context lines do not match, try
"git apply --ignore-whitespace".

 qa/ganeti-qa.py   |   6 +++
 qa/qa-sample.yaml |   5 ++
 qa/qa_instance.py | 142 +++++++++++++++++++++++++++++++++++++++++++++-
 qa/qa_utils.py    |  14 ++++--
 4 files changed, 163 insertions(+), 4 deletions(-)

diff --git a/qa/ganeti-qa.py b/qa/ganeti-qa.py
index 64c0006b2..3c85242a2 100755
--- a/qa/ganeti-qa.py
+++ b/qa/ganeti-qa.py
@@ -228,6 +228,12 @@ def main():
         if qa_config.TestEnabled('node-volumes'):
           RunTest(qa_node.TestNodeVolumes)

+        if qa_config.TestEnabled('instance-disk-failure'):
+          RunTest(qa_instance.TestInstanceMasterDiskFailure,
+                  instance, node, node2)
+          RunTest(qa_instance.TestInstanceSecondaryDiskFailure,
+                  instance, node, node2)
+
         RunTest(qa_instance.TestInstanceRemove, instance)
         del instance
       finally:
diff --git a/qa/qa-sample.yaml b/qa/qa-sample.yaml
index be6779733..139952741 100644
--- a/qa/qa-sample.yaml
+++ b/qa/qa-sample.yaml
@@ -57,6 +57,11 @@ tests:
   instance-reinstall: True
   instance-shutdown: True

+  # Make sure the disk(s) required for Dom0 are not included in the volume
+  # group used for instances. Otherwise the whole system may stop working
+  # until restarted.
+  instance-disk-failure: False
+
   # This test takes up to 6 minutes to complete
   instance-automatic-restart: False

diff --git a/qa/qa_instance.py b/qa/qa_instance.py
index b6bd59be4..ec17d703a 100644
--- a/qa/qa_instance.py
+++ b/qa/qa_instance.py
@@ -20,13 +20,24 @@

 """

+import re
+import time
+
 from ganeti import utils
 from ganeti import constants

 import qa_config
 import qa_utils
+import qa_error
+
+from qa_utils import AssertEqual, AssertNotEqual, StartSSH

-from qa_utils import AssertEqual, StartSSH
+
+def _GetDiskStatePath(disk):
+  """Returns the sysfs path of the state file for a block device.
+
+  """
+  return "/sys/block/%s/device/state" % disk


 def _GetGenericAddParameters():
@@ -172,3 +183,132 @@ def TestBackupList(expnode):
   cmd = ['gnt-backup', 'list', '--nodes=%s' % expnode['primary']]
   AssertEqual(StartSSH(master['primary'],
                        utils.ShellQuoteArgs(cmd)).wait(), 0)
+
+
+def _TestInstanceDiskFailure(instance, node, node2, onmaster):
+  """Testing disk failure.
+
+  Takes the physical disks backing the instance offline on one node,
+  writes to the instance's disk devices so the failure gets noticed and
+  finally brings the disks back online, restarts the instance and runs
+  a cluster verification.
+
+  instance must provide a 'name' entry, node and node2 a 'primary' entry.
+  onmaster selects the node whose disks are taken offline: node (reported
+  as "master") if true, node2 (reported as "secondary") otherwise.
+
+  Raises qa_error.Error if the disks cannot be determined or if a remote
+  command fails.
+
+  """
+  master = qa_config.GetMasterNode()
+  sq = utils.ShellQuoteArgs
+
+  instance_full = qa_utils.ResolveInstanceName(instance)
+  node_full = qa_utils.ResolveNodeName(node)
+  node2_full = qa_utils.ResolveNodeName(node2)
+
+  cmd = ['gnt-node', 'volumes', '--separator=|', '--no-headers',
+         '--output=node,phys,instance',
+         node['primary'], node2['primary']]
+  output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))
+
+  # Get physical disk names (the trailing partition number is stripped)
+  re_disk = re.compile(r'^/dev/([a-z]+)\d+$')
+  node2disk = {}
+  for line in output.splitlines():
+    (node_name, phys, inst) = line.split('|')
+    if inst == instance_full:
+      if node_name not in node2disk:
+        node2disk[node_name] = []
+
+      m = re_disk.match(phys)
+      if not m:
+        raise qa_error.Error("Unknown disk name format: %s" % phys)
+
+      name = m.group(1)
+      if name not in node2disk[node_name]:
+        node2disk[node_name].append(name)
+
+  if [node2_full, node_full][int(onmaster)] not in node2disk:
+    raise qa_error.Error("Couldn't find physical disks used on "
+                         "%s node" % ["secondary", "master"][int(onmaster)])
+
+  # Check whether nodes have ability to stop disks
+  for node_name, disks in node2disk.iteritems():
+    cmds = []
+    for disk in disks:
+      cmds.append(sq(["test", "-f", _GetDiskStatePath(disk)]))
+    AssertEqual(StartSSH(node_name, ' && '.join(cmds)).wait(), 0)
+
+  # Get device paths
+  cmd = ['gnt-instance', 'activate-disks', instance['name']]
+  output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))
+  devpath = []
+  for line in output.splitlines():
+    (_, _, tmpdevpath) = line.split(':')
+    devpath.append(tmpdevpath)
+
+  # Get drbd device paths (currently only collected, not used below)
+  cmd = ['gnt-instance', 'info', instance['name']]
+  output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))
+  pattern = (r'\s+-\s+type:\s+drbd,\s+.*$'
+             r'\s+primary:\s+(/dev/drbd\d+)\s+')
+  drbddevs = re.findall(pattern, output, re.M)
+
+  # Deactivate disks on the node selected by onmaster
+  halted_disks = []
+  cmds = []
+  for name in node2disk[[node2_full, node_full][int(onmaster)]]:
+    halted_disks.append(name)
+    cmds.append(sq(["echo", "offline"]) + " >%s" % _GetDiskStatePath(name))
+  AssertEqual(StartSSH([node2, node][int(onmaster)]['primary'],
+                       '; '.join(cmds)).wait(), 0)
+  try:
+    # Write something to the disks and give some time to notice the problem
+    cmds = []
+    for disk in devpath:
+      cmds.append(sq(["dd", "count=1", "bs=512", "conv=notrunc",
+                      "if=%s" % disk, "of=%s" % disk]))
+    for _ in (0, 1, 2):
+      AssertEqual(StartSSH(node['primary'], ' && '.join(cmds)).wait(), 0)
+      time.sleep(3)
+
+    # For manual checks
+    cmd = ['gnt-instance', 'info', instance['name']]
+    AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+  finally:
+    # Activate disks again
+    cmds = []
+    for name in halted_disks:
+      cmds.append(sq(["echo", "running"]) + " >%s" % _GetDiskStatePath(name))
+    AssertEqual(StartSSH([node2, node][int(onmaster)]['primary'],
+                         '; '.join(cmds)).wait(), 0)
+
+  # Restart instance
+  cmd = ['gnt-instance', 'shutdown', instance['name']]
+  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+  cmd = ['gnt-instance', 'startup', '--force', instance['name']]
+  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+  # Make sure disks are up again
+  cmd = ['gnt-instance', 'activate-disks', instance['name']]
+  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+  cmd = ['gnt-cluster', 'verify']
+  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+
+def TestInstanceMasterDiskFailure(instance, node, node2):
+  """Testing disk failure on master node."""
+  qa_utils.PrintError("Disk failure on primary node cannot be "
+                      "tested due to potential crashes.")
+  # The following can cause crashes, thus it's disabled until fixed
+  #return _TestInstanceDiskFailure(instance, node, node2, True)
+
+
+def TestInstanceSecondaryDiskFailure(instance, node, node2):
+  """Testing disk failure on secondary node."""
+  return _TestInstanceDiskFailure(instance, node, node2, False)
diff --git a/qa/qa_utils.py b/qa/qa_utils.py
index 41b8b9329..f94aa6fda 100644
--- a/qa/qa_utils.py
+++ b/qa/qa_utils.py
@@ -61,12 +61,20 @@ def _SetupColours():
 _SetupColours()


-def AssertEqual(first, second, msg=None):
+def AssertEqual(first, second):
   """Raises an error when values aren't equal.

   """
   if not first == second:
-    raise qa_error.Error(msg or '%r == %r' % (first, second))
+    raise qa_error.Error('%r == %r' % (first, second))
+
+
+def AssertNotEqual(first, second):
+  """Raises an error when values are equal.
+
+  """
+  if not first != second:
+    raise qa_error.Error('%r != %r' % (first, second))


 def GetSSHCommand(node, cmd, strict=True):
@@ -158,7 +166,7 @@ def ResolveInstanceName(instance):
   """Gets the full name of an instance.

   """
-  return _ResolveName(['gnt-instance', 'info', instance['info']],
+  return _ResolveName(['gnt-instance', 'info', instance['name']],
                       'Instance name')


--
GitLab