From e8ae0c200611cae990f0ab9a524a0adc0fd57b1a Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <hansmi@google.com>
Date: Thu, 18 Oct 2007 13:02:03 +0000
Subject: [PATCH] Implement disk failure QA test.

Testing failure on the primary node is currently disabled due to drbd problems.

Reviewed-by: iustinp
---
 qa/ganeti-qa.py   |   6 +++
 qa/qa-sample.yaml |   5 ++
 qa/qa_instance.py | 125 +++++++++++++++++++++++++++++++++++++++++++++-
 qa/qa_utils.py    |  14 ++++--
 4 files changed, 146 insertions(+), 4 deletions(-)

diff --git a/qa/ganeti-qa.py b/qa/ganeti-qa.py
index 64c0006b2..3c85242a2 100755
--- a/qa/ganeti-qa.py
+++ b/qa/ganeti-qa.py
@@ -228,6 +228,12 @@ def main():
         if qa_config.TestEnabled('node-volumes'):
           RunTest(qa_node.TestNodeVolumes)
 
+        if qa_config.TestEnabled('instance-disk-failure'):
+          RunTest(qa_instance.TestInstanceMasterDiskFailure,
+                  instance, node, node2)
+          RunTest(qa_instance.TestInstanceSecondaryDiskFailure,
+                  instance, node, node2)
+
         RunTest(qa_instance.TestInstanceRemove, instance)
         del instance
       finally:
diff --git a/qa/qa-sample.yaml b/qa/qa-sample.yaml
index be6779733..139952741 100644
--- a/qa/qa-sample.yaml
+++ b/qa/qa-sample.yaml
@@ -57,6 +57,11 @@ tests:
   instance-reinstall: True
   instance-shutdown: True
 
+  # Make sure the disk(s) required for Dom0 are not part of the volume group
+  # used for instances. Otherwise the whole system may stop working until
+  # restarted.
+  instance-disk-failure: False
+
   # This test takes up to 6 minutes to complete
   instance-automatic-restart: False
 
diff --git a/qa/qa_instance.py b/qa/qa_instance.py
index b6bd59be4..ec17d703a 100644
--- a/qa/qa_instance.py
+++ b/qa/qa_instance.py
@@ -20,13 +20,21 @@
 
 """
 
+import re
+import time
+
 from ganeti import utils
 from ganeti import constants
 
 import qa_config
 import qa_utils
+import qa_error
+
+from qa_utils import AssertEqual, AssertNotEqual, StartSSH
 
-from qa_utils import AssertEqual, StartSSH
+
+def _GetDiskStatePath(disk):
+  return "/sys/block/%s/device/state" % disk
 
 
 def _GetGenericAddParameters():
@@ -172,3 +180,118 @@ def TestBackupList(expnode):
   cmd = ['gnt-backup', 'list', '--nodes=%s' % expnode['primary']]
   AssertEqual(StartSSH(master['primary'],
                        utils.ShellQuoteArgs(cmd)).wait(), 0)
+
+
+def _TestInstanceDiskFailure(instance, node, node2, onmaster):
+  """Testing disk failure."""
+  master = qa_config.GetMasterNode()
+  sq = utils.ShellQuoteArgs
+
+  instance_full = qa_utils.ResolveInstanceName(instance)
+  node_full = qa_utils.ResolveNodeName(node)
+  node2_full = qa_utils.ResolveNodeName(node2)
+
+  cmd = ['gnt-node', 'volumes', '--separator=|', '--no-headers',
+         '--output=node,phys,instance',
+         node['primary'], node2['primary']]
+  output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))
+
+  # Map each node name to the physical disks (e.g. "sda") used by the instance
+  re_disk = re.compile(r'^/dev/([a-z]+)\d+$')
+  node2disk = {}
+  for line in output.splitlines():
+    (node_name, phys, inst) = line.split('|')
+    if inst == instance_full:
+      if node_name not in node2disk:
+        node2disk[node_name] = []
+
+      m = re_disk.match(phys)
+      if not m:
+        raise qa_error.Error("Unknown disk name format: %s" % phys)
+
+      name = m.group(1)
+      if name not in node2disk[node_name]:
+        node2disk[node_name].append(name)
+
+  if [node2_full, node_full][int(onmaster)] not in node2disk:
+    raise qa_error.Error("Couldn't find physical disks used on "
+                         "%s node" % ["secondary", "master"][int(onmaster)])
+
+  # Check whether nodes have ability to stop disks
+  for node_name, disks in node2disk.iteritems():
+    cmds = []
+    for disk in disks:
+      cmds.append(sq(["test", "-f", _GetDiskStatePath(disk)]))
+    AssertEqual(StartSSH(node_name, ' && '.join(cmds)).wait(), 0)
+
+  # Get device paths
+  cmd = ['gnt-instance', 'activate-disks', instance['name']]
+  output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))
+  devpath = []
+  for line in output.splitlines():
+    (_, _, tmpdevpath) = line.split(':')
+    devpath.append(tmpdevpath)
+
+  # Get drbd device paths (NOTE: drbddevs is not used further below)
+  cmd = ['gnt-instance', 'info', instance['name']]
+  output = qa_utils.GetCommandOutput(master['primary'], sq(cmd))
+  pattern = (r'\s+-\s+type:\s+drbd,\s+.*$'
+             r'\s+primary:\s+(/dev/drbd\d+)\s+')
+  drbddevs = re.findall(pattern, output, re.M)
+
+  # Take the disks offline on the selected node (node if onmaster, else node2)
+  halted_disks = []
+  cmds = []
+  for name in node2disk[[node2_full, node_full][int(onmaster)]]:
+    halted_disks.append(name)
+    cmds.append(sq(["echo", "offline"]) + " >%s" % _GetDiskStatePath(name))
+  AssertEqual(StartSSH([node2, node][int(onmaster)]['primary'],
+                       '; '.join(cmds)).wait(), 0)
+  try:
+    # Write something to the disks and give some time to notice the problem
+    cmds = []
+    for disk in devpath:
+      cmds.append(sq(["dd", "count=1", "bs=512", "conv=notrunc",
+                      "if=%s" % disk, "of=%s" % disk]))
+    for _ in (0, 1, 2):
+      AssertEqual(StartSSH(node['primary'], ' && '.join(cmds)).wait(), 0)
+      time.sleep(3)
+
+    # For manual checks
+    cmd = ['gnt-instance', 'info', instance['name']]
+    AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+  finally:
+    # Always set the halted disks back to "running", even if a check failed
+    cmds = []
+    for name in halted_disks:
+      cmds.append(sq(["echo", "running"]) + " >%s" % _GetDiskStatePath(name))
+    AssertEqual(StartSSH([node2, node][int(onmaster)]['primary'],
+                         '; '.join(cmds)).wait(), 0)
+
+  # Restart instance
+  cmd = ['gnt-instance', 'shutdown', instance['name']]
+  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+  cmd = ['gnt-instance', 'startup', '--force', instance['name']]
+  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+  # Make sure disks are up again
+  cmd = ['gnt-instance', 'activate-disks', instance['name']]
+  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+  cmd = ['gnt-cluster', 'verify']
+  AssertEqual(StartSSH(master['primary'], sq(cmd)).wait(), 0)
+
+
+def TestInstanceMasterDiskFailure(instance, node, node2):
+  """Testing disk failure on master node."""
+  qa_utils.PrintError("Disk failure on primary node cannot be "
+                      "tested due to potential crashes.")
+  # The following can cause crashes, thus it's disabled until fixed
+  #return _TestInstanceDiskFailure(instance, node, node2, True)
+
+
+def TestInstanceSecondaryDiskFailure(instance, node, node2):
+  """Testing disk failure on secondary node."""
+  return _TestInstanceDiskFailure(instance, node, node2, False)
diff --git a/qa/qa_utils.py b/qa/qa_utils.py
index 41b8b9329..f94aa6fda 100644
--- a/qa/qa_utils.py
+++ b/qa/qa_utils.py
@@ -61,12 +61,20 @@ def _SetupColours():
 _SetupColours()
 
 
-def AssertEqual(first, second, msg=None):
+def AssertEqual(first, second):
   """Raises an error when values aren't equal.
 
   """
   if not first == second:
-    raise qa_error.Error(msg or '%r == %r' % (first, second))
+    raise qa_error.Error('%r == %r' % (first, second))
+
+
+def AssertNotEqual(first, second):
+  """Raises an error when values are equal.
+
+  """
+  if not first != second:
+    raise qa_error.Error('%r != %r' % (first, second))
 
 
 def GetSSHCommand(node, cmd, strict=True):
@@ -158,7 +166,7 @@ def ResolveInstanceName(instance):
   """Gets the full name of an instance.
 
   """
-  return _ResolveName(['gnt-instance', 'info', instance['info']],
+  return _ResolveName(['gnt-instance', 'info', instance['name']],
                       'Instance name')
 
 
-- 
GitLab