From 5178f1bcc26cd6cc1daebb94d344191cbc224ed3 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Tue, 13 Jan 2009 13:03:44 +0000
Subject: [PATCH] burnin: introduce instance alive checks

This patch adds instance alive checks after most start operations. The
check is done in a custom way:
  - the instance is expected to have an http server up and running
  - and it should serve the '/hostname.txt' resource containing the
    hostname of the instance

This allows checking that:
  - creation is working OK
  - start after failover (and in the future migrate) is ok
  - rename works correctly

By default, the check is disabled since one needs a custom OS for this
check.

The patch also fixes a wrong variable name from a previous burnin patch.

Reviewed-by: ultrotter
---
 tools/burnin | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/tools/burnin b/tools/burnin
index 291818858..bbde37ebb 100755
--- a/tools/burnin
+++ b/tools/burnin
@@ -27,6 +27,9 @@ import os
 import sys
 import optparse
 import time
+import socket
+import urllib2
+import errno
 from itertools import izip, islice, cycle
 from cStringIO import StringIO
 
@@ -41,6 +44,10 @@ from ganeti import utils
 USAGE = ("\tburnin -o OS_NAME [options...] instance_name ...")
 
 
+class InstanceDown(Exception):
+  """Raised when an instance fails the http alive check."""
+
+
 def Usage():
   """Shows program usage information and exits the program."""
 
@@ -190,6 +197,16 @@ class Burner(object):
                       dest="parallel",
                       help="Enable parallelization of some operations in"
                       " order to speed burnin or to test granular locking")
+    parser.add_option("--net-timeout", default=15, type="int",
+                      dest="net_timeout",
+                      help="The instance check network timeout in seconds"
+                      " (defaults to 15 seconds)")
+    parser.add_option("-C", "--http-check", default=False, action="store_true",
+                      dest="http_check",
+                      help="Enable checking of instance status via http,"
+                      " looking for /hostname.txt that should contain the"
+                      " name of the instance")
+
 
     options, args = parser.parse_args()
     if len(args) < 1 or options.os is None:
@@ -205,7 +222,7 @@ class Burner(object):
 
     if options.disk_template == constants.DT_DISKLESS:
       disk_size = disk_growth = []
-      opts.do_addremove_disks = False
+      options.do_addremove_disks = False
     else:
       disk_size = [utils.ParseUnit(v) for v in options.disk_size.split(",")]
       disk_growth = [utils.ParseUnit(v)
@@ -234,6 +251,8 @@ class Burner(object):
       }
     self.hvp = {}
 
+    socket.setdefaulttimeout(options.net_timeout)
+
   def GetState(self):
     """Read the cluster state from the config."""
     if self.opts.nodes:
@@ -314,6 +333,9 @@ class Burner(object):
     if self.opts.parallel:
       self.ExecJobSet(jobset)
 
+    for instance in self.instances:
+      self._CheckInstanceAlive(instance)
+
   def GrowDisks(self):
     """Grow both the os and the swap disks by the requested amount, if any."""
     for instance in self.instances:
@@ -360,6 +382,8 @@ class Burner(object):
 
       Log("- Failover instance %s" % (instance))
       self.ExecOp(op)
+    for instance in self.instances:
+      self._CheckInstanceAlive(instance)
 
   def ImportExport(self):
     """Export the instance, delete it, and import it back.
@@ -429,6 +453,9 @@ class Burner(object):
 
       self.to_rem.append(instance)
 
+    for instance in self.instances:
+      self._CheckInstanceAlive(instance)
+
   def StopInstance(self, instance):
     """Stop given instance."""
     op = opcodes.OpShutdownInstance(instance_name=instance)
@@ -454,6 +481,9 @@ class Burner(object):
       self.StopInstance(instance)
       self.StartInstance(instance)
 
+    for instance in self.instances:
+      self._CheckInstanceAlive(instance)
+
   def Remove(self):
     """Remove the instances."""
     for instance in self.to_rem:
@@ -469,10 +499,14 @@ class Burner(object):
       self.StopInstance(instance)
       self.RenameInstance(instance, rename)
       self.StartInstance(rename)
+      self._CheckInstanceAlive(rename)
       self.StopInstance(rename)
       self.RenameInstance(rename, instance)
       self.StartInstance(instance)
 
+    for instance in self.instances:
+      self._CheckInstanceAlive(instance)
+
   def Reinstall(self):
     """Reinstall the instances."""
     for instance in self.instances:
@@ -485,6 +519,8 @@ class Burner(object):
       Log("- Reinstall instance %s specifying the OS" % (instance,))
       self.ExecOp(op)
       self.StartInstance(instance)
+    for instance in self.instances:
+      self._CheckInstanceAlive(instance)
 
   def Reboot(self):
     """Reinstall the instances."""
@@ -495,6 +531,7 @@ class Burner(object):
                                       ignore_secondaries=False)
         Log("- Reboot instance %s with type '%s'" % (instance, reboot_type))
         self.ExecOp(op)
+        self._CheckInstanceAlive(instance)
 
   def ActivateDisks(self):
     """Activate and deactivate disks of the instances."""
@@ -509,6 +546,8 @@ class Burner(object):
       Log("- Deactivate disks of offline instance %s" % (instance,))
       self.ExecOp(op_deact)
       self.StartInstance(instance)
+    for instance in self.instances:
+      self._CheckInstanceAlive(instance)
 
   def AddRemoveDisks(self):
     """Add and remove an extra disk for the instances."""
@@ -524,6 +563,8 @@ class Burner(object):
       Log("- Removing the last disk of instance %s" % (instance,))
       self.ExecOp(op_rem)
       self.StartInstance(instance)
+    for instance in self.instances:
+      self._CheckInstanceAlive(instance)
 
   def AddRemoveNICs(self):
     """Add and remove an extra NIC for the instances."""
@@ -537,6 +578,33 @@ class Burner(object):
       Log("- Removing the last NIC of instance %s" % (instance,))
       self.ExecOp(op_rem)
 
+  def _CheckInstanceAlive(self, instance):
+    """Check if an instance is alive by doing http checks.
+
+    Fetches http://<instance>/hostname.txt and compares its content with
+    the instance name. ECONNREFUSED is retried once per second, up to
+    net_timeout attempts; any other failure raises InstanceDown.
+
+    """
+    if not self.opts.http_check:
+      return
+    url = None
+    for _attempt in range(self.opts.net_timeout):
+      try:
+        url = urllib2.urlopen("http://%s/hostname.txt" % instance)
+        break  # stop at the first successful fetch
+      except urllib2.URLError, err:
+        # args may be empty (e.g. for HTTPError); only refusals are retried
+        if not (err.args and err.args[0][0] == errno.ECONNREFUSED):
+          raise InstanceDown(instance, str(err))
+        time.sleep(1)
+    if url is None:
+      raise InstanceDown(instance, "Connection refused until timeout")
+    hostname = url.read().strip()
+    if hostname != instance:
+      raise InstanceDown(instance, ("Hostname mismatch, expected %s, got %s" %
+                                    (instance, hostname)))
+
   def BurninCluster(self):
     """Test a cluster intensively.
 
-- 
GitLab