From db1473059f58e9df934a31e6c1a70264f490e8aa Mon Sep 17 00:00:00 2001
From: Tom Limoncelli <tlim@google.com>
Date: Wed, 2 Jun 2010 11:06:37 -0400
Subject: [PATCH] ganeti-watcher should attempt to fix ganeti-rapi

Update ganeti-watcher so that it tests the master's RAPI port with a
simple test (in this case GetVersion). If it fails, make one attempt
at restarting ganeti-rapi and retest.

- daemons/ganeti-watcher: Test rapi and make one attempt at restarting it.
- lib/utils.py: add StopDaemon() function.

Signed-off-by: Tom Limoncelli <tlim@google.com>
Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Michael Hanselmann <hansmi@google.com>
---
 daemons/ganeti-watcher | 43 ++++++++++++++++++++++++++++++++++++++++++
 lib/utils.py           | 13 +++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 1f82db8b9..b0d924d43 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -47,8 +47,11 @@ from ganeti import luxi
 from ganeti import ssconf
 from ganeti import bdev
 from ganeti import hypervisor
+from ganeti import rapi
 from ganeti.confd import client as confd_client
 
+import ganeti.rapi.client # pylint: disable-msg=W0611
+
 
 MAXTRIES = 5
 BAD_STATES = ['ERROR_down']
@@ -595,6 +598,34 @@ def OpenStateFile(path):
   return os.fdopen(statefile_fd, "w+")
 
 
+def IsRapiResponding(hostname):
+  """Connects to RAPI port and does a simple test.
+
+  Calls GetVersion using an SSL config built from constants.RAPI_CERT_FILE.
+  CertificateError and GanetiApiError are logged as warnings, not raised.
+
+  @type hostname: string
+  @param hostname: hostname of the node to connect to.
+  @rtype: bool
+  @return: True if GetVersion succeeded and equals constants.RAPI_VERSION
+
+  """
+  ssl_config = rapi.client.CertAuthorityVerify(constants.RAPI_CERT_FILE)
+  rapi_client = \
+    rapi.client.GanetiRapiClient(hostname,
+                                 config_ssl_verification=ssl_config)
+  try:
+    master_version = rapi_client.GetVersion()
+  except rapi.client.CertificateError, err:
+    logging.warning("RAPI Error: CertificateError (%s)", err)
+    return False
+  except rapi.client.GanetiApiError, err:
+    logging.warning("RAPI Error: GanetiApiError (%s)", err)
+    return False
+  logging.debug("RAPI Result: master_version is %s", master_version)
+  return master_version == constants.RAPI_VERSION
+
+
 def ParseOptions():
   """Parse the command line options.
 
@@ -668,6 +699,18 @@ def main():
       # we are on master now
       utils.EnsureDaemon(constants.RAPI)
 
+      # If RAPI isn't responding to queries, try one restart.
+      logging.debug("Attempting to talk with RAPI.")
+      if not IsRapiResponding(constants.LOCALHOST_IP_ADDRESS):
+        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
+                        " Restarting Ganeti RAPI.")
+        utils.StopDaemon(constants.RAPI)
+        utils.EnsureDaemon(constants.RAPI)
+        logging.debug("Second attempt to talk with RAPI")
+        if not IsRapiResponding(constants.LOCALHOST_IP_ADDRESS):
+          logging.fatal("RAPI is not responding. Please investigate.")
+      logging.debug("Successfully talked to RAPI.")
+
       try:
         watcher = Watcher(options, notepad)
       except errors.ConfigurationError:
diff --git a/lib/utils.py b/lib/utils.py
index 70f5b41fa..a1fd259e0 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -2261,6 +2261,19 @@ def EnsureDaemon(name):
   return True
 
 
+def StopDaemon(name):
+  """Stop daemon via DAEMON_UTIL; returns False and logs error on failure.
+
+  """
+  result = RunCmd([constants.DAEMON_UTIL, "stop", name])
+  if result.failed:
+    logging.error("Can't stop daemon '%s', failure %s, output: %s",
+                  name, result.fail_reason, result.output)
+    return False
+
+  return True
+
+
 def WritePidFile(name):
   """Write the current process pidfile.
 
-- 
GitLab