From c4f0219c486fff45b5f883efe036154aa1243c04 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Mon, 25 May 2009 15:41:56 +0200
Subject: [PATCH] watcher: automatically restart noded/rapi

This patch makes the watcher automatically restart the node and rapi
daemons, if they are not running (as per the PID file).

This is not an exhaustive test; a better one would be TCP connect to the
port, and an even better one a simple protocol ping (e.g. get / for rapi
and a rpc_call_alive for noded), but since we don't know how they've
been started we can't implement it today. rapi would need to write the
SSL/port to a file, and noded something similar, so that we know how to
connect.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Michael Hanselmann <hansmi@google.com>
---
 daemons/ganeti-watcher | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index b762c6a9a..2749de63f 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -80,6 +80,20 @@ def StartMaster():
   return not result.failed
 
 
+def EnsureDaemon(daemon):
+  """Start the given daemon unless its PID file points to a live process.
+
+  """
+  pidfile = utils.DaemonPidFileName(daemon)
+  pid = utils.ReadPidFile(pidfile)
+  if pid == 0 or not utils.IsProcessAlive(pid): # no file or dead pid
+    logging.debug("Daemon '%s' not alive, trying to restart", daemon)
+    result = utils.RunCmd([daemon])
+    if result.failed: # check the failure flag, as StartMaster does
+      logging.error("Can't start daemon '%s', failure %s, output: %s",
+                    daemon, result.fail_reason, result.output)
+
+
 class WatcherState(object):
   """Interface to a state file recording restart attempts.
 
@@ -464,6 +478,10 @@ def main():
 
   update_file = False
   try:
+    # on master or not, try to start the node daemon (the _PID constant
+    # is the same as the daemon name)
+    EnsureDaemon(constants.NODED_PID)
+
     notepad = WatcherState()
     try:
       try:
@@ -482,6 +500,9 @@ def main():
         # else retry the connection
         client = cli.GetClient()
 
+      # we are on master now (use _PID but is the same as daemon name)
+      EnsureDaemon(constants.RAPI_PID)
+
       try:
         watcher = Watcher(options, notepad)
       except errors.ConfigurationError:
-- 
GitLab