From c4f0219c486fff45b5f883efe036154aa1243c04 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Mon, 25 May 2009 15:41:56 +0200
Subject: [PATCH] watcher: automatically restart noded/rapi

This patch makes the watcher automatically restart the node and rapi
daemons, if they are not running (as per the PID file).

This is not an exhaustive test; a better one would be TCP connect to the
port, and an even better one a simple protocol ping (e.g. get / for rapi
and a rpc_call_alive for noded), but since we don't know how they've been
started we can't implement it today. rapi would need to write the
SSL/port to a file, and noded something similar, so that we know how to
connect.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Michael Hanselmann <hansmi@google.com>
---
 daemons/ganeti-watcher | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index b762c6a9a..2749de63f 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -80,6 +80,20 @@ def StartMaster():
   return not result.failed
 
 
+def EnsureDaemon(daemon):
+  """Start daemon (name == command) unless its PID file names a live process.
+
+  """
+  pidfile = utils.DaemonPidFileName(daemon)
+  pid = utils.ReadPidFile(pidfile)
+  if pid == 0 or not utils.IsProcessAlive(pid): # no file or dead pid
+    logging.debug("Daemon '%s' not alive, trying to restart", daemon)
+    result = utils.RunCmd([daemon])
+    if result.failed: # test the failure flag, as StartMaster does
+      logging.error("Can't start daemon '%s', failure %s, output: %s",
+                    daemon, result.fail_reason, result.output)
+
+
 class WatcherState(object):
   """Interface to a state file recording restart attempts.
 
@@ -464,6 +478,10 @@ def main(): update_file = False try: + # on master or not, try to start the node daemon (use _PID but is + # the same as daemon name) + EnsureDaemon(constants.NODED_PID) + notepad = WatcherState() try: try: @@ -482,6 +500,9 @@ def main(): # else retry the connection client = cli.GetClient() + # we are on master now (use _PID but is the same as daemon name) + EnsureDaemon(constants.RAPI_PID) + try: watcher = Watcher(options, notepad) except errors.ConfigurationError: -- GitLab