From 7dfb83c28ac302753e9125afe0d8eb01222032d8 Mon Sep 17 00:00:00 2001 From: Iustin Pop <iustin@google.com> Date: Tue, 19 May 2009 13:23:31 +0200 Subject: [PATCH] watcher: try to restart the master if down Bugs in either our code or in associated libraries can bring the master daemon down, and this (due to the 2.0 architecture) stops all work on the cluster. Since the watcher already does periodic checks on the cluster, we modify it to try to start the master automatically in case of failures to connect. This will be tried only once per cycle. Also, in this case, we modify the code so that the watcher status file is not updated - its timestamp will thus reflect the time of the last successful connection to the master. Side note: the except errors.ConfigurationError part could be cleaned up, since in 2.0 we don't usually get that directly, and if we do it's an error and we shouldn't touch the file anyway; but that is not an rc5 change. Signed-off-by: Iustin Pop <iustin@google.com> --- daemons/ganeti-watcher | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher index 8745f2dd7..42a2eaf83 100755 --- a/daemons/ganeti-watcher +++ b/daemons/ganeti-watcher @@ -39,6 +39,7 @@ from ganeti import serializer from ganeti import errors from ganeti import opcodes from ganeti import cli +from ganeti import luxi MAXTRIES = 5 @@ -69,6 +70,16 @@ def Indent(s, prefix='| '): return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines())) +def StartMaster(): + """Try to start the master daemon. + + """ + result = utils.RunCmd(['ganeti-masterd']) + if result.failed: + logging.error("Can't start the master daemon: output '%s'", result.output) + return not result.failed + + class WatcherState(object): """Interface to a state file recording restart attempts. 
@@ -441,6 +452,7 @@ def main(): utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug, stderr_logging=options.debug) + update_file = True try: notepad = WatcherState() try: @@ -448,7 +460,17 @@ def main(): client = cli.GetClient() except errors.OpPrereqError: # this is, from cli.GetClient, a not-master case + logging.debug("Not on master, exiting") sys.exit(constants.EXIT_SUCCESS) + except luxi.NoMasterError, err: + logging.warning("Master seems to be down (%s), trying to restart", + str(err)) + if not StartMaster(): + logging.critical("Can't start the master, exiting") + update_file = False + sys.exit(constants.EXIT_FAILURE) + # else retry the connection + client = cli.GetClient() try: watcher = Watcher(options, notepad) @@ -458,7 +480,10 @@ def main(): watcher.Run() finally: - notepad.Save() + if update_file: + notepad.Save() + else: + logging.debug("Not updating status file due to failure") except SystemExit: raise except NotMasterError: -- GitLab