From db1473059f58e9df934a31e6c1a70264f490e8aa Mon Sep 17 00:00:00 2001
From: Tom Limoncelli <tlim@google.com>
Date: Wed, 2 Jun 2010 11:06:37 -0400
Subject: [PATCH] ganeti-watcher should attempt to fix ganeti-rapi

Update ganeti-watcher so that it tests the master's RAPI port with a
simple test (in this case GetVersion). If it fails, make one attempt at
restarting ganeti-rapi and retest.

- daemons/ganeti-watcher: Test rapi and make one attempt at restarting it.
- lib/utils.py: add StopDaemon() function.

Signed-off-by: Tom Limoncelli <tlim@google.com>
Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Michael Hanselmann <hansmi@google.com>
---
Reviewer notes (this section is ignored when the patch is applied):
- main(): "Successfully talked to RAPI." used to be logged even when the
  second probe had failed, right after the fatal message. The probe result
  is now kept in a local and success is logged only if a probe answered.
- StopDaemon(): docstring expanded (parameter and return value).
- NOTE(review): line breaks and indentation were restored from a
  whitespace-collapsed copy, assuming two-space indentation. Hunk line
  counts and the diffstat were recomputed; the blob hashes on the "index"
  lines were not. Verify against the tree before applying.

 daemons/ganeti-watcher | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/utils.py           | 19 +++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 1f82db8b9..b0d924d43 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -47,8 +47,11 @@ from ganeti import luxi
 from ganeti import ssconf
 from ganeti import bdev
 from ganeti import hypervisor
+from ganeti import rapi
 from ganeti.confd import client as confd_client
 
+import ganeti.rapi.client # pylint: disable-msg=W0611
+
 
 MAXTRIES = 5
 BAD_STATES = ['ERROR_down']
@@ -595,6 +598,34 @@ def OpenStateFile(path):
   return os.fdopen(statefile_fd, "w+")
 
 
+def IsRapiResponding(hostname):
+  """Connects to RAPI port and does a simple test.
+
+  Connects to RAPI port of hostname and does a simple test. At this time, the
+  test is GetVersion.
+
+  @type hostname: string
+  @param hostname: hostname of the node to connect to.
+  @rtype: bool
+  @return: Whether RAPI is working properly
+
+  """
+  ssl_config = rapi.client.CertAuthorityVerify(constants.RAPI_CERT_FILE)
+  rapi_client = \
+    rapi.client.GanetiRapiClient(hostname,
+                                 config_ssl_verification=ssl_config)
+  try:
+    master_version = rapi_client.GetVersion()
+  except rapi.client.CertificateError, err:
+    logging.warning("RAPI Error: CertificateError (%s)", err)
+    return False
+  except rapi.client.GanetiApiError, err:
+    logging.warning("RAPI Error: GanetiApiError (%s)", err)
+    return False
+  logging.debug("RAPI Result: master_version is %s", master_version)
+  return master_version == constants.RAPI_VERSION
+
+
 def ParseOptions():
   """Parse the command line options.
 
@@ -668,6 +699,21 @@ def main():
       # we are on master now
       utils.EnsureDaemon(constants.RAPI)
 
+      # If RAPI isn't responding to queries, try one restart.
+      logging.debug("Attempting to talk with RAPI.")
+      rapi_ok = IsRapiResponding(constants.LOCALHOST_IP_ADDRESS)
+      if not rapi_ok:
+        logging.warning("Couldn't get answer from Ganeti RAPI daemon."
+                        " Restarting Ganeti RAPI.")
+        utils.StopDaemon(constants.RAPI)
+        utils.EnsureDaemon(constants.RAPI)
+        logging.debug("Second attempt to talk with RAPI")
+        rapi_ok = IsRapiResponding(constants.LOCALHOST_IP_ADDRESS)
+      if rapi_ok:
+        logging.debug("Successfully talked to RAPI.")
+      else:
+        logging.fatal("RAPI is not responding. Please investigate.")
+
       try:
         watcher = Watcher(options, notepad)
       except errors.ConfigurationError:
diff --git a/lib/utils.py b/lib/utils.py
index 70f5b41fa..a1fd259e0 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -2261,6 +2261,25 @@ def EnsureDaemon(name):
   return True
 
 
+def StopDaemon(name):
+  """Stop a daemon by running C{constants.DAEMON_UTIL} with C{stop}.
+
+  @type name: string
+  @param name: daemon name, passed as the argument after C{stop}
+  @rtype: bool
+  @return: False if the command failed (the failure reason and output are
+      logged as an error), True otherwise
+
+  """
+  result = RunCmd([constants.DAEMON_UTIL, "stop", name])
+  if result.failed:
+    logging.error("Can't stop daemon '%s', failure %s, output: %s",
+                  name, result.fail_reason, result.output)
+    return False
+
+  return True
+
+
 def WritePidFile(name):
   """Write the current process pidfile.
 
-- 
GitLab