From 313b2dd43c1a958abdab798558a85b3d3bc8aa2e Mon Sep 17 00:00:00 2001 From: Michael Hanselmann <hansmi@google.com> Date: Mon, 30 Nov 2009 14:09:28 +0100 Subject: [PATCH] gnt-cluster verify: Warn if node time diverges too far The warning will be generated if the clocks diverge by more than 150 seconds. Due to the way the RPC system works, we cannot get exact time differences, e.g. if one of the queried nodes is broken. The comparison is done using a time window. Confd queries will fail if the clocks on the client and server are more than 300 seconds from each other. This check helps keep at least the nodes of a cluster in sync. Signed-off-by: Michael Hanselmann <hansmi@google.com> Reviewed-by: Guido Trotter <ultrotter@google.com> --- lib/backend.py | 4 ++++ lib/cmdlib.py | 32 +++++++++++++++++++++++++++++++ lib/constants.py | 5 +++-- test/ganeti.constants_unittest.py | 6 +++++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/lib/backend.py b/lib/backend.py index f5f258bd2..cce99aba7 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -551,6 +551,10 @@ def VerifyNode(what, cluster_name): tmpr.append("The procfs filesystem doesn't seem to be mounted" " under /proc, missing required directory /proc/sys and" " the file /proc/sysrq-trigger") + + if constants.NV_TIME in what: + result[constants.NV_TIME] = utils.SplitTime(time.time()) + return result diff --git a/lib/cmdlib.py b/lib/cmdlib.py index c149a229d..9bbc19655 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -944,6 +944,7 @@ class LUVerifyCluster(LogicalUnit): ENODESSH = (TNODE, "ENODESSH") ENODEVERSION = (TNODE, "ENODEVERSION") ENODESETUP = (TNODE, "ENODESETUP") + ENODETIME = (TNODE, "ENODETIME") ETYPE_FIELD = "code" ETYPE_ERROR = "ERROR" @@ -1326,14 +1327,23 @@ class LUVerifyCluster(LogicalUnit): constants.NV_VERSION: None, constants.NV_HVINFO: self.cfg.GetHypervisorType(), constants.NV_NODESETUP: None, + constants.NV_TIME: None, } + if vg_name is not None: 
node_verify_param[constants.NV_VGLIST] = None node_verify_param[constants.NV_LVLIST] = vg_name node_verify_param[constants.NV_PVLIST] = [vg_name] node_verify_param[constants.NV_DRBDLIST] = None + + # Due to the way our RPC system works, exact response times cannot be + # guaranteed (e.g. a broken node could run into a timeout). By keeping the + # time before and after executing the request, we can at least have a time + # window. + nvinfo_starttime = time.time() all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param, self.cfg.GetClusterName()) + nvinfo_endtime = time.time() cluster = self.cfg.GetClusterInfo() master_node = self.cfg.GetMasterNode() @@ -1380,6 +1390,7 @@ class LUVerifyCluster(LogicalUnit): else: instance = instanceinfo[instance] node_drbd[minor] = (instance.name, instance.admin_up) + self._VerifyNode(node_i, file_names, local_checksums, nresult, master_files, node_drbd, vg_name) @@ -1413,6 +1424,27 @@ class LUVerifyCluster(LogicalUnit): if test: continue + # Node time + ntime = nresult.get(constants.NV_TIME, None) + try: + ntime_merged = utils.MergeTime(ntime) + except (ValueError, TypeError): + _ErrorIf(test, self.ENODETIME, node, "Node returned invalid time") + + if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW): + ntime_diff = abs(nvinfo_starttime - ntime_merged) + elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW): + ntime_diff = abs(ntime_merged - nvinfo_endtime) + else: + ntime_diff = None + + _ErrorIf(ntime_diff is not None, self.ENODETIME, node, + "Node time diverges by at least %0.1fs from master node time", + ntime_diff) + + if ntime_diff is not None: + continue + try: node_info[node] = { "mfree": int(nodeinfo['memory_free']), diff --git a/lib/constants.py b/lib/constants.py index 0f035ef79..9ff39588c 100644 --- a/lib/constants.py +++ b/lib/constants.py @@ -320,7 +320,7 @@ DEFAULT_MAC_PREFIX = "aa:00:00" LVM_STRIPECOUNT = _autoconf.LVM_STRIPECOUNT # default maximum instance wait time, in 
seconds. DEFAULT_SHUTDOWN_TIMEOUT = 120 - +NODE_MAX_CLOCK_SKEW = 150 # RPC constants (RPC_ENCODING_NONE, @@ -526,6 +526,7 @@ NV_LVLIST = "lvlist" NV_PVLIST = "pvlist" NV_DRBDLIST = "drbd-list" NV_NODESETUP = "nodesetup" +NV_TIME = "time" # Allocator framework constants IALLOCATOR_VERSION = 2 @@ -722,7 +723,7 @@ CONFD_ERROR_ARGUMENT = 3 # Each request is "salted" by the current timestamp. # This constants decides how many seconds of skew to accept. # TODO: make this a default and allow the value to be more configurable -CONFD_MAX_CLOCK_SKEW = 300 +CONFD_MAX_CLOCK_SKEW = 2 * NODE_MAX_CLOCK_SKEW # When we haven't reloaded the config for more than this amount of seconds, we # force a test to see if inotify is betraying us. diff --git a/test/ganeti.constants_unittest.py b/test/ganeti.constants_unittest.py index 219afee84..7f1f0cba7 100755 --- a/test/ganeti.constants_unittest.py +++ b/test/ganeti.constants_unittest.py @@ -60,6 +60,10 @@ class TestConstants(unittest.TestCase): self.failUnless(constants.LDS_OKAY < constants.LDS_UNKNOWN) self.failUnless(constants.LDS_UNKNOWN < constants.LDS_FAULTY) + def testClockSkew(self): + self.failUnless(constants.NODE_MAX_CLOCK_SKEW < + (0.8 * constants.CONFD_MAX_CLOCK_SKEW)) + class TestParameterNames(unittest.TestCase): """HV/BE parameter tests""" @@ -81,7 +85,7 @@ class TestConfdConstants(unittest.TestCase): def testFourCc(self): self.failUnlessEqual(len(constants.CONFD_MAGIC_FOURCC), 4, - "Invalid fourcc len, should be 4") + "Invalid fourcc len, should be 4") def _IsUniqueSequence(self, sequence): seen = set() -- GitLab