Commit 313b2dd4 authored by Michael Hanselmann
Browse files

gnt-cluster verify: Warn if node time diverges too far



The warning will be generated if the clocks diverge by more
than 150 seconds. Due to the way the RPC system works, we
cannot get exact time differences, e.g. if one of the
queried nodes is broken. The comparison is done using a
time window.

Confd queries will fail if the clocks on the client and server
differ by more than 300 seconds. This check helps keep at least
the nodes of a cluster in sync.
Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>
parent 94c2ed34
......@@ -551,6 +551,10 @@ def VerifyNode(what, cluster_name):
tmpr.append("The procfs filesystem doesn't seem to be mounted"
" under /proc, missing required directory /proc/sys and"
" the file /proc/sysrq-trigger")
if constants.NV_TIME in what:
result[constants.NV_TIME] = utils.SplitTime(time.time())
return result
......
......@@ -944,6 +944,7 @@ class LUVerifyCluster(LogicalUnit):
ENODESSH = (TNODE, "ENODESSH")
ENODEVERSION = (TNODE, "ENODEVERSION")
ENODESETUP = (TNODE, "ENODESETUP")
ENODETIME = (TNODE, "ENODETIME")
ETYPE_FIELD = "code"
ETYPE_ERROR = "ERROR"
......@@ -1326,14 +1327,23 @@ class LUVerifyCluster(LogicalUnit):
constants.NV_VERSION: None,
constants.NV_HVINFO: self.cfg.GetHypervisorType(),
constants.NV_NODESETUP: None,
constants.NV_TIME: None,
}
if vg_name is not None:
node_verify_param[constants.NV_VGLIST] = None
node_verify_param[constants.NV_LVLIST] = vg_name
node_verify_param[constants.NV_PVLIST] = [vg_name]
node_verify_param[constants.NV_DRBDLIST] = None
# Due to the way our RPC system works, exact response times cannot be
# guaranteed (e.g. a broken node could run into a timeout). By keeping the
# time before and after executing the request, we can at least have a time
# window.
nvinfo_starttime = time.time()
all_nvinfo = self.rpc.call_node_verify(nodelist, node_verify_param,
self.cfg.GetClusterName())
nvinfo_endtime = time.time()
cluster = self.cfg.GetClusterInfo()
master_node = self.cfg.GetMasterNode()
......@@ -1380,6 +1390,7 @@ class LUVerifyCluster(LogicalUnit):
else:
instance = instanceinfo[instance]
node_drbd[minor] = (instance.name, instance.admin_up)
self._VerifyNode(node_i, file_names, local_checksums,
nresult, master_files, node_drbd, vg_name)
......@@ -1413,6 +1424,27 @@ class LUVerifyCluster(LogicalUnit):
if test:
continue
# Node time
ntime = nresult.get(constants.NV_TIME, None)
try:
ntime_merged = utils.MergeTime(ntime)
except (ValueError, TypeError):
_ErrorIf(test, self.ENODETIME, node, "Node returned invalid time")
if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
ntime_diff = abs(nvinfo_starttime - ntime_merged)
elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
ntime_diff = abs(ntime_merged - nvinfo_endtime)
else:
ntime_diff = None
_ErrorIf(ntime_diff is not None, self.ENODETIME, node,
"Node time diverges by at least %0.1fs from master node time",
ntime_diff)
if ntime_diff is not None:
continue
try:
node_info[node] = {
"mfree": int(nodeinfo['memory_free']),
......
......@@ -320,7 +320,7 @@ DEFAULT_MAC_PREFIX = "aa:00:00"
LVM_STRIPECOUNT = _autoconf.LVM_STRIPECOUNT
# default maximum instance wait time, in seconds.
DEFAULT_SHUTDOWN_TIMEOUT = 120
# maximum tolerated difference between a node's clock and the master's
# request time window during cluster verification, in seconds.
NODE_MAX_CLOCK_SKEW = 150
# RPC constants
(RPC_ENCODING_NONE,
......@@ -526,6 +526,7 @@ NV_LVLIST = "lvlist"
NV_PVLIST = "pvlist"
NV_DRBDLIST = "drbd-list"
NV_NODESETUP = "nodesetup"
NV_TIME = "time"
# Allocator framework constants
IALLOCATOR_VERSION = 2
......@@ -722,7 +723,7 @@ CONFD_ERROR_ARGUMENT = 3
# Each request is "salted" by the current timestamp.
# This constant decides how many seconds of skew to accept. It is derived
# from NODE_MAX_CLOCK_SKEW so that the two limits cannot drift apart.
# TODO: make this a default and allow the value to be more configurable
CONFD_MAX_CLOCK_SKEW = 2 * NODE_MAX_CLOCK_SKEW
# When we haven't reloaded the config for more than this amount of seconds, we
# force a test to see if inotify is betraying us.
......
......@@ -60,6 +60,10 @@ class TestConstants(unittest.TestCase):
self.failUnless(constants.LDS_OKAY < constants.LDS_UNKNOWN)
self.failUnless(constants.LDS_UNKNOWN < constants.LDS_FAULTY)
def testClockSkew(self):
  """Node clock skew limit must stay below 80% of the confd skew limit."""
  confd_margin = 0.8 * constants.CONFD_MAX_CLOCK_SKEW
  self.failUnless(constants.NODE_MAX_CLOCK_SKEW < confd_margin)
class TestParameterNames(unittest.TestCase):
"""HV/BE parameter tests"""
......@@ -81,7 +85,7 @@ class TestConfdConstants(unittest.TestCase):
def testFourCc(self):
  """The confd magic FourCC must have a length of exactly 4."""
  # The message is given once only; a second adjacent literal would be
  # implicitly concatenated and double the failure text.
  self.failUnlessEqual(len(constants.CONFD_MAGIC_FOURCC), 4,
                       "Invalid fourcc len, should be 4")
def _IsUniqueSequence(self, sequence):
seen = set()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment