Commit ff699aa9 authored by Michael Hanselmann
Browse files

gnt-cluster master-failover: Undrain queue



- Move functions for drain status (tracked via file) from jqueue to jstore
- Undrain queue on master failover if necessary
- Add QA test
Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
parent b12d5e2e
......@@ -43,6 +43,7 @@ from ganeti import bdev
from ganeti import netutils
from ganeti import backend
from ganeti import luxi
from ganeti import jstore
# ec_id for InitConfig's temporary reservation manager
......@@ -660,6 +661,10 @@ def MasterFailover(no_voting=False):
" continuing but activating the master on the current"
" node will probably fail", total_timeout)
if jstore.CheckDrainFlag():
logging.info("Undraining job queue")
jstore.SetDrainFlag(False)
logging.info("Starting the master daemons on the new master")
result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
......
......@@ -29,7 +29,6 @@ used by all other classes in this module.
"""
import os
import logging
import errno
import re
......@@ -1230,7 +1229,7 @@ class JobQueue(object):
self._queue_size = 0
self._UpdateQueueSizeUnlocked()
self._drained = self._IsQueueMarkedDrain()
self._drained = jstore.CheckDrainFlag()
# Setup worker pool
self._wpool = _JobQueueWorkerPool(self)
......@@ -1629,19 +1628,6 @@ class JobQueue(object):
logging.exception("Can't load/parse job %s", job_id)
return None
@staticmethod
def _IsQueueMarkedDrain():
  """Check whether the queue is marked for draining.

  The marker is the queue drain file, so the flag is kept per node. In the
  future this can be moved to the config file.

  @rtype: boolean
  @return: True if the job queue is marked for draining

  """
  drain_file = constants.JOB_QUEUE_DRAIN_FILE
  return os.path.exists(drain_file)
def _UpdateQueueSizeUnlocked(self):
"""Update the queue size.
......@@ -1657,13 +1643,7 @@ class JobQueue(object):
@param drain_flag: Whether to set or unset the drain flag
"""
getents = runtime.GetEnts()
if drain_flag:
utils.WriteFile(constants.JOB_QUEUE_DRAIN_FILE, data="", close=True,
uid=getents.masterd_uid, gid=getents.masterd_gid)
else:
utils.RemoveFile(constants.JOB_QUEUE_DRAIN_FILE)
jstore.SetDrainFlag(drain_flag)
self._drained = drain_flag
......
......@@ -22,6 +22,7 @@
"""Module implementing the job queue handling."""
import errno
import os
from ganeti import constants
from ganeti import errors
......@@ -134,3 +135,36 @@ def InitAndVerifyQueue(must_lock):
raise
return queue_lock
def CheckDrainFlag():
  """Tell whether the job queue is marked to be drained.

  The marker is the queue drain file, so the flag is kept per node. In the
  future this can be moved to the config file.

  @rtype: boolean
  @return: True if the drain file exists, i.e. the queue is marked drained

  """
  drain_file = constants.JOB_QUEUE_DRAIN_FILE
  return os.path.exists(drain_file)
def SetDrainFlag(drain_flag):
  """Set or clear the drain flag of the job queue.

  Setting the flag writes an empty queue drain file owned by the master
  daemon's user and group; clearing it removes that file.

  @type drain_flag: boolean
  @param drain_flag: Whether to set or unset the drain flag
  @attention: This function should only be called by the current holder of
    the queue lock

  """
  ents = runtime.GetEnts()
  drain_file = constants.JOB_QUEUE_DRAIN_FILE

  if not drain_flag:
    utils.RemoveFile(drain_file)
  else:
    utils.WriteFile(drain_file, data="",
                    uid=ents.masterd_uid, gid=ents.masterd_gid)

  # Postcondition: the file on disk must now reflect the requested state
  assert CheckDrainFlag() == bool(drain_flag)
......@@ -167,6 +167,8 @@ def RunClusterTests():
("cluster-command", qa_cluster.TestClusterCommand),
("cluster-burnin", qa_cluster.TestClusterBurnin),
("cluster-master-failover", qa_cluster.TestClusterMasterFailover),
("cluster-master-failover",
qa_cluster.TestClusterMasterFailoverWithDrainedQueue),
("cluster-oob", qa_cluster.TestClusterOob),
("rapi", qa_rapi.TestVersion),
("rapi", qa_rapi.TestEmptyCluster),
......
......@@ -385,11 +385,42 @@ def TestClusterMasterFailover():
cmd = ["gnt-cluster", "master-failover"]
try:
AssertCommand(cmd, node=failovermaster)
# Back to original master node
AssertCommand(cmd, node=master)
finally:
qa_config.ReleaseNode(failovermaster)
def TestClusterMasterFailoverWithDrainedQueue():
  """gnt-cluster master-failover with drained queue

  Creates the queue drain file on the failover candidate, fails the master
  over to it and verifies that the drain file is gone afterwards, then fails
  back to the original master and verifies once more.

  """
  drain_check = ["test", "-f", constants.JOB_QUEUE_DRAIN_FILE]

  master = qa_config.GetMasterNode()
  failovermaster = qa_config.AcquireNode(exclude=master)

  cmd = ["gnt-cluster", "master-failover"]
  try:
    # The preparation steps run inside the try block so that the acquired
    # node is released even if one of them fails

    # Ensure queue is not drained
    for node in [master, failovermaster]:
      AssertCommand(drain_check, node=node, fail=True)

    # Drain queue on failover master
    AssertCommand(["touch", constants.JOB_QUEUE_DRAIN_FILE],
                  node=failovermaster)

    # The drain file must exist before the failover and be gone after it
    AssertCommand(drain_check, node=failovermaster)
    AssertCommand(cmd, node=failovermaster)
    AssertCommand(drain_check, fail=True)
    AssertCommand(drain_check, node=failovermaster, fail=True)

    # Back to original master node
    AssertCommand(cmd, node=master)
  finally:
    qa_config.ReleaseNode(failovermaster)

  # Ensure queue is still not drained after failing back
  AssertCommand(drain_check, fail=True)
  AssertCommand(drain_check, node=failovermaster, fail=True)
def TestClusterCopyfile():
"""gnt-cluster copyfile"""
master = qa_config.GetMasterNode()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment