Commit 8e2524c3 authored by Guido Trotter
Browse files

Create a new --no-voting option for masterfailover



This allows failing over in certain corner cases, such as a 2 node
cluster with one node down. The man page is also updated to document
this dangerous option and how to recover from this situation.
Signed-off-by: Guido Trotter <ultrotter@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
parent 5e96d216
......@@ -373,13 +373,17 @@ def SetupNodeDaemon(cluster_name, node, ssh_key_check):
(node, result.fail_reason, result.output))
def MasterFailover():
def MasterFailover(no_voting=False):
"""Failover the master node.
This checks that we are not already the master, and will cause the
current master to cease being master, and the non-master to become
new master.
@type no_voting: boolean
@param no_voting: force the operation without remote nodes agreement
(dangerous)
"""
sstore = ssconf.SimpleStore()
......@@ -401,18 +405,20 @@ def MasterFailover():
" master candidates is:\n"
"%s" % ('\n'.join(mc_no_master)))
vote_list = GatherMasterVotes(node_list)
if vote_list:
voted_master = vote_list[0][0]
if voted_master is None:
raise errors.OpPrereqError("Cluster is inconsistent, most nodes did not"
" respond.")
elif voted_master != old_master:
raise errors.OpPrereqError("I have wrong configuration, I believe the"
" master is %s but the other nodes voted for"
" %s. Please resync the configuration of"
" this node." % (old_master, voted_master))
if not no_voting:
vote_list = GatherMasterVotes(node_list)
if vote_list:
voted_master = vote_list[0][0]
if voted_master is None:
raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
" not respond.")
elif voted_master != old_master:
raise errors.OpPrereqError("I have a wrong configuration, I believe"
" the master is %s but the other nodes"
" voted %s. Please resync the configuration"
" of this node." %
(old_master, voted_master))
# end checks
rcode = 0
......@@ -436,7 +442,8 @@ def MasterFailover():
# cluster info
cfg.Update(cluster_info)
result = rpc.RpcRunner.call_node_start_master(new_master, True)
# 2.0.X: Don't start the master if no_voting is true
result = rpc.RpcRunner.call_node_start_master(new_master, not no_voting)
if result.failed or not result.data:
logging.error("Could not start the master role on the new master"
" %s, please check", new_master)
......
......@@ -442,11 +442,32 @@
<cmdsynopsis>
<command>masterfailover</command>
<arg>--no-voting</arg>
</cmdsynopsis>
<para>
Failover the master role to the current node.
</para>
<para>
The <option>--no-voting</option> option skips the remote node agreement
checks. This is dangerous, but necessary in some cases (for example
failing over the master role in a 2-node cluster with the original master
down). If the original master then comes up, it won't be able to start
its master daemon because it won't have enough votes, but neither will the
new master if its master daemon ever needs a restart. You can pass
--no-voting to ganeti-masterd on the new master to solve this problem,
and run gnt-cluster redist-conf to make sure the cluster is consistent again.
</para>
<para>
In version 2.0.X ganeti-masterd will not be able to start if
masterfailover is called with the --no-voting option (which, again,
should only be used on 2-node clusters with the former master being
down). In that case, start it manually, passing --no-voting to it
as well, until you have restored cluster redundancy.
</para>
</refsect2>
<refsect2>
......
......@@ -424,7 +424,15 @@ def MasterFailover(opts, args):
@return: the desired exit code
"""
return bootstrap.MasterFailover()
if opts.no_voting:
usertext = ("This will perform the failover even if most other nodes"
" are down, or if this node is outdated. This is dangerous"
" as it can lead to a non-consistent cluster. Check the"
" gnt-cluster(8) man page before proceeding. Continue?")
if not AskUser(usertext):
return 1
return bootstrap.MasterFailover(no_voting=opts.no_voting)
def SearchTags(opts, args):
......@@ -613,7 +621,12 @@ commands = {
"", "Does a check on the cluster configuration"),
'verify-disks': (VerifyDisks, ARGS_NONE, [DEBUG_OPT],
"", "Does a check on the cluster disk status"),
'masterfailover': (MasterFailover, ARGS_NONE, [DEBUG_OPT],
'masterfailover': (MasterFailover, ARGS_NONE, [DEBUG_OPT,
make_option("--no-voting", dest="no_voting",
help="Skip node agreement check (dangerous)",
action="store_true",
default=False,),
],
"", "Makes the current node the master"),
'version': (ShowClusterVersion, ARGS_NONE, [DEBUG_OPT],
"", "Shows the cluster version"),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment