diff --git a/daemons/ganeti-masterd b/daemons/ganeti-masterd index b9e62c76130d958fbb0de38c3d701bb0b5f5bb40..5a20e1f02d6480adcad33462d639ad806cbb0377 100755 --- a/daemons/ganeti-masterd +++ b/daemons/ganeti-masterd @@ -397,6 +397,10 @@ def ParseOptions(): help="Do not check that the nodes agree on this node" " being the master and start the daemon unconditionally", default=False, action="store_true") + parser.add_option("--yes-do-it", dest="yes_do_it", + help="Override interactive check for --no-voting", + default=False, action="store_true") + options, args = parser.parse_args() return options, args @@ -473,7 +477,7 @@ def main(): ssconf.CheckMaster(options.debug) # we believe we are the master, let's ask the other nodes... - if options.no_voting: + if options.no_voting and not options.yes_do_it: sys.stdout.write("The 'no voting' option has been selected.\n") sys.stdout.write("This is dangerous, please confirm by" " typing uppercase 'yes': ") @@ -482,7 +486,7 @@ def main(): if confirmation != "YES": print "Aborting." return - else: + elif not options.no_voting: if not CheckAgreement(): return diff --git a/lib/bootstrap.py b/lib/bootstrap.py index d569e8d60c09ad991add64f4929c4bd8b2906332..18db894671f0bf53e7993ac56ae60983a8cb9ec0 100644 --- a/lib/bootstrap.py +++ b/lib/bootstrap.py @@ -376,13 +376,17 @@ def SetupNodeDaemon(cluster_name, node, ssh_key_check): (node, result.fail_reason, result.output)) -def MasterFailover(): +def MasterFailover(no_voting=False): """Failover the master node. This checks that we are not already the master, and will cause the current master to cease being master, and the non-master to become new master. 
+ @type no_voting: boolean + @param no_voting: force the operation without remote nodes agreement + (dangerous) + """ sstore = ssconf.SimpleStore() @@ -404,18 +408,20 @@ def MasterFailover(): " master candidates is:\n" "%s" % ('\n'.join(mc_no_master))) - vote_list = GatherMasterVotes(node_list) - - if vote_list: - voted_master = vote_list[0][0] - if voted_master is None: - raise errors.OpPrereqError("Cluster is inconsistent, most nodes did not" - " respond.") - elif voted_master != old_master: - raise errors.OpPrereqError("I have wrong configuration, I believe the" - " master is %s but the other nodes voted for" - " %s. Please resync the configuration of" - " this node." % (old_master, voted_master)) + if not no_voting: + vote_list = GatherMasterVotes(node_list) + + if vote_list: + voted_master = vote_list[0][0] + if voted_master is None: + raise errors.OpPrereqError("Cluster is inconsistent, most nodes did" + " not respond.") + elif voted_master != old_master: + raise errors.OpPrereqError("I have a wrong configuration, I believe" + " the master is %s but the other nodes" + " voted %s. Please resync the configuration" + " of this node." % + (old_master, voted_master)) # end checks rcode = 0 @@ -439,7 +445,8 @@ def MasterFailover(): # cluster info cfg.Update(cluster_info) - result = rpc.RpcRunner.call_node_start_master(new_master, True) + # 2.0.X: Don't start the master if no_voting is true + result = rpc.RpcRunner.call_node_start_master(new_master, not no_voting) if result.failed or not result.data: logging.error("Could not start the master role on the new master" " %s, please check", new_master) diff --git a/man/gnt-cluster.sgml b/man/gnt-cluster.sgml index e3fecbf35f2833b41e23a455465f3edd0797dbd7..98ec7db76eb603337294d680669b7b7991f990f4 100644 --- a/man/gnt-cluster.sgml +++ b/man/gnt-cluster.sgml @@ -442,11 +442,32 @@ <cmdsynopsis> <command>masterfailover</command> + <arg>--no-voting</arg> </cmdsynopsis> <para> Failover the master role to the current node. 
</para> + + <para> + The <option>--no-voting</option> option skips the remote node agreement + checks. This is dangerous, but necessary in some cases (for example + failing over the master role in a 2-node cluster with the original master + down). If the original master then comes up, it won't be able to start + its master daemon because it won't have enough votes, but neither will the + new master, if the master daemon ever needs a restart. You can pass + --no-voting to ganeti-masterd on the new master to solve this problem, + and run gnt-cluster redist-conf to make sure the cluster is consistent again. + </para> + + <para> + In version 2.0.X ganeti-masterd will not be able to start if + masterfailover is called with the --no-voting option (which, again, + should only be used on 2-node clusters with the former master being + down). In that case just start it manually passing --no-voting to it + as well, until you have restored cluster redundancy. + </para> + </refsect2> <refsect2> diff --git a/scripts/gnt-cluster b/scripts/gnt-cluster index 99cab31f28a71dbf7ae616dfb24bbf7d9c899cc8..29990f54c08985c386b1385a18b2481c1a8fa40e 100755 --- a/scripts/gnt-cluster +++ b/scripts/gnt-cluster @@ -424,7 +424,15 @@ def MasterFailover(opts, args): @return: the desired exit code """ - return bootstrap.MasterFailover() + if opts.no_voting: + usertext = ("This will perform the failover even if most other nodes" + " are down, or if this node is outdated. This is dangerous" + " as it can lead to a non-consistent cluster. Check the" + " gnt-cluster(8) man page before proceeding. 
Continue?") + if not AskUser(usertext): + return 1 + + return bootstrap.MasterFailover(no_voting=opts.no_voting) def SearchTags(opts, args): @@ -613,7 +621,12 @@ commands = { "", "Does a check on the cluster configuration"), 'verify-disks': (VerifyDisks, ARGS_NONE, [DEBUG_OPT], "", "Does a check on the cluster disk status"), - 'masterfailover': (MasterFailover, ARGS_NONE, [DEBUG_OPT], + 'masterfailover': (MasterFailover, ARGS_NONE, [DEBUG_OPT, + make_option("--no-voting", dest="no_voting", + help="Skip node agreement check (dangerous)", + action="store_true", + default=False,), + ], "", "Makes the current node the master"), 'version': (ShowClusterVersion, ARGS_NONE, [DEBUG_OPT], "", "Shows the cluster version"),