Commit b1b6ea87 authored by Iustin Pop's avatar Iustin Pop

Rework master startup/shutdown/failover

This (big) patch reworks the master startup/shutdown and fixes the
master failover.

What does the patch do?

For master start/stop:
  - remove the old ganeti-master script and its associated man page
  - moves the ip start/stop directly into the backend.(Start|Stop)Master
  - adds start/stop of the master/rapi daemon into these functions,
    selectively based on the start/stop arguments
  - makes the master call via rpc StartMaster(start_daemons=False) to
    the local node so that the master IP is started
  - and finally changes the example init.d script to directly start and
    stop all three daemons, since they do the right thing (depending on
    master/not master role)

For master failover:
  - moves the code from LUMasterFailover into bootstrap.MasterFailover,
    since we need to start/stop the master during this operation and
    thus it can't be executed from the master
  - removes the LUMasterFailover and its associated opcode

Notes: ubuntu's /etc/lsb-base-logging.sh is dumb, so the messages 'not
master' are not seen during startup on non-master nodes.

Reviewed-by: ultrotter
parent 53beffbb
......@@ -113,7 +113,6 @@ doc_DATA = \
dist_sbin_SCRIPTS = \
daemons/ganeti-noded \
daemons/ganeti-watcher \
daemons/ganeti-master \
daemons/ganeti-masterd \
daemons/ganeti-rapi \
scripts/gnt-backup \
......@@ -165,7 +164,6 @@ EXTRA_DIST = \
man_MANS = \
man/ganeti.7 \
man/ganeti-master.8 \
man/ganeti-noded.8 \
man/ganeti-os-interface.7 \
man/ganeti-watcher.8 \
......
#!/usr/bin/python
#
# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
"""Ganeti master script
Exit codes, for both start and stop:
- 0: master setup successful
- 1: some generic error (this exit code can also be thrown by exceptions)
- 11: node is not master, nothing to do
- 12: node setup incomplete, cannot start
- 13: node should be master, but someone has the ip address already
Only exit codes 0 and 11 represent an ok state. Code 1 was left for
generic errors as other python code can cause exit with code 1.
"""
import os
import sys
from optparse import OptionParser
from ganeti import constants
from ganeti import errors
from ganeti import ssconf
from ganeti import utils
# Exit codes of this script; 0, 1 and 11-13 are described in the module
# docstring
EXIT_OK = 0
EXIT_SOME_ERROR = 1
# these two values are taken from the constants module
EXIT_NOTMASTER = constants.EXIT_NOTMASTER
EXIT_NODESETUP_ERROR = constants.EXIT_NODESETUP_ERROR
# returned by StartMaster when the master ip answers, but not locally
EXIT_DUPLICATE_IP = 13
# used by ParseOptions when the command line is not "start" or "stop"
EXIT_ARGS_ERROR = 14
def ParseOptions():
  """Parse and validate the command line.

  Exactly one positional argument, either "start" or "stop", is
  accepted; anything else prints a usage line on stderr and exits with
  EXIT_ARGS_ERROR.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti master",
                        usage="%prog [-d]",
                        version=version_string)
  parser.add_option("-d", "--debug", dest="debug",
                    help="Enable some debug messages",
                    default=False, action="store_true")
  options, args = parser.parse_args()

  valid_actions = ("start", "stop")
  if not (len(args) == 1 and args[0] in valid_actions):
    sys.stderr.write("Usage: %s [-d] start|stop\n" % sys.argv[0])
    sys.exit(EXIT_ARGS_ERROR)

  return options, args
def CheckNodeSetup(debug):
  """Check that this node is set up well enough to act on.

  The SSL certificate file must exist, and the simple store must be
  able to provide the node daemon port and password, the master node
  name, the master network device and the master IP.

  Args:
    debug: when true, describe on stderr what is missing

  Returns:
    the tuple (master_hostname, master_netdev, master_ip) if the node
    setup is ok, None otherwise

  """
  for fname in (constants.SSL_CERT_FILE,):
    if not os.path.isfile(fname):
      if debug:
        sys.stderr.write("Missing config file %s.\n" % fname)
      return None

  try:
    ss = ssconf.SimpleStore()
    # the port and password values are not used here; the calls are kept
    # because they are inside the try block (presumably they raise
    # ConfigurationError when the entries are missing -- TODO confirm)
    ss.GetNodeDaemonPort()
    ss.GetNodeDaemonPassword()
    master_name = ss.GetMasterNode()
    master_netdev = ss.GetMasterNetdev()
    master_ip = ss.GetMasterIP()
  except errors.ConfigurationError:
    # fetched via sys.exc_info() so that the handler does not depend on
    # the Python-2-only "except X, err" syntax
    err = sys.exc_info()[1]
    if debug:
      sys.stderr.write("Cluster configuration incomplete: '%s'\n" % str(err))
    return None

  return (master_name, master_netdev, master_ip)
def StartMaster(master_netdev, master_ip, debug):
  """Bring up the master IP and, if enabled, the remote API.

  Args:
    master_netdev: network device the master IP is added to
    master_ip: the master IP address
    debug: when true, report problems on stderr

  Returns:
    EXIT_OK on success or if the IP is already active on this node,
    EXIT_DUPLICATE_IP if the IP answers but not from this node,
    EXIT_SOME_ERROR if the IP could not be added

  """
  noded_port = constants.DEFAULT_NODED_PORT

  if utils.TcpPing(master_ip, noded_port):
    # somebody answers on the master ip; find out whether it is us
    is_local = utils.TcpPing(master_ip, noded_port,
                             source=constants.LOCALHOST_IP_ADDRESS)
    if not is_local:
      return EXIT_DUPLICATE_IP
    if debug:
      sys.stderr.write("Notice: already started.\n")
    return EXIT_OK

  add_cmd = ["ip", "address", "add", "%s/32" % master_ip,
             "dev", master_netdev, "label",
             "%s:0" % master_netdev]
  add_result = utils.RunCmd(add_cmd)
  if add_result.failed:
    if debug:
      sys.stderr.write("Can't activate master IP: %s\n" % add_result.output)
    return EXIT_SOME_ERROR

  # the exit code of arping is deliberately ignored
  utils.RunCmd(["arping", "-q", "-U", "-c 3", "-I", master_netdev,
                "-s", master_ip, master_ip])

  if constants.RAPI_ENABLE:
    # Start remote API; a failure here is only reported, not returned
    rapi_result = utils.RunCmd(["ganeti-rapi",
                                "--port=%s" % constants.RAPI_PORT])
    if debug and rapi_result.failed:
      sys.stderr.write("Failed to start ganeti-rapi, error: %s\n" %
                       rapi_result.output)

  return EXIT_OK
def StopMaster(master_netdev, master_ip, debug):
  """Stop the remote API (if enabled) and remove the master IP.

  Failures are only reported (when debug is set), never returned.

  Args:
    master_netdev: network device the master IP is removed from
    master_ip: the master IP address
    debug: when true, report problems on stderr

  Returns:
    always EXIT_OK

  """
  if constants.RAPI_ENABLE:
    # Stop remote API
    result = utils.RunCmd(["fuser", "-k", "-TERM", "-n", "tcp",
                           str(constants.RAPI_PORT)])
    if debug and result.failed:
      sys.stderr.write("Failed to stop ganeti-rapi, error: %s\n" %
                       result.output)

  result = utils.RunCmd(["ip", "address", "del", "%s/32" % master_ip,
                         "dev", master_netdev])
  if result.failed:
    if debug:
      # newline-terminated like every other message of this script
      sys.stderr.write("Can't remove the master IP, error: %s\n" %
                       result.output)
    # but otherwise ignore the failure

  return EXIT_OK
def main():
  """Run the requested master action and exit with its status.

  Every path terminates through sys.exit(), so that the exit codes
  listed in the module docstring actually reach the calling process
  instead of being returned to a caller that ignores them.

  """
  options, args = ParseOptions()
  debug = options.debug
  action = args[0]

  try:
    myself = utils.HostInfo()
  except errors.ResolverError:
    # fetched via sys.exc_info() so that the handler does not depend on
    # the Python-2-only "except X, err" syntax
    err = sys.exc_info()[1]
    sys.stderr.write("Cannot resolve my own name (%s)\n" % err.args[0])
    sys.exit(EXIT_NODESETUP_ERROR)

  result = CheckNodeSetup(debug)
  if not result:
    if debug:
      sys.stderr.write("Node configuration incomplete.\n")
    sys.exit(EXIT_NODESETUP_ERROR)

  master_node, master_netdev, master_ip = result
  # a non-master node refuses "start" only; "stop" is always executed
  if myself.name != master_node and action == "start":
    if debug:
      sys.stderr.write("Not master, ignoring request.\n")
    sys.exit(EXIT_NOTMASTER)

  if action == "start":
    fn = StartMaster
  else:
    fn = StopMaster

  sys.exit(fn(master_netdev, master_ip, debug))
if __name__ == '__main__':
  # propagate whatever main() returns as the process exit status
  sys.exit(main())
......@@ -52,6 +52,7 @@ from ganeti import errors
from ganeti import ssconf
from ganeti import logger
from ganeti import workerpool
from ganeti import rpc
CLIENT_REQUEST_WORKERS = 16
......@@ -310,6 +311,11 @@ def main():
logging.info("ganeti master daemon startup")
# activate ip
master_node = ssconf.SimpleStore().GetMasterNode()
if not rpc.call_node_start_master(master_node, False):
logging.error("Can't activate master IP address")
master.setup_queue()
try:
master.serve_forever()
......
......@@ -13,15 +13,22 @@
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin
DESC="Ganeti cluster"
NAME=ganeti-noded
NODED=@PREFIX@/sbin/ganeti-noded
MASTER=@PREFIX@/sbin/ganeti-master
MASTERD_NAME=ganeti-masterd
MASTERD="@PREFIX@/sbin/$MASTERD_NAME"
SCRIPTNAME=@SYSCONFDIR@/init.d/ganeti
GANETIRUNDIR="@LOCALSTATEDIR@/run/ganeti"
NODED_PID="$GANETIRUNDIR/$NAME.pid"
MASTERD_PID="$GANETIRUNDIR/$MASTERD_NAME.pid"
NODED_NAME="ganeti-noded"
NODED="@PREFIX@/sbin/${NODED_NAME}"
NODED_PID="${GANETIRUNDIR}/${NODED_NAME}.pid"
MASTERD_NAME="ganeti-masterd"
MASTERD="@PREFIX@/sbin/${MASTERD_NAME}"
MASTERD_PID="${GANETIRUNDIR}/${MASTERD_NAME}.pid"
RAPI_NAME="ganeti-rapi"
RAPI="@PREFIX@/sbin/${RAPI_NAME}"
RAPI_PID="${GANETIRUNDIR}/${RAPI_NAME}.pid"
SCRIPTNAME="@SYSCONFDIR@/init.d/ganeti"
test -f $NODED || exit 0
......@@ -40,9 +47,8 @@ check_config() {
done
}
master_action() {
log_action_begin_msg "ganeti-master"; $MASTER "$1"
RC=$?
check_exitcode() {
RC=$1
case $RC in
0)
log_action_end_msg 0
......@@ -56,47 +62,45 @@ master_action() {
esac
}
start_action() {
  # usage: start_action DAEMON PIDFILE
  # Announce DAEMON, start it through start-stop-daemon and hand the
  # resulting exit status to check_exitcode.
  log_action_begin_msg "$1"
  start-stop-daemon --start --quiet \
    --pidfile "$2" --exec "$1"
  check_exitcode $?
}
stop_action() {
  # usage: stop_action DAEMON PIDFILE
  # Announce DAEMON, stop the process recorded in PIDFILE (retrying for
  # up to 30, not failing if nothing is running) and hand the resulting
  # exit status to check_exitcode. DAEMON is only used for the message.
  log_action_begin_msg "$1"
  start-stop-daemon --stop --quiet --oknodo --retry 30 --pidfile "$2"
  check_exitcode $?
}
case "$1" in
start)
log_daemon_msg "Starting $DESC" "$NAME"
check_config
if start-stop-daemon --start --quiet --exec $NODED; then
log_end_msg 0
else
log_end_msg 1
fi
master_action start
if start-stop-daemon --start --quiet --exec $MASTERD; then
log_end_msg 0
else
log_end_msg 1
fi
start_action $NODED $NODED_PID
start_action $MASTERD $MASTERD_PID
start_action $RAPI $RAPI_PID
;;
stop)
log_daemon_msg "Stopping $DESC" "$NAME"
if start-stop-daemon --stop --quiet --oknodo -p $NODED_PID; then
log_end_msg 0
else
log_end_msg 1
fi
if start-stop-daemon --stop --quiet --oknodo -p $MASTERD_PID; then
log_end_msg 0
else
log_end_msg 1
fi
master_action stop
stop_action $RAPI $RAPI_PID
stop_action $MASTERD $MASTERD_PID
stop_action $NODED $NODED_PID
;;
restart|force-reload)
log_daemon_msg "Reloading $DESC"
start-stop-daemon --stop --quiet --oknodo --retry 30 -p $NODED_PID
start-stop-daemon --stop --quiet --oknodo --retry 30 -p $MASTERD_PID
stop_action $RAPI $RAPI_PID
stop_action $MASTERD $MASTERD_PID
stop_action $NODED $NODED_PID
check_config
start-stop-daemon --start --quiet --exec $NODED
start-stop-daemon --start --quiet --exec $MASTERD
log_end_msg $?
$MASTER stop
master_action start
start_action $NODED $NODED_PID
start_action $MASTERD $MASTERD_PID
start_action $RAPI $RAPI_PID
;;
*)
log_success_msg "Usage: $SCRIPTNAME {start|stop|force-reload|restart}"
......
......@@ -47,6 +47,20 @@ def _GetSshRunner():
return ssh.SshRunner()
def _GetMasterInfo():
  """Return the master netdev and ip.

  Returns:
    the tuple (master_netdev, master_ip) as read from the simple store,
    or (None, None) if reading them raises errors.ConfigurationError
    (the failure is logged together with its traceback)

  """
  try:
    ss = ssconf.SimpleStore()
    master_netdev = ss.GetMasterNetdev()
    master_ip = ss.GetMasterIP()
  except errors.ConfigurationError:
    # logging.exception() already records the active exception, so it
    # does not need to be bound to a name
    logging.exception("Cluster configuration incomplete")
    return (None, None)
  return (master_netdev, master_ip)
def StartMaster(start_daemons):
"""Activate local node as master node.
......@@ -56,14 +70,39 @@ def StartMaster(start_daemons):
(ganeti-masterd and ganeti-rapi).
"""
result = utils.RunCmd([constants.MASTER_SCRIPT, "-d", "start"])
if result.failed:
logging.error("could not activate cluster interface with command %s,"
" error: '%s'", result.cmd, result.output)
ok = True
master_netdev, master_ip = _GetMasterInfo()
if not master_netdev:
return False
return True
if utils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
if utils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT,
source=constants.LOCALHOST_IP_ADDRESS):
# we already have the ip:
logging.debug("Already started")
else:
logging.error("Someone else has the master ip, not activating")
ok = False
else:
result = utils.RunCmd(["ip", "address", "add", "%s/32" % master_ip,
"dev", master_netdev, "label",
"%s:0" % master_netdev])
if result.failed:
logging.error("Can't activate master IP: %s", result.output)
ok = False
result = utils.RunCmd(["arping", "-q", "-U", "-c 3", "-I", master_netdev,
"-s", master_ip, master_ip])
# we'll ignore the exit code of arping
# and now start the master and rapi daemons
if start_daemons:
for daemon in 'ganeti-masterd', 'ganeti-rapi':
result = utils.RunCmd([daemon])
if result.failed:
logging.error("Can't start daemon %s: %s", daemon, result.output)
ok = False
return ok
def StopMaster(stop_daemons):
......@@ -74,12 +113,20 @@ def StopMaster(stop_daemons):
stop the master daemons (ganeti-masterd and ganeti-rapi).
"""
result = utils.RunCmd([constants.MASTER_SCRIPT, "-d", "stop"])
master_netdev, master_ip = _GetMasterInfo()
if not master_netdev:
return False
result = utils.RunCmd(["ip", "address", "del", "%s/32" % master_ip,
"dev", master_netdev])
if result.failed:
logging.error("could not deactivate cluster interface with command %s,"
" error: '%s'", result.cmd, result.output)
return False
logger.error("Can't remove the master IP, error: %s", result.output)
# but otherwise ignore the failure
if stop_daemons:
# stop/kill the rapi and the master daemon
for daemon in constants.RAPI_PID, constants.MASTERD_PID:
utils.KillProcess(utils.ReadPidFile(utils.DaemonPidFileName(daemon)))
return True
......
......@@ -27,6 +27,7 @@ import os
import os.path
import sha
import re
import logging
from ganeti import rpc
from ganeti import ssh
......@@ -228,12 +229,13 @@ def InitCluster(cluster_name, hypervisor_type, mac_prefix, def_bridge,
ssh.WriteKnownHostsFile(cfg, ss, constants.SSH_KNOWN_HOSTS_FILE)
def SetupNodeDaemon(node):
"""Add a node to the cluster.
This function must be called before the actual opcode, and will ssh to the
remote node, copy the needed files, and start ganeti-noded, allowing the master
to do the rest via normal rpc calls.
This function must be called before the actual opcode, and will ssh
to the remote node, copy the needed files, and start ganeti-noded,
allowing the master to do the rest via normal rpc calls.
Args:
node: fully qualified domain name for the new node
......@@ -278,3 +280,47 @@ def SetupNodeDaemon(node):
return 0
def MasterFailover():
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  new master.

  Returns:
    0 if the master role could be started on this node, 1 otherwise;
    a failure to stop the old master or to distribute the new master
    name is only logged and does not change the return value

  Raises:
    errors.OpPrereqError: if this node is already the master

  """
  ss = ssconf.WritableSimpleStore()

  new_master = utils.HostInfo().name
  old_master = ss.GetMasterNode()

  if old_master == new_master:
    raise errors.OpPrereqError("This commands must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master)
  # end checks

  rcode = 0

  logging.info("setting master to %s, old master: %s", new_master, old_master)

  # stop the master role (daemons included) on the old master first
  if not rpc.call_node_stop_master(old_master, True):
    logging.error("could not disable the master role on the old master"
                  " %s, please disable manually", old_master)

  # record ourselves as master and push that file to all nodes
  ss.SetKey(ss.SS_MASTER_NODE, new_master)

  cfg = config.ConfigWriter()

  if not rpc.call_upload_file(cfg.GetNodeList(),
                              ss.KeyToFilename(ss.SS_MASTER_NODE)):
    logging.error("could not distribute the new simple store master file"
                  " to the other nodes, please check.")

  # finally start the master role (daemons included) on this node
  if not rpc.call_node_start_master(new_master, True):
    logging.error("could not start the master role on the new master"
                  " %s, please check", new_master)
    rcode = 1

  return rcode
......@@ -1684,78 +1684,6 @@ class LUAddNode(LogicalUnit):
self.context.glm.add(locking.LEVEL_NODE, node)
class LUMasterFailover(LogicalUnit):
  """Failover the master node to the current node.

  This is a special LU in that it must run on a non-master node.

  """
  HPATH = "master-failover"
  HTYPE = constants.HTYPE_CLUSTER
  REQ_MASTER = False
  REQ_WSSTORE = True
  _OP_REQP = []

  def BuildHooksEnv(self):
    """Build hooks env.

    This will run on the new master only in the pre phase, and on all
    the nodes in the post phase.

    Returns:
      the tuple (env, pre-phase node list, post-phase node list)

    """
    env = {
      "OP_TARGET": self.new_master,
      "NEW_MASTER": self.new_master,
      "OLD_MASTER": self.old_master,
      }
    return env, [self.new_master], self.cfg.GetNodeList()

  def CheckPrereq(self):
    """Check prerequisites.

    This checks that we are not already the master, and stores the old
    and new master names on the instance for the other methods.

    Raises:
      errors.OpPrereqError: if this node is already the master

    """
    self.new_master = utils.HostInfo().name
    self.old_master = self.sstore.GetMasterNode()

    if self.old_master == self.new_master:
      raise errors.OpPrereqError("This commands must be run on the node"
                                 " where you want the new master to be."
                                 " %s is already the master" %
                                 self.old_master)

  def Exec(self, feedback_fn):
    """Failover the master node.

    This command, when run on a non-master node, will cause the current
    master to cease being master, and the non-master to become new
    master.

    Args:
      feedback_fn: callable used to report a failed master activation
                   in addition to the log entry

    """
    #TODO: do not rely on gethostname returning the FQDN
    logger.Info("setting master to %s, old master: %s" %
                (self.new_master, self.old_master))

    # stop the master role on the old master first
    if not rpc.call_node_stop_master(self.old_master, True):
      logger.Error("could not disable the master role on the old master"
                   " %s, please disable manually" % self.old_master)

    # record the new master and push that file to all nodes
    ss = self.sstore
    ss.SetKey(ss.SS_MASTER_NODE, self.new_master)
    if not rpc.call_upload_file(self.cfg.GetNodeList(),
                                ss.KeyToFilename(ss.SS_MASTER_NODE)):
      logger.Error("could not distribute the new simple store master file"
                   " to the other nodes, please check.")

    # finally start the master role on the new master
    if not rpc.call_node_start_master(self.new_master, True):
      logger.Error("could not start the master role on the new master"
                   " %s, please check" % self.new_master)
      feedback_fn("Error in activating the master IP on the new master,"
                  " please fix manually.")
class LUQueryClusterInfo(NoHooksLU):
"""Query cluster configuration.
......
......@@ -47,7 +47,6 @@ class Processor(object):
opcodes.OpDestroyCluster: cmdlib.LUDestroyCluster,
opcodes.OpQueryClusterInfo: cmdlib.LUQueryClusterInfo,
opcodes.OpVerifyCluster: cmdlib.LUVerifyCluster,
opcodes.OpMasterFailover: cmdlib.LUMasterFailover,
opcodes.OpDumpClusterConfig: cmdlib.LUDumpClusterConfig,
opcodes.OpRenameCluster: cmdlib.LURenameCluster,
opcodes.OpVerifyDisks: cmdlib.LUVerifyDisks,
......
......@@ -215,12 +215,6 @@ class OpVerifyDisks(OpCode):
__slots__ = []
class OpMasterFailover(OpCode):
  """Opcode requesting a master failover; it declares no slots."""
  OP_ID = "OP_CLUSTER_MASTERFAILOVER"
  __slots__ = []
class OpDumpClusterConfig(OpCode):
"""Dump the cluster configuration."""
OP_ID = "OP_CLUSTER_DUMPCONFIG"
......
<!doctype refentry PUBLIC "-//OASIS//DTD DocBook V4.1//EN" [
<!-- Please adjust the date whenever revising the manpage. -->
<!ENTITY dhdate "<date>November 22, 2007</date>">
<!-- SECTION should be 1-8, maybe w/ subsection other parameters are
allowed: see man(7), man(1). -->
<!ENTITY dhsection "<manvolnum>8</manvolnum>">
<!ENTITY dhucpackage "<refentrytitle>ganeti-master</refentrytitle>">
<!ENTITY dhpackage "ganeti-master">
<!ENTITY debian "<productname>Debian</productname>">