From 66d1f03544c73fb058f3a2e283364aa6002cc50c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Nussbaumer?= <rn@google.com> Date: Wed, 16 Feb 2011 11:05:56 +0100 Subject: [PATCH] Introducing gnt-cluster epo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a convenience command to do an automated EPO in the possible limits of Ganeti. Signed-off-by: RenΓ© Nussbaumer <rn@google.com> Reviewed-by: Michael Hanselmann <hansmi@google.com> --- Makefile.am | 1 + lib/client/gnt_cluster.py | 332 +++++++++++++++++++++ man/gnt-cluster.rst | 17 ++ qa/ganeti-qa.py | 1 + qa/qa_cluster.py | 41 ++- test/ganeti.client.gnt_cluster_unittest.py | 142 +++++++++ 6 files changed, 533 insertions(+), 1 deletion(-) create mode 100755 test/ganeti.client.gnt_cluster_unittest.py diff --git a/Makefile.am b/Makefile.am index 9652322b4..4aaae3fec 100644 --- a/Makefile.am +++ b/Makefile.am @@ -471,6 +471,7 @@ python_tests = \ test/ganeti.backend_unittest.py \ test/ganeti.bdev_unittest.py \ test/ganeti.cli_unittest.py \ + test/ganeti.client.gnt_cluster_unittest.py \ test/ganeti.client.gnt_instance_unittest.py \ test/ganeti.daemon_unittest.py \ test/ganeti.cmdlib_unittest.py \ diff --git a/lib/client/gnt_cluster.py b/lib/client/gnt_cluster.py index 1755ef963..dcd07928b 100644 --- a/lib/client/gnt_cluster.py +++ b/lib/client/gnt_cluster.py @@ -29,6 +29,7 @@ import os.path import time import OpenSSL +import itertools from ganeti.cli import * from ganeti import opcodes @@ -40,6 +41,20 @@ from ganeti import ssh from ganeti import objects from ganeti import uidpool from ganeti import compat +from ganeti import netutils + + +ON_OPT = cli_option("--on", default=False, + action="store_true", dest="on", + help="Recover from an EPO") + +GROUPS_OPT = cli_option("--groups", default=False, + action="store_true", dest="groups", + help="Arguments are node groups instead of nodes") + +_EPO_PING_INTERVAL = 30 # 30 seconds between pings +_EPO_PING_TIMEOUT = 1 # 1 second +_EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes @UsesRPC @@ -882,6 +897,318 @@ def WatcherOps(opts, args): return 0 +def _OobPower(opts, node_list, power): + """Puts the node in the list to desired power state. + + @param opts: The command line options selected by the user + @param node_list: The list of nodes to operate on + @param power: True if they should be powered on, False otherwise + @return: The success of the operation (none failed) + + """ + if power: + command = constants.OOB_POWER_ON + else: + command = constants.OOB_POWER_OFF + + op = opcodes.OpOobCommand(node_names=node_list, + command=command, + ignore_status=True, + timeout=opts.oob_timeout) + result = SubmitOpCode(op, opts=opts) + errs = 0 + for node_result in result: + (node_tuple, data_tuple) = node_result + (_, node_name) = node_tuple + (data_status, _) = data_tuple + if data_status != constants.RS_NORMAL: + assert data_status != constants.RS_UNAVAIL + errs += 1 + ToStderr("There was a problem changing power for %s, please investigate", + node_name) + + if errs > 0: + return False + + return True + + +def _InstanceStart(opts, inst_list, start): + """Puts the instances in the list to desired state. + + @param opts: The command line options selected by the user + @param inst_list: The list of instances to operate on + @param start: True if they should be started, False for shutdown + @return: The success of the operation (none failed) + + """ + if start: + opcls = opcodes.OpInstanceStartup + text_submit, text_success, text_failed = ("startup", "started", "starting") + else: + opcls = opcodes.OpInstanceShutdown + text_submit, text_success, text_failed = ("shutdown", "stopped", "stopping") + + jex = JobExecutor(opts=opts) + + for inst in inst_list: + ToStdout("Submit %s of instance %s", text_submit, inst) + op = opcls(instance_name=inst) + jex.QueueJob(inst, op) + + results = jex.GetResults() + bad_cnt = len([1 for (success, _) in results if not success]) + + if bad_cnt == 0: + ToStdout("All instances have been %s successfully", text_success) + else: + ToStderr("There were errors while %s instances:\n" + "%d error(s) out of %d instance(s)", text_failed, bad_cnt, + len(results)) + return False + + return True + + +class _RunWhenNodesReachableHelper: + """Helper class to make shared internal state sharing easier. + + @ivar success: Indicates if all action_cb calls were successful + + """ + def __init__(self, node_list, action_cb, node2ip, port, + _ping_fn=netutils.TcpPing, _sleep_fn=time.sleep): + """Init the object. + + @param node_list: The list of nodes to be reachable + @param action_cb: Callback called when a new host is reachable + @type node2ip: dict + @param node2ip: Node to ip mapping + @param port: The port to use for the TCP ping + @param _ping_fn: Function to check reachabilty (for unittest use only) + @param _sleep_fn: Function to sleep (for unittest use only) + + """ + self.down = set(node_list) + self.up = set() + self.node2ip = node2ip + self.success = True + self.action_cb = action_cb + self.port = port + self._ping_fn = _ping_fn + self._sleep_fn = _sleep_fn + + def __call__(self): + """When called we run action_cb. + + @raises utils.RetryAgain: When there are still down nodes + + """ + if not self.action_cb(self.up): + self.success = False + + if self.down: + raise utils.RetryAgain() + else: + return self.success + + def Wait(self, secs): + """Checks if a host is up or waits remaining seconds. + + @param secs: The secs remaining + + """ + start = time.time() + for node in self.down: + if self._ping_fn(self.node2ip[node], self.port, timeout=_EPO_PING_TIMEOUT, + live_port_needed=True): + ToStdout("Node %s became available", node) + self.up.add(node) + self.down -= self.up + # If we have a node available there is the possibility to run the + # action callback successfully, therefore we don't wait and return + return + + self._sleep_fn(max(0.0, start + secs - time.time())) + + +def _RunWhenNodesReachable(node_list, action_cb, interval): + """Run action_cb when nodes become reachable. + + @param node_list: The list of nodes to be reachable + @param action_cb: Callback called when a new host is reachable + @param interval: The earliest time to retry + + """ + client = GetClient() + cluster_info = client.QueryClusterInfo() + if cluster_info["primary_ip_version"] == constants.IP4_VERSION: + family = netutils.IPAddress.family + else: + family = netutils.IP6Address.family + + node2ip = dict((node, netutils.GetHostname(node, family=family).ip) + for node in node_list) + + port = netutils.GetDaemonPort(constants.NODED) + helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip, port) + + try: + return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT, + wait_fn=helper.Wait) + except utils.RetryTimeout: + ToStderr("Time exceeded while waiting for nodes to become reachable" + " again:\n - %s", " - ".join(helper.down)) + return False + + +def _MaybeInstanceStartup(opts, inst_map, nodes_online, + _instance_start_fn=_InstanceStart): + """Start the instances conditional based on node_states. + + @param opts: The command line options selected by the user + @param inst_map: A dict of inst -> nodes mapping + @param nodes_online: A list of nodes online + @param _instance_start_fn: Callback to start instances (unittest use only) + @return: Success of the operation on all instances + + """ + start_inst_list = [] + for (inst, nodes) in inst_map.items(): + if not (nodes - nodes_online): + # All nodes the instance lives on are back online + start_inst_list.append(inst) + + for inst in start_inst_list: + del inst_map[inst] + + if start_inst_list: + return _instance_start_fn(opts, start_inst_list, True) + + return True + + +def _EpoOn(opts, full_node_list, node_list, inst_map): + """Does the actual power on. + + @param opts: The command line options selected by the user + @param full_node_list: All nodes to operate on (includes nodes not supporting + OOB) + @param node_list: The list of nodes to operate on (all need to support OOB) + @param inst_map: A dict of inst -> nodes mapping + @return: The desired exit status + + """ + if node_list and not _OobPower(opts, node_list, False): + ToStderr("Not all nodes seem to get back up, investigate and start" + " manually if needed") + + # Wait for the nodes to be back up + action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map)) + + ToStdout("Waiting until all nodes are available again") + if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL): + ToStderr("Please investigate and start stopped instances manually") + return constants.EXIT_FAILURE + + return constants.EXIT_SUCCESS + + +def _EpoOff(opts, node_list, inst_map): + """Does the actual power off. + + @param opts: The command line options selected by the user + @param node_list: The list of nodes to operate on (all need to support OOB) + @param inst_map: A dict of inst -> nodes mapping + @return: The desired exit status + + """ + if not _InstanceStart(opts, inst_map.keys(), False): + ToStderr("Please investigate and stop instances manually before continuing") + return constants.EXIT_FAILURE + + if not node_list: + return constants.EXIT_SUCCESS + + if _OobPower(opts, node_list, False): + return constants.EXIT_SUCCESS + else: + return constants.EXIT_FAILURE + + +def Epo(opts, args): + """EPO operations. + + @param opts: the command line options selected by the user + @type args: list + @param args: should contain only one element, the subcommand + @rtype: int + @return: the desired exit code + + """ + if opts.groups and opts.show_all: + ToStderr("Only one of --groups or --all are allowed") + return constants.EXIT_FAILURE + elif args and opts.show_all: + ToStderr("Arguments in combination with --all are not allowed") + return constants.EXIT_FAILURE + + client = GetClient() + + if opts.groups: + node_query_list = itertools.chain(*client.QueryGroups(names=args, + fields=["node_list"], + use_locking=False)) + else: + node_query_list = args + + result = client.QueryNodes(names=node_query_list, + fields=["name", "master", "pinst_list", + "sinst_list", "powered", "offline"], + use_locking=False) + node_list = [] + inst_map = {} + for (idx, (node, master, pinsts, sinsts, powered, + offline)) in enumerate(result): + # Normalize the node_query_list as well + if not opts.show_all: + node_query_list[idx] = node + if not offline: + for inst in (pinsts + sinsts): + if inst in inst_map: + if not master: + inst_map[inst].add(node) + elif master: + inst_map[inst] = set() + else: + inst_map[inst] = set([node]) + + if master and opts.on: + # We ignore the master for turning on the machines, in fact we are + # already operating on the master at this point :) + continue + elif master and not opts.show_all: + ToStderr("%s is the master node, please do a master-failover to another" + " node not affected by the EPO or use --all if you intend to" + " shutdown the whole cluster", node) + return constants.EXIT_FAILURE + elif powered is None: + ToStdout("Node %s does not support out-of-band handling, it can not be" + " handled in a fully automated manner", node) + elif powered == opts.on: + ToStdout("Node %s is already in desired power state, skipping", node) + elif not offline or (offline and powered): + node_list.append(node) + + if not opts.force and not ConfirmOperation(node_query_list, "nodes", "epo"): + return constants.EXIT_FAILURE + + if opts.on: + return _EpoOn(opts, node_query_list, node_list, inst_map) + else: + return _EpoOff(opts, node_list, inst_map) + + commands = { 'init': ( InitCluster, [ArgHost(min=1, max=1)], @@ -977,6 +1304,11 @@ commands = { NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT], "[opts...]", "Renews cluster certificates, keys and secrets"), + "epo": ( + Epo, [ArgUnknown()], + [FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT], + "[opts...] [args]", + "Performs an emergency power-off on given args"), } diff --git a/man/gnt-cluster.rst b/man/gnt-cluster.rst index 5a6532442..d0c80e82a 100644 --- a/man/gnt-cluster.rst +++ b/man/gnt-cluster.rst @@ -93,6 +93,23 @@ Remove all configuration files related to the cluster, so that a Since this is a dangerous command, you are required to pass the argument *--yes-do-it.* +EPO +~~~ + +**epo** [--on] [--groups|--all] *arguments* + +Performs an emergency power-off on nodes given as arguments. If ``--groups`` +is given, arguments are node groups. If ``--all`` is provided, the whole +cluster will be shut down. + +The ``--on`` flag recovers the cluster after an emergency power-off + +Please note that the master node will not be turned down or up automatically. +It will just be left in a state, where you can manully perform the shutdown of +that one node. If the master is in the list of affected nodes and this is not a +complete cluster emergency power-off (e.g. using ``--all``), you're required to +do a master failover to another node not affected. + GETMASTER ~~~~~~~~~ diff --git a/qa/ganeti-qa.py b/qa/ganeti-qa.py index 5a3f44132..aafdea89f 100755 --- a/qa/ganeti-qa.py +++ b/qa/ganeti-qa.py @@ -411,6 +411,7 @@ def RunQa(): instance = RunTest(qa_instance.TestInstanceAddWithPlainDisk, pnode) RunCommonInstanceTests(instance) RunGroupListTests() + RunTest(qa_cluster.TestClusterEpo) RunExportImportTests(instance, pnode, None) RunDaemonTests(instance, pnode) RunTest(qa_instance.TestInstanceRemove, instance) diff --git a/qa/qa_cluster.py b/qa/qa_cluster.py index 176bcd9eb..809be4aef 100644 --- a/qa/qa_cluster.py +++ b/qa/qa_cluster.py @@ -27,13 +27,14 @@ import tempfile import os.path from ganeti import constants +from ganeti import compat from ganeti import utils import qa_config import qa_utils import qa_error -from qa_utils import AssertEqual, AssertCommand +from qa_utils import AssertEqual, AssertCommand, GetCommandOutput def _RemoveFileFromAllNodes(filename): @@ -150,6 +151,44 @@ def TestClusterOob(): "oob_program="]) +def TestClusterEpo(): + """gnt-cluster epo""" + master = qa_config.GetMasterNode() + + # Assert that OOB is unavailable for all nodes + result_output = GetCommandOutput(master["primary"], + "gnt-node list --verbose --no-header -o" + " powered") + AssertEqual(compat.all(powered == "(unavail)" + for powered in result_output.splitlines()), True) + + # Conflicting + AssertCommand(["gnt-cluster", "epo", "--groups", "--all"], fail=True) + # --all doesn't expect arguments + AssertCommand(["gnt-cluster", "epo", "--all", "some_arg"], fail=True) + + # Unless --all is given master is not allowed to be in the list + AssertCommand(["gnt-cluster", "epo", "-f", master["primary"]], fail=True) + + # This shouldn't fail + AssertCommand(["gnt-cluster", "epo", "-f", "--all"]) + + # All instances should have been stopped now + result_output = GetCommandOutput(master["primary"], + "gnt-instance list --no-header -o status") + AssertEqual(compat.all(status == "ADMIN_down" + for status in result_output.splitlines()), True) + + # Now start everything again + AssertCommand(["gnt-cluster", "epo", "--on", "-f", "--all"]) + + # All instances should have been started now + result_output = GetCommandOutput(master["primary"], + "gnt-instance list --no-header -o status") + AssertEqual(compat.all(status == "running" + for status in result_output.splitlines()), True) + + def TestClusterVerify(): """gnt-cluster verify""" AssertCommand(["gnt-cluster", "verify"]) diff --git a/test/ganeti.client.gnt_cluster_unittest.py b/test/ganeti.client.gnt_cluster_unittest.py new file mode 100755 index 000000000..3a6b05141 --- /dev/null +++ b/test/ganeti.client.gnt_cluster_unittest.py @@ -0,0 +1,142 @@ +#!/usr/bin/python +# + +# Copyright (C) 2011 Google Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301, USA. + + +"""Script for testing ganeti.client.gnt_cluster""" + +import unittest + +from ganeti.client import gnt_cluster +from ganeti import utils +from ganeti import compat + +import testutils + + +class TestEpo(unittest.TestCase): + def setUp(self): + self.nodes2ip = dict(("node%s" % i, "192.0.2.%s" % i) for i in range(1, 10)) + self.nodes = set(self.nodes2ip.keys()) + self.ips2node = dict((v, k) for (k, v) in self.nodes2ip.items()) + + def _FakeAction(*args): + return True + + def _FakePing(ip, port, live_port_needed=False): + self.assert_(live_port_needed) + self.assertEqual(port, 0) + return True + + def _FakeSleep(secs): + self.assert_(secs >= 0 and secs <= 5) + return + + def testPingFnRemoveHostsUp(self): + seen = set() + def _FakeSeenPing(ip, *args, **kwargs): + node = self.ips2node[ip] + self.assertFalse(node in seen) + seen.add(node) + return True + + helper = gnt_cluster._RunWhenNodesReachableHelper(self.nodes, + self._FakeAction, + self.nodes2ip, port=0, + _ping_fn=_FakeSeenPing, + _sleep_fn=self._FakeSleep) + + nodes_len = len(self.nodes) + for (num, _) in enumerate(self.nodes): + helper.Wait(5) + if num < nodes_len - 1: + self.assertRaises(utils.RetryAgain, helper) + else: + helper() + + self.assertEqual(seen, self.nodes) + self.assertFalse(helper.down) + self.assertEqual(helper.up, self.nodes) + + def testActionReturnFalseSetsHelperFalse(self): + called = False + def _FalseAction(*args): + return called + + helper = gnt_cluster._RunWhenNodesReachableHelper(self.nodes, _FalseAction, + self.nodes2ip, port=0, + _ping_fn=self._FakePing, + _sleep_fn=self._FakeSleep) + for _ in self.nodes: + try: + helper() + except utils.RetryAgain: + called = True + + self.assertFalse(helper.success) + + def testMaybeInstanceStartup(self): + instances_arg = [] + def _FakeInstanceStart(opts, instances, start): + instances_arg.append(set(instances)) + return None + + inst_map = { + "inst1": set(["node1", "node2"]), + "inst2": set(["node1", "node3"]), + "inst3": set(["node2", "node1"]), + "inst4": set(["node2", "node1", "node3"]), + "inst5": set(["node4"]), + } + + fn = _FakeInstanceStart + self.assert_(gnt_cluster._MaybeInstanceStartup(None, inst_map, set(), + _instance_start_fn=fn)) + self.assertFalse(instances_arg) + result = gnt_cluster._MaybeInstanceStartup(None, inst_map, set(["node1"]), + _instance_start_fn=fn) + self.assert_(result) + self.assertFalse(instances_arg) + result = gnt_cluster._MaybeInstanceStartup(None, inst_map, + set(["node1", "node3"]), + _instance_start_fn=fn) + self.assert_(result is None) + self.assertEqual(instances_arg.pop(0), set(["inst2"])) + self.assertFalse("inst2" in inst_map) + result = gnt_cluster._MaybeInstanceStartup(None, inst_map, + set(["node1", "node3"]), + _instance_start_fn=fn) + self.assert_(result) + self.assertFalse(instances_arg) + result = gnt_cluster._MaybeInstanceStartup(None, inst_map, + set(["node1", "node3", "node2"]), + _instance_start_fn=fn) + self.assertEqual(instances_arg.pop(0), set(["inst1", "inst3", "inst4"])) + self.assert_(result is None) + result = gnt_cluster._MaybeInstanceStartup(None, inst_map, + set(["node1", "node3", "node2", + "node4"]), + _instance_start_fn=fn) + self.assert_(result is None) + self.assertEqual(instances_arg.pop(0), set(["inst5"])) + self.assertFalse(inst_map) + + +if __name__ == "__main__": + testutils.GanetiTestProgram() -- GitLab