From f5118ade209718ec501e3ae63b654358606eb06b Mon Sep 17 00:00:00 2001 From: Iustin Pop <iustin@google.com> Date: Tue, 26 May 2009 19:41:19 +0200 Subject: [PATCH] Add a node powercycle command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This (somewhat big) patch adds support for remotely rebooting the nodes via whatever support the hypervisor has for such a concept. For KVM/fake (and containers in the future) this just uses sysrq plus a “reboot” call if the sysrq method failed. For Xen, it first tries the above, and then Xen-hypervisor reboot (we first try sysrq since that just requires opening a file handle, whereas xen reboot means launching an external utility). The user interface is: # gnt-node powercycle node5 Are you sure you want to hard powercycle node node5? y/[n]/?: y Reboot scheduled in 5 seconds The node reboots hopefully after sending the reply. In case the clock is broken, “time.sleep(5)” might take ages (but then I suspect SSL negotiation wouldn't work). Signed-off-by: Iustin Pop <iustin@google.com> Reviewed-by: Guido Trotter <ultrotter@google.com> --- daemons/ganeti-noded | 9 ++++++++ lib/backend.py | 19 +++++++++++++++++ lib/cmdlib.py | 45 +++++++++++++++++++++++++++++++++++++++ lib/hypervisor/hv_base.py | 29 +++++++++++++++++++++++++ lib/hypervisor/hv_fake.py | 7 ++++++ lib/hypervisor/hv_kvm.py | 7 ++++++ lib/hypervisor/hv_xen.py | 17 +++++++++++++++ lib/mcpu.py | 1 + lib/opcodes.py | 10 +++++++++ lib/rpc.py | 10 +++++++++ man/gnt-node.sgml | 30 ++++++++++++++++++++++++++ scripts/gnt-node | 24 +++++++++++++++++++++ 12 files changed, 208 insertions(+) diff --git a/daemons/ganeti-noded b/daemons/ganeti-noded index 21b9b5074..c29db720c 100755 --- a/daemons/ganeti-noded +++ b/daemons/ganeti-noded @@ -552,6 +552,15 @@ class NodeHttpServer(http.server.HttpServer): return backend.DemoteFromMC() + @staticmethod + def perspective_node_powercycle(params): + """Tries to powercycle the node. 
+ + """ + hypervisor_type = params[0] + return backend.PowercycleNode(hypervisor_type) + + # cluster -------------------------- @staticmethod diff --git a/lib/backend.py b/lib/backend.py index 183286b36..a01f95673 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -2437,6 +2437,25 @@ def DrbdWaitSync(nodes_ip, disks): return (not failure, (alldone, min_resync)) +def PowercycleNode(hypervisor_type): + """Hard-powercycle the node. + + Because we need to return first, and schedule the powercycle in the + background, we won't be able to report failures nicely. + + """ + hyper = hypervisor.GetHypervisor(hypervisor_type) + try: + pid = os.fork() + except OSError, err: + # if we can't fork, we'll pretend that we're in the child process + pid = 0 + if pid > 0: + return (True, "Reboot scheduled in 5 seconds") + time.sleep(5) + hyper.PowercycleNode() + + class HooksRunner(object): """Hook runner. diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 27dfe5f45..801d00540 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -2423,6 +2423,51 @@ class LUSetNodeParams(LogicalUnit): return result +class LUPowercycleNode(NoHooksLU): + """Powercycles a node. + + """ + _OP_REQP = ["node_name", "force"] + REQ_BGL = False + + def CheckArguments(self): + node_name = self.cfg.ExpandNodeName(self.op.node_name) + if node_name is None: + raise errors.OpPrereqError("Invalid node name '%s'" % self.op.node_name) + self.op.node_name = node_name + if node_name == self.cfg.GetMasterNode() and not self.op.force: + raise errors.OpPrereqError("The node is the master and the force" + " parameter was not set") + + def ExpandNames(self): + """Locking for PowercycleNode. + + This is a last-resource option and shouldn't block on other + jobs. Therefore, we grab no locks. + + """ + self.needed_locks = {} + + def CheckPrereq(self): + """Check prerequisites. + + This LU has no prereqs. + + """ + pass + + def Exec(self, feedback_fn): + """Reboots a node. 
+ + """ + result = self.rpc.call_node_powercycle(self.op.node_name, + self.cfg.GetHypervisorType()) + msg = result.RemoteFailMsg() + if msg: + raise errors.OpExecError("Failed to schedule the reboot: %s" % msg) + return result.payload + + class LUQueryClusterInfo(NoHooksLU): """Query cluster configuration. diff --git a/lib/hypervisor/hv_base.py b/lib/hypervisor/hv_base.py index fd675f1b2..88c13f44f 100644 --- a/lib/hypervisor/hv_base.py +++ b/lib/hypervisor/hv_base.py @@ -286,6 +286,18 @@ class BaseHypervisor(object): " validation: %s (current value: '%s')" % (name, errstr, value)) + @classmethod + def PowercycleNode(cls): + """Hard powercycle a node using hypervisor specific methods. + + This method should hard powercycle the node, using whatever + methods the hypervisor provides. Note that this means that all + instances running on the node must be stopped too. + + """ + raise NotImplementedError + + def GetLinuxNodeInfo(self): """For linux systems, return actual OS information. @@ -346,3 +358,20 @@ class BaseHypervisor(object): result['cpu_sockets'] = 1 return result + + @classmethod + def LinuxPowercycle(cls): + """Linux-specific powercycle method. + + """ + try: + fd = os.open("/proc/sysrq-trigger", os.O_WRONLY) + try: + os.write(fd, "b") + finally: + fd.close() + except OSError: + logging.exception("Can't open the sysrq-trigger file") + result = utils.RunCmd(["reboot", "-n", "-f"]) + if not result: + logging.error("Can't run shutdown: %s", result.output) diff --git a/lib/hypervisor/hv_fake.py b/lib/hypervisor/hv_fake.py index ccac84277..52d85bc96 100644 --- a/lib/hypervisor/hv_fake.py +++ b/lib/hypervisor/hv_fake.py @@ -186,3 +186,10 @@ class FakeHypervisor(hv_base.BaseHypervisor): """ if not os.path.exists(self._ROOT_DIR): return "The required directory '%s' does not exist." % self._ROOT_DIR + + @classmethod + def PowercycleNode(cls): + """Fake hypervisor powercycle, just a wrapper over Linux powercycle. 
+ + """ + cls.LinuxPowercycle() diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py index 507e23a07..e2114751a 100644 --- a/lib/hypervisor/hv_kvm.py +++ b/lib/hypervisor/hv_kvm.py @@ -721,3 +721,10 @@ class KVMHypervisor(hv_base.BaseHypervisor): hvparams[constants.HV_NIC_TYPE] == constants.HT_NIC_PARAVIRTUAL): raise errors.HypervisorError("Cannot boot from a paravirtual NIC. Please" " change the NIC type.") + + @classmethod + def PowercycleNode(cls): + """KVM powercycle, just a wrapper over Linux powercycle. + + """ + cls.LinuxPowercycle() diff --git a/lib/hypervisor/hv_xen.py b/lib/hypervisor/hv_xen.py index 614e6806b..6693c6bac 100644 --- a/lib/hypervisor/hv_xen.py +++ b/lib/hypervisor/hv_xen.py @@ -402,6 +402,23 @@ class XenHypervisor(hv_base.BaseHypervisor): except EnvironmentError: logging.exception("Failure while removing instance config file") + @classmethod + def PowercycleNode(cls): + """Xen-specific powercycle. + + This first does a Linux reboot (which triggers automatically a Xen + reboot), and if that fails it tries to do a Xen reboot. The reason + we don't try a Xen reboot first is that the xen reboot launches an + external command which connects to the Xen hypervisor, and that + won't work in case the root filesystem is broken and/or the xend + daemon is not working. 
+ + """ + try: + cls.LinuxPowercycle() + finally: + utils.RunCmd(["xm", "debug", "R"]) + class XenPvmHypervisor(XenHypervisor): """Xen PVM hypervisor interface""" diff --git a/lib/mcpu.py b/lib/mcpu.py index 251276265..2e75f5643 100644 --- a/lib/mcpu.py +++ b/lib/mcpu.py @@ -56,6 +56,7 @@ class Processor(object): opcodes.OpQueryNodeVolumes: cmdlib.LUQueryNodeVolumes, opcodes.OpRemoveNode: cmdlib.LURemoveNode, opcodes.OpSetNodeParams: cmdlib.LUSetNodeParams, + opcodes.OpPowercycleNode: cmdlib.LUPowercycleNode, # instance lu opcodes.OpCreateInstance: cmdlib.LUCreateInstance, opcodes.OpReinstallInstance: cmdlib.LUReinstallInstance, diff --git a/lib/opcodes.py b/lib/opcodes.py index 535db910d..f49bfb7f5 100644 --- a/lib/opcodes.py +++ b/lib/opcodes.py @@ -340,6 +340,16 @@ class OpSetNodeParams(OpCode): "drained", ] + +class OpPowercycleNode(OpCode): + """Tries to powercycle a node.""" + OP_ID = "OP_NODE_POWERCYCLE" + OP_DSC_FIELD = "node_name" + __slots__ = [ + "node_name", + "force", + ] + # instance opcodes class OpCreateInstance(OpCode): diff --git a/lib/rpc.py b/lib/rpc.py index 11cdb1f18..6dc8ab4d3 100644 --- a/lib/rpc.py +++ b/lib/rpc.py @@ -1030,6 +1030,16 @@ class RpcRunner(object): """ return self._SingleNodeCall(node, "node_demote_from_mc", []) + + def call_node_powercycle(self, node, hypervisor): + """Tries to powercycle a node. + + This is a single-node call. + + """ + return self._SingleNodeCall(node, "node_powercycle", [hypervisor]) + + def call_test_delay(self, node_list, duration): """Sleep for a fixed time on given node(s). 
diff --git a/man/gnt-node.sgml b/man/gnt-node.sgml index b3c4f3e26..f00440d71 100644 --- a/man/gnt-node.sgml +++ b/man/gnt-node.sgml @@ -684,6 +684,36 @@ node1.example.com /dev/hdc1 xenvg instance1.example.com-sda_11001.data 256 inst </para> </refsect2> + <refsect2> + <title>POWERCYCLE</title> + + <cmdsynopsis> + <command>powercycle</command> + <arg><option>--confirm</option></arg> + <arg><option>--force</option></arg> + <arg choice="req"><replaceable>node</replaceable></arg> + </cmdsynopsis> + + <para> + This command (tries to) forcefully reboot a node. It is a + command that can be used if the node environment is broken, + such that the admin can no longer login over ssh, but the + ganeti node daemon is still working. + </para> + + <para> + Note that this command is not guaranteed to work; it depends + on the hypervisor how effective the reboot attempt is. For + Linux, this command requires that the kernel option + <literal>CONFIG_MAGIC_SYSRQ</literal> is enabled. + </para> + + <para> + The <option>--yes</option> option can be used to skip + confirmation, while the <option>--force</option> option is + needed if the target node is the master node. + </para> </refsect1> &footer; diff --git a/scripts/gnt-node b/scripts/gnt-node index 5259678cd..c60ab41e4 100755 --- a/scripts/gnt-node +++ b/scripts/gnt-node @@ -391,6 +391,28 @@ def RemoveNode(opts, args): return 0 +def PowercycleNode(opts, args): + """Hard powercycle a node. + + @param opts: the command line options selected by the user + @type args: list + @param args: should contain only one element, the name of + the node to be powercycled + @rtype: int + @return: the desired exit code + + """ + node = args[0] + if (not opts.confirm and + not AskUser("Are you sure you want to hard powercycle node %s?" 
% node)): + return 2 + + op = opcodes.OpPowercycleNode(node_name=node, force=opts.force) + result = SubmitOpCode(op) + ToStderr(result) + return 0 + + def ListVolumes(opts, args): """List logical volumes on node(s). @@ -552,6 +574,8 @@ commands = { help="Set the drained flag on the node"), ], "<instance>", "Alters the parameters of an instance"), + 'powercycle': (PowercycleNode, ARGS_ONE, [DEBUG_OPT, FORCE_OPT, CONFIRM_OPT], + "<node_name>", "Tries to forcefully powercycle a node"), 'remove': (RemoveNode, ARGS_ONE, [DEBUG_OPT], "<node_name>", "Removes a node from the cluster"), 'volumes': (ListVolumes, ARGS_ANY, -- GitLab