Commit f5118ade authored by Iustin Pop's avatar Iustin Pop
Browse files

Add a node powercycle command



This (somewhat big) patch adds support for remotely rebooting the nodes
via whatever support the hypervisor has for such a concept.

For KVM/fake (and containers in the future) this just uses sysrq plus a
‘reboot’ call if the sysrq method failed. For Xen, it first tries the
above, and then Xen-hypervisor reboot (we first try sysrq since that
just requires opening a file handle, whereas xen reboot means launching
an external utility).

The user interface is:

    # gnt-node powercycle node5
    Are you sure you want to hard powercycle node node5?
    y/[n]/?: y
    Reboot scheduled in 5 seconds

The node reboots hopefully after sending the reply. In case the clock is
broken, “time.sleep(5)” might take ages (but then I suspect SSL
negotiation wouldn't work).
Signed-off-by: default avatarIustin Pop <iustin@google.com>
Reviewed-by: default avatarGuido Trotter <ultrotter@google.com>
parent d48031a1
......@@ -552,6 +552,15 @@ class NodeHttpServer(http.server.HttpServer):
return backend.DemoteFromMC()
@staticmethod
def perspective_node_powercycle(params):
  """Tries to powercycle the node.

  @param params: single-element list containing the hypervisor type
      whose powercycle method should be used
  @return: the result of L{backend.PowercycleNode}

  """
  hypervisor_type = params[0]
  return backend.PowercycleNode(hypervisor_type)
# cluster --------------------------
@staticmethod
......
......@@ -2437,6 +2437,25 @@ def DrbdWaitSync(nodes_ip, disks):
return (not failure, (alldone, min_resync))
def PowercycleNode(hypervisor_type):
  """Hard-powercycle the node.

  Because we need to return first, and schedule the powercycle in the
  background, we won't be able to report failures nicely.

  @param hypervisor_type: name of the hypervisor whose powercycle
      method will be used (see hypervisor.GetHypervisor)
  @return: (True, message) tuple; sent back to the caller before the
      actual reboot takes place

  """
  hyper = hypervisor.GetHypervisor(hypervisor_type)
  try:
    pid = os.fork()
  except OSError, err:
    # if we can't fork, we'll pretend that we're in the child process
    pid = 0
  if pid > 0:
    # parent process: report success right away; the forked child will
    # perform the actual powercycle in the background
    return (True, "Reboot scheduled in 5 seconds")
  # child process (or fork failure): sleep so the RPC reply above has
  # time to reach the caller before the machine goes down; if the
  # system clock is broken this sleep may take much longer than 5s
  time.sleep(5)
  hyper.PowercycleNode()
class HooksRunner(object):
"""Hook runner.
......
......@@ -2423,6 +2423,51 @@ class LUSetNodeParams(LogicalUnit):
return result
class LUPowercycleNode(NoHooksLU):
  """Powercycles a node.

  """
  _OP_REQP = ["node_name", "force"]
  REQ_BGL = False

  def CheckArguments(self):
    """Validate and canonicalise the target node name.

    Powercycling the master node is refused unless the force flag
    was given.

    """
    expanded = self.cfg.ExpandNodeName(self.op.node_name)
    if expanded is None:
      raise errors.OpPrereqError("Invalid node name '%s'" % self.op.node_name)
    self.op.node_name = expanded
    target_is_master = (expanded == self.cfg.GetMasterNode())
    if target_is_master and not self.op.force:
      raise errors.OpPrereqError("The node is the master and the force"
                                 " parameter was not set")

  def ExpandNames(self):
    """Locking for PowercycleNode.

    This is a last-resource option and shouldn't block on other
    jobs. Therefore, we grab no locks.

    """
    self.needed_locks = {}

  def CheckPrereq(self):
    """Check prerequisites.

    This LU has no prereqs.

    """
    pass

  def Exec(self, feedback_fn):
    """Reboots a node.

    Issues the powercycle RPC to the target node and returns the
    node daemon's reply message.

    """
    hv_type = self.cfg.GetHypervisorType()
    result = self.rpc.call_node_powercycle(self.op.node_name, hv_type)
    fail_msg = result.RemoteFailMsg()
    if fail_msg:
      raise errors.OpExecError("Failed to schedule the reboot: %s" % fail_msg)
    return result.payload
class LUQueryClusterInfo(NoHooksLU):
"""Query cluster configuration.
......
......@@ -286,6 +286,18 @@ class BaseHypervisor(object):
" validation: %s (current value: '%s')" %
(name, errstr, value))
@classmethod
def PowercycleNode(cls):
  """Hard powercycle a node using hypervisor specific methods.

  Abstract method: concrete hypervisors must override this with a
  mechanism that forcibly reboots the node. Note that this means
  that all instances running on the node will be stopped too.

  @raise NotImplementedError: always, in this base class

  """
  raise NotImplementedError
def GetLinuxNodeInfo(self):
"""For linux systems, return actual OS information.
......@@ -346,3 +358,20 @@ class BaseHypervisor(object):
result['cpu_sockets'] = 1
return result
@classmethod
def LinuxPowercycle(cls):
  """Linux-specific powercycle method.

  First tries to trigger an immediate reboot by writing "b" to the
  magic sysrq trigger file (requires CONFIG_MAGIC_SYSRQ in the
  kernel); if that file cannot be used, falls back to running the
  "reboot" command.

  """
  try:
    fd = os.open("/proc/sysrq-trigger", os.O_WRONLY)
    try:
      os.write(fd, "b")
    finally:
      # os.open() returns a raw file descriptor (an int), which has no
      # .close() method; the original "fd.close()" raised an
      # AttributeError that the OSError handler below did not catch
      os.close(fd)
  except OSError:
    logging.exception("Can't open the sysrq-trigger file")
  result = utils.RunCmd(["reboot", "-n", "-f"])
  if not result:
    # NOTE(review): RunCmd results are usually checked via
    # result.failed; "not result" relies on the result object's
    # truthiness -- confirm against utils.RunCmd
    logging.error("Can't run shutdown: %s", result.output)
......@@ -186,3 +186,10 @@ class FakeHypervisor(hv_base.BaseHypervisor):
"""
if not os.path.exists(self._ROOT_DIR):
return "The required directory '%s' does not exist." % self._ROOT_DIR
@classmethod
def PowercycleNode(cls):
  """Powercycle the node.

  The fake hypervisor has no hypervisor-level reboot mechanism, so
  this simply delegates to the generic Linux powercycle helper.

  """
  cls.LinuxPowercycle()
......@@ -721,3 +721,10 @@ class KVMHypervisor(hv_base.BaseHypervisor):
hvparams[constants.HV_NIC_TYPE] == constants.HT_NIC_PARAVIRTUAL):
raise errors.HypervisorError("Cannot boot from a paravirtual NIC. Please"
" change the NIC type.")
@classmethod
def PowercycleNode(cls):
  """Powercycle the node.

  KVM has no hypervisor-level reboot mechanism of its own, so this
  simply delegates to the generic Linux powercycle helper.

  """
  cls.LinuxPowercycle()
......@@ -402,6 +402,23 @@ class XenHypervisor(hv_base.BaseHypervisor):
except EnvironmentError:
logging.exception("Failure while removing instance config file")
@classmethod
def PowercycleNode(cls):
  """Xen-specific powercycle.

  The generic Linux method is attempted first, since it only needs a
  file handle (and a Linux reboot automatically triggers a Xen
  reboot); then, regardless of the outcome, a hypervisor-level
  reboot is requested through the external "xm" utility. The Xen
  reboot is not tried first because it launches an external command
  that talks to the Xen hypervisor, which won't work if the root
  filesystem is broken and/or the xend daemon is not running.

  """
  try:
    cls.LinuxPowercycle()
  finally:
    utils.RunCmd(["xm", "debug", "R"])
class XenPvmHypervisor(XenHypervisor):
"""Xen PVM hypervisor interface"""
......
......@@ -56,6 +56,7 @@ class Processor(object):
opcodes.OpQueryNodeVolumes: cmdlib.LUQueryNodeVolumes,
opcodes.OpRemoveNode: cmdlib.LURemoveNode,
opcodes.OpSetNodeParams: cmdlib.LUSetNodeParams,
opcodes.OpPowercycleNode: cmdlib.LUPowercycleNode,
# instance lu
opcodes.OpCreateInstance: cmdlib.LUCreateInstance,
opcodes.OpReinstallInstance: cmdlib.LUReinstallInstance,
......
......@@ -340,6 +340,16 @@ class OpSetNodeParams(OpCode):
"drained",
]
class OpPowercycleNode(OpCode):
  """Tries to powercycle a node.

  """
  OP_ID = "OP_NODE_POWERCYCLE"
  OP_DSC_FIELD = "node_name"
  # node_name: the node to powercycle; force: required when the
  # target node is the master
  __slots__ = ["node_name", "force"]
# instance opcodes
class OpCreateInstance(OpCode):
......
......@@ -1030,6 +1030,16 @@ class RpcRunner(object):
"""
return self._SingleNodeCall(node, "node_demote_from_mc", [])
def call_node_powercycle(self, node, hypervisor):
  """Tries to powercycle a node.

  This is a single-node call.

  @param node: the node to contact
  @param hypervisor: the hypervisor type the node should use for
      the powercycle

  """
  call_args = [hypervisor]
  return self._SingleNodeCall(node, "node_powercycle", call_args)
def call_test_delay(self, node_list, duration):
"""Sleep for a fixed time on given node(s).
......
......@@ -684,6 +684,36 @@ node1.example.com /dev/hdc1 xenvg instance1.example.com-sda_11001.data 256 inst
</para>
</refsect2>
<refsect2>
<title>POWERCYCLE</title>
<cmdsynopsis>
<command>powercycle</command>
<arg><option>--yes</option></arg>
<arg><option>--force</option></arg>
<arg choice="req"><replaceable>node</replaceable></arg>
</cmdsynopsis>
<para>
This command (tries to) forcefully reboot a node. It is a
command that can be used if the node environment is broken,
such that the admin can no longer log in over ssh, but the
ganeti node daemon is still working.
</para>
<para>
Note that this command is not guaranteed to work; how
effective the reboot attempt is depends on the hypervisor. For
Linux, this command requires that the kernel option
<literal>CONFIG_MAGIC_SYSRQ</literal> is enabled.
</para>
<para>
The <option>--yes</option> option can be used to skip
confirmation, while the <option>--force</option> option is
needed if the target node is the master node.
</para>
</refsect1>
&footer;
......
......@@ -391,6 +391,28 @@ def RemoveNode(opts, args):
return 0
def PowercycleNode(opts, args):
  """Forcibly powercycle (hard-reboot) a node.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the name of
      the node to be powercycled
  @rtype: int
  @return: the desired exit code

  """
  node = args[0]
  # ask for confirmation unless the skip-confirmation option was given
  if (not opts.confirm and
      not AskUser("Are you sure you want to hard powercycle node %s?" % node)):
    return 2
  op = opcodes.OpPowercycleNode(node_name=node, force=opts.force)
  result = SubmitOpCode(op)
  # the node daemon replies with a human-readable message
  # ("Reboot scheduled in 5 seconds"); show it to the user on stderr
  ToStderr(result)
  return 0
def ListVolumes(opts, args):
"""List logical volumes on node(s).
......@@ -552,6 +574,8 @@ commands = {
help="Set the drained flag on the node"),
],
"<instance>", "Alters the parameters of an instance"),
'powercycle': (PowercycleNode, ARGS_ONE, [DEBUG_OPT, FORCE_OPT, CONFIRM_OPT],
"<node_name>", "Tries to forcefully powercycle a node"),
'remove': (RemoveNode, ARGS_ONE, [DEBUG_OPT],
"<node_name>", "Removes a node from the cluster"),
'volumes': (ListVolumes, ARGS_ANY,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment