Commit b44bd844 authored by Michael Hanselmann

Add option to ignore offline node on instance start/stop



In some cases it can be useful to mark an instance as started
or stopped while its primary node is offline. With this patch,
a new option, “--ignore-offline”, is introduced to “gnt-instance
start” and “… stop”.
Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
parent 691c81b7
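
For orientation, a minimal usage sketch of the new flag, assuming a hypothetical instance "instance1" whose primary node has been marked offline (the names are illustrative, not part of the patch):

    # Mark the instance as stopped even though its primary node is offline
    $ gnt-instance shutdown --ignore-offline instance1

    # Later, record it as started again without contacting the offline node
    $ gnt-instance startup --ignore-offline instance1

As the diff below shows, when the primary node is offline the logical units only update the instance's recorded state (MarkInstanceDown/MarkInstanceUp) and skip the shutdown/start RPCs, so the flag changes bookkeeping only and, as the man page notes, should be used with care.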
@@ -83,6 +83,7 @@ __all__ = [
   "IDENTIFY_DEFAULTS_OPT",
   "IGNORE_CONSIST_OPT",
   "IGNORE_FAILURES_OPT",
+  "IGNORE_OFFLINE_OPT",
   "IGNORE_REMOVE_FAILURES_OPT",
   "IGNORE_SECONDARIES_OPT",
   "IGNORE_SIZE_OPT",
@@ -585,6 +586,11 @@ FORCE_OPT = cli_option("-f", "--force", dest="force", action="store_true",
 CONFIRM_OPT = cli_option("--yes", dest="confirm", action="store_true",
                          default=False, help="Do not require confirmation")
+IGNORE_OFFLINE_OPT = cli_option("--ignore-offline", dest="ignore_offline",
+                                action="store_true", default=False,
+                                help=("Ignore offline nodes and do as much"
+                                      " as possible"))
 TAG_SRC_OPT = cli_option("--from", dest="tags_source",
                          default=None, help="File with tag names")
@@ -73,6 +73,8 @@ _PForce = ("force", False, ht.TBool)
 #: a required instance name (for single-instance LUs)
 _PInstanceName = ("instance_name", ht.NoDefault, ht.TNonEmptyString)
+#: Whether to ignore offline nodes
+_PIgnoreOfflineNodes = ("ignore_offline_nodes", False, ht.TBool)
 #: a required node name (for single-node LUs)
 _PNodeName = ("node_name", ht.NoDefault, ht.TNonEmptyString)
@@ -4413,6 +4415,7 @@ class LUStartupInstance(LogicalUnit):
   _OP_PARAMS = [
     _PInstanceName,
     _PForce,
+    _PIgnoreOfflineNodes,
     ("hvparams", ht.EmptyDict, ht.TDict),
     ("beparams", ht.EmptyDict, ht.TDict),
     ]
@@ -4461,21 +4464,30 @@
       hv_type.CheckParameterSyntax(filled_hvp)
       _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
-    _CheckNodeOnline(self, instance.primary_node)
+    self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
-    bep = self.cfg.GetClusterInfo().FillBE(instance)
-    # check bridges existence
-    _CheckInstanceBridgesExist(self, instance)
+    if self.primary_offline and self.op.ignore_offline_nodes:
+      self.proc.LogWarning("Ignoring offline primary node")
+      if self.op.hvparams or self.op.beparams:
+        self.proc.LogWarning("Overridden parameters are ignored")
+    else:
+      _CheckNodeOnline(self, instance.primary_node)
+      bep = self.cfg.GetClusterInfo().FillBE(instance)
-    remote_info = self.rpc.call_instance_info(instance.primary_node,
-                                              instance.name,
-                                              instance.hypervisor)
-    remote_info.Raise("Error checking node %s" % instance.primary_node,
-                      prereq=True, ecode=errors.ECODE_ENVIRON)
-    if not remote_info.payload: # not running already
-      _CheckNodeFreeMemory(self, instance.primary_node,
-                           "starting instance %s" % instance.name,
-                           bep[constants.BE_MEMORY], instance.hypervisor)
+      # check bridges existence
+      _CheckInstanceBridgesExist(self, instance)
+      remote_info = self.rpc.call_instance_info(instance.primary_node,
+                                                instance.name,
+                                                instance.hypervisor)
+      remote_info.Raise("Error checking node %s" % instance.primary_node,
+                        prereq=True, ecode=errors.ECODE_ENVIRON)
+      if not remote_info.payload: # not running already
+        _CheckNodeFreeMemory(self, instance.primary_node,
+                             "starting instance %s" % instance.name,
+                             bep[constants.BE_MEMORY], instance.hypervisor)

   def Exec(self, feedback_fn):
     """Start the instance.
@@ -4486,16 +4498,20 @@ class LUStartupInstance(LogicalUnit):
     self.cfg.MarkInstanceUp(instance.name)
-    node_current = instance.primary_node
+    if self.primary_offline:
+      assert self.op.ignore_offline_nodes
+      self.proc.LogInfo("Primary node offline, marked instance as started")
+    else:
+      node_current = instance.primary_node
-    _StartInstanceDisks(self, instance, force)
+      _StartInstanceDisks(self, instance, force)
-    result = self.rpc.call_instance_start(node_current, instance,
-                                          self.op.hvparams, self.op.beparams)
-    msg = result.fail_msg
-    if msg:
-      _ShutdownInstanceDisks(self, instance)
-      raise errors.OpExecError("Could not start instance: %s" % msg)
+      result = self.rpc.call_instance_start(node_current, instance,
+                                            self.op.hvparams, self.op.beparams)
+      msg = result.fail_msg
+      if msg:
+        _ShutdownInstanceDisks(self, instance)
+        raise errors.OpExecError("Could not start instance: %s" % msg)

 class LURebootInstance(LogicalUnit):
@@ -4587,6 +4603,7 @@ class LUShutdownInstance(LogicalUnit):
   HTYPE = constants.HTYPE_INSTANCE
   _OP_PARAMS = [
     _PInstanceName,
+    _PIgnoreOfflineNodes,
     ("timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT, ht.TPositiveInt),
     ]
   REQ_BGL = False
@@ -4614,7 +4631,14 @@ class LUShutdownInstance(LogicalUnit):
     self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
     assert self.instance is not None, \
       "Cannot retrieve locked instance %s" % self.op.instance_name
-    _CheckNodeOnline(self, self.instance.primary_node)
+    self.primary_offline = \
+      self.cfg.GetNodeInfo(self.instance.primary_node).offline
+    if self.primary_offline and self.op.ignore_offline_nodes:
+      self.proc.LogWarning("Ignoring offline primary node")
+    else:
+      _CheckNodeOnline(self, self.instance.primary_node)

   def Exec(self, feedback_fn):
     """Shutdown the instance.
@@ -4623,13 +4647,19 @@ class LUShutdownInstance(LogicalUnit):
     instance = self.instance
     node_current = instance.primary_node
     timeout = self.op.timeout
     self.cfg.MarkInstanceDown(instance.name)
-    result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
-    msg = result.fail_msg
-    if msg:
-      self.proc.LogWarning("Could not shutdown instance: %s" % msg)
-    _ShutdownInstanceDisks(self, instance)
+    if self.primary_offline:
+      assert self.op.ignore_offline_nodes
+      self.proc.LogInfo("Primary node offline, marked instance as stopped")
+    else:
+      result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
+      msg = result.fail_msg
+      if msg:
+        self.proc.LogWarning("Could not shutdown instance: %s" % msg)
+      _ShutdownInstanceDisks(self, instance)

 class LUReinstallInstance(LogicalUnit):
@@ -519,7 +519,7 @@ class OpStartupInstance(OpCode):
   OP_ID = "OP_INSTANCE_STARTUP"
   OP_DSC_FIELD = "instance_name"
   __slots__ = [
-    "instance_name", "force", "hvparams", "beparams",
+    "instance_name", "force", "hvparams", "beparams", "ignore_offline_nodes",
     ]
@@ -527,7 +527,9 @@ class OpShutdownInstance(OpCode):
   """Shutdown an instance."""
   OP_ID = "OP_INSTANCE_SHUTDOWN"
   OP_DSC_FIELD = "instance_name"
-  __slots__ = ["instance_name", "timeout"]
+  __slots__ = [
+    "instance_name", "timeout", "ignore_offline_nodes",
+    ]

 class OpRebootInstance(OpCode):
@@ -1741,6 +1741,7 @@ instance5: 11225
         <command>startup</command>
         <sbr>
         <arg>--force</arg>
+        <arg>--ignore-offline</arg>
         <sbr>
         <arg>--force-multiple</arg>
         <sbr>
@@ -1848,7 +1849,9 @@ instance5: 11225
       <para>
         Use <option>--force</option> to start even if secondary disks are
-        failing.
+        failing. <option>--ignore-offline</option> can be used to ignore
+        offline primary nodes and mark the instance as started even if
+        the primary is not available.
       </para>

       <para>
@@ -1904,6 +1907,7 @@ instance5: 11225
         <arg>--timeout=<replaceable>N</replaceable></arg>
         <sbr>
         <arg>--force-multiple</arg>
+        <arg>--ignore-offline</arg>
         <sbr>
         <group choice="opt">
           <arg>--instance</arg>
@@ -1954,6 +1958,12 @@ instance5: 11225
         <command>gnt-job info</command>.
       </para>

+      <para>
+        <option>--ignore-offline</option> can be used to ignore offline
+        primary nodes and force the instance to be marked as stopped. This
+        option should be used with care as it can lead to an
+        inconsistent cluster state.
+      </para>

       <para>
         Example:
@@ -744,7 +744,8 @@ def _StartupInstance(name, opts):
   """
   op = opcodes.OpStartupInstance(instance_name=name,
-                                 force=opts.force)
+                                 force=opts.force,
+                                 ignore_offline_nodes=opts.ignore_offline)
   # do not add these parameters to the opcode unless they're defined
   if opts.hvparams:
     op.hvparams = opts.hvparams
@@ -782,7 +783,8 @@ def _ShutdownInstance(name, opts):
   """
   return opcodes.OpShutdownInstance(instance_name=name,
-                                    timeout=opts.timeout)
+                                    timeout=opts.timeout,
+                                    ignore_offline_nodes=opts.ignore_offline)

 def ReplaceDisks(opts, args):
@@ -1487,14 +1489,14 @@ commands = {
     [m_node_opt, m_pri_node_opt, m_sec_node_opt, m_clust_opt,
      m_node_tags_opt, m_pri_node_tags_opt, m_sec_node_tags_opt,
      m_inst_tags_opt, m_inst_opt, m_force_multi, TIMEOUT_OPT, SUBMIT_OPT,
-     DRY_RUN_OPT, PRIORITY_OPT],
+     DRY_RUN_OPT, PRIORITY_OPT, IGNORE_OFFLINE_OPT],
     "<instance>", "Stops an instance"),
   'startup': (
     GenericManyOps("startup", _StartupInstance), [ArgInstance()],
     [FORCE_OPT, m_force_multi, m_node_opt, m_pri_node_opt, m_sec_node_opt,
      m_node_tags_opt, m_pri_node_tags_opt, m_sec_node_tags_opt,
      m_inst_tags_opt, m_clust_opt, m_inst_opt, SUBMIT_OPT, HVOPTS_OPT,
-     BACKEND_OPT, DRY_RUN_OPT, PRIORITY_OPT],
+     BACKEND_OPT, DRY_RUN_OPT, PRIORITY_OPT, IGNORE_OFFLINE_OPT],
     "<instance>", "Starts an instance"),
   'reboot': (
     GenericManyOps("reboot", _RebootInstance), [ArgInstance()],