From b44bd844c54573e7801fc1ef6b4f8007f193555d Mon Sep 17 00:00:00 2001 From: Michael Hanselmann <hansmi@google.com> Date: Wed, 20 Oct 2010 14:51:53 +0200 Subject: [PATCH] Add option to ignore offline node on instance start/stop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases it can be useful to mark an instance as started or stopped while its primary node is offline. With this patch, a new option, “--ignore-offline”, is introduced to “gnt-instance start” and “… stop”. Signed-off-by: Michael Hanselmann <hansmi@google.com> Reviewed-by: Iustin Pop <iustin@google.com> --- lib/cli.py | 6 ++++ lib/cmdlib.py | 84 +++++++++++++++++++++++++++++-------------- lib/opcodes.py | 6 ++-- man/gnt-instance.sgml | 12 ++++++- scripts/gnt-instance | 10 +++--- 5 files changed, 84 insertions(+), 34 deletions(-) diff --git a/lib/cli.py b/lib/cli.py index 1db1d4551..b4f36d779 100644 --- a/lib/cli.py +++ b/lib/cli.py @@ -83,6 +83,7 @@ __all__ = [ "IDENTIFY_DEFAULTS_OPT", "IGNORE_CONSIST_OPT", "IGNORE_FAILURES_OPT", + "IGNORE_OFFLINE_OPT", "IGNORE_REMOVE_FAILURES_OPT", "IGNORE_SECONDARIES_OPT", "IGNORE_SIZE_OPT", @@ -585,6 +586,11 @@ FORCE_OPT = cli_option("-f", "--force", dest="force", action="store_true", CONFIRM_OPT = cli_option("--yes", dest="confirm", action="store_true", default=False, help="Do not require confirmation") +IGNORE_OFFLINE_OPT = cli_option("--ignore-offline", dest="ignore_offline", + action="store_true", default=False, + help=("Ignore offline nodes and do as much" + " as possible")) + TAG_SRC_OPT = cli_option("--from", dest="tags_source", default=None, help="File with tag names") diff --git a/lib/cmdlib.py b/lib/cmdlib.py index 95f753843..b47e764cb 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -73,6 +73,8 @@ _PForce = ("force", False, ht.TBool) #: a required instance name (for single-instance LUs) _PInstanceName = ("instance_name", ht.NoDefault, ht.TNonEmptyString) +#: Whether to ignore offline nodes 
+_PIgnoreOfflineNodes = ("ignore_offline_nodes", False, ht.TBool) #: a required node name (for single-node LUs) _PNodeName = ("node_name", ht.NoDefault, ht.TNonEmptyString) @@ -4413,6 +4415,7 @@ class LUStartupInstance(LogicalUnit): _OP_PARAMS = [ _PInstanceName, _PForce, + _PIgnoreOfflineNodes, ("hvparams", ht.EmptyDict, ht.TDict), ("beparams", ht.EmptyDict, ht.TDict), ] @@ -4461,21 +4464,30 @@ class LUStartupInstance(LogicalUnit): hv_type.CheckParameterSyntax(filled_hvp) _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp) - _CheckNodeOnline(self, instance.primary_node) + self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline - bep = self.cfg.GetClusterInfo().FillBE(instance) - # check bridges existence - _CheckInstanceBridgesExist(self, instance) + if self.primary_offline and self.op.ignore_offline_nodes: + self.proc.LogWarning("Ignoring offline primary node") + + if self.op.hvparams or self.op.beparams: + self.proc.LogWarning("Overridden parameters are ignored") + else: + _CheckNodeOnline(self, instance.primary_node) + + bep = self.cfg.GetClusterInfo().FillBE(instance) - remote_info = self.rpc.call_instance_info(instance.primary_node, - instance.name, - instance.hypervisor) - remote_info.Raise("Error checking node %s" % instance.primary_node, - prereq=True, ecode=errors.ECODE_ENVIRON) - if not remote_info.payload: # not running already - _CheckNodeFreeMemory(self, instance.primary_node, - "starting instance %s" % instance.name, - bep[constants.BE_MEMORY], instance.hypervisor) + # check bridges existence + _CheckInstanceBridgesExist(self, instance) + + remote_info = self.rpc.call_instance_info(instance.primary_node, + instance.name, + instance.hypervisor) + remote_info.Raise("Error checking node %s" % instance.primary_node, + prereq=True, ecode=errors.ECODE_ENVIRON) + if not remote_info.payload: # not running already + _CheckNodeFreeMemory(self, instance.primary_node, + "starting instance %s" % instance.name, + 
bep[constants.BE_MEMORY], instance.hypervisor) def Exec(self, feedback_fn): """Start the instance. @@ -4486,16 +4498,20 @@ class LUStartupInstance(LogicalUnit): self.cfg.MarkInstanceUp(instance.name) - node_current = instance.primary_node + if self.primary_offline: + assert self.op.ignore_offline_nodes + self.proc.LogInfo("Primary node offline, marked instance as started") + else: + node_current = instance.primary_node - _StartInstanceDisks(self, instance, force) + _StartInstanceDisks(self, instance, force) - result = self.rpc.call_instance_start(node_current, instance, - self.op.hvparams, self.op.beparams) - msg = result.fail_msg - if msg: - _ShutdownInstanceDisks(self, instance) - raise errors.OpExecError("Could not start instance: %s" % msg) + result = self.rpc.call_instance_start(node_current, instance, + self.op.hvparams, self.op.beparams) + msg = result.fail_msg + if msg: + _ShutdownInstanceDisks(self, instance) + raise errors.OpExecError("Could not start instance: %s" % msg) class LURebootInstance(LogicalUnit): @@ -4587,6 +4603,7 @@ class LUShutdownInstance(LogicalUnit): HTYPE = constants.HTYPE_INSTANCE _OP_PARAMS = [ _PInstanceName, + _PIgnoreOfflineNodes, ("timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT, ht.TPositiveInt), ] REQ_BGL = False @@ -4614,7 +4631,14 @@ class LUShutdownInstance(LogicalUnit): self.instance = self.cfg.GetInstanceInfo(self.op.instance_name) assert self.instance is not None, \ "Cannot retrieve locked instance %s" % self.op.instance_name - _CheckNodeOnline(self, self.instance.primary_node) + + self.primary_offline = \ + self.cfg.GetNodeInfo(self.instance.primary_node).offline + + if self.primary_offline and self.op.ignore_offline_nodes: + self.proc.LogWarning("Ignoring offline primary node") + else: + _CheckNodeOnline(self, self.instance.primary_node) def Exec(self, feedback_fn): """Shutdown the instance. 
@@ -4623,13 +4647,19 @@ class LUShutdownInstance(LogicalUnit): instance = self.instance node_current = instance.primary_node timeout = self.op.timeout + self.cfg.MarkInstanceDown(instance.name) - result = self.rpc.call_instance_shutdown(node_current, instance, timeout) - msg = result.fail_msg - if msg: - self.proc.LogWarning("Could not shutdown instance: %s" % msg) - _ShutdownInstanceDisks(self, instance) + if self.primary_offline: + assert self.op.ignore_offline_nodes + self.proc.LogInfo("Primary node offline, marked instance as stopped") + else: + result = self.rpc.call_instance_shutdown(node_current, instance, timeout) + msg = result.fail_msg + if msg: + self.proc.LogWarning("Could not shutdown instance: %s" % msg) + + _ShutdownInstanceDisks(self, instance) class LUReinstallInstance(LogicalUnit): diff --git a/lib/opcodes.py b/lib/opcodes.py index f9c002ad0..48677063c 100644 --- a/lib/opcodes.py +++ b/lib/opcodes.py @@ -519,7 +519,7 @@ class OpStartupInstance(OpCode): OP_ID = "OP_INSTANCE_STARTUP" OP_DSC_FIELD = "instance_name" __slots__ = [ - "instance_name", "force", "hvparams", "beparams", + "instance_name", "force", "hvparams", "beparams", "ignore_offline_nodes", ] @@ -527,7 +527,9 @@ class OpShutdownInstance(OpCode): """Shutdown an instance.""" OP_ID = "OP_INSTANCE_SHUTDOWN" OP_DSC_FIELD = "instance_name" - __slots__ = ["instance_name", "timeout"] + __slots__ = [ + "instance_name", "timeout", "ignore_offline_nodes", + ] class OpRebootInstance(OpCode): diff --git a/man/gnt-instance.sgml b/man/gnt-instance.sgml index bddd8d24a..5a4c2e6f6 100644 --- a/man/gnt-instance.sgml +++ b/man/gnt-instance.sgml @@ -1741,6 +1741,7 @@ instance5: 11225 <command>startup</command> <sbr> <arg>--force</arg> + <arg>--ignore-offline</arg> <sbr> <arg>--force-multiple</arg> <sbr> @@ -1848,7 +1849,9 @@ instance5: 11225 <para> Use <option>--force</option> to start even if secondary disks are - failing. + failing. 
<option>--ignore-offline</option> can be used to ignore + offline primary nodes and mark the instance as started even if + the primary is not available. </para> <para> @@ -1904,6 +1907,7 @@ instance5: 11225 <arg>--timeout=<replaceable>N</replaceable></arg> <sbr> <arg>--force-multiple</arg> + <arg>--ignore-offline</arg> <sbr> <group choice="opt"> <arg>--instance</arg> @@ -1954,6 +1958,12 @@ instance5: 11225 <command>gnt-job info</command>. </para> + <para> + <option>--ignore-offline</option> can be used to ignore offline + primary nodes and force the instance to be marked as stopped. This + option should be used with care as it can lead to an + inconsistent cluster state. + </para> <para> Example: diff --git a/scripts/gnt-instance b/scripts/gnt-instance index 1c62463df..bb236e888 100755 --- a/scripts/gnt-instance +++ b/scripts/gnt-instance @@ -744,7 +744,8 @@ def _StartupInstance(name, opts): """ op = opcodes.OpStartupInstance(instance_name=name, - force=opts.force) + force=opts.force, + ignore_offline_nodes=opts.ignore_offline) # do not add these parameters to the opcode unless they're defined if opts.hvparams: op.hvparams = opts.hvparams @@ -782,7 +783,8 @@ def _ShutdownInstance(name, opts): """ return opcodes.OpShutdownInstance(instance_name=name, - timeout=opts.timeout) + timeout=opts.timeout, + ignore_offline_nodes=opts.ignore_offline) def ReplaceDisks(opts, args): @@ -1487,14 +1489,14 @@ commands = { [m_node_opt, m_pri_node_opt, m_sec_node_opt, m_clust_opt, m_node_tags_opt, m_pri_node_tags_opt, m_sec_node_tags_opt, m_inst_tags_opt, m_inst_opt, m_force_multi, TIMEOUT_OPT, SUBMIT_OPT, - DRY_RUN_OPT, PRIORITY_OPT], + DRY_RUN_OPT, PRIORITY_OPT, IGNORE_OFFLINE_OPT], "<instance>", "Stops an instance"), 'startup': ( GenericManyOps("startup", _StartupInstance), [ArgInstance()], [FORCE_OPT, m_force_multi, m_node_opt, m_pri_node_opt, m_sec_node_opt, m_node_tags_opt, m_pri_node_tags_opt, m_sec_node_tags_opt, m_inst_tags_opt, m_clust_opt, m_inst_opt, SUBMIT_OPT, 
HVOPTS_OPT, - BACKEND_OPT, DRY_RUN_OPT, PRIORITY_OPT], + BACKEND_OPT, DRY_RUN_OPT, PRIORITY_OPT, IGNORE_OFFLINE_OPT], "<instance>", "Starts an instance"), 'reboot': ( GenericManyOps("reboot", _RebootInstance), [ArgInstance()], -- GitLab