From b44bd844c54573e7801fc1ef6b4f8007f193555d Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <hansmi@google.com>
Date: Wed, 20 Oct 2010 14:51:53 +0200
Subject: [PATCH] Add option to ignore offline node on instance start/stop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In some cases it can be useful to mark an instance as started or
stopped while its primary node is offline. With this patch, a new
option, “--ignore-offline”, is introduced to “gnt-instance startup”
and “gnt-instance shutdown”.
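
For example, to mark an instance as stopped or started while its
primary node is offline (the instance name is illustrative):

  gnt-instance shutdown --ignore-offline inst1.example.com
  gnt-instance startup --ignore-offline inst1.example.com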

Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
---
 lib/cli.py            |  6 ++++
 lib/cmdlib.py         | 84 +++++++++++++++++++++++++++++--------------
 lib/opcodes.py        |  6 ++--
 man/gnt-instance.sgml | 12 ++++++-
 scripts/gnt-instance  | 10 +++---
 5 files changed, 84 insertions(+), 34 deletions(-)
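
Note (not part of the commit message): the new "ignore_offline_nodes"
field is an ordinary opcode parameter, so the same behaviour is
available when building opcodes directly. A minimal sketch, assuming
the lib/ tree is importable as the "ganeti" package; the instance name
is illustrative:

  from ganeti import constants, opcodes

  # Build a shutdown opcode that, when the primary node is offline, only
  # marks the instance as stopped instead of failing the prereq check.
  op = opcodes.OpShutdownInstance(instance_name="inst1.example.com",
                                  timeout=constants.DEFAULT_SHUTDOWN_TIMEOUT,
                                  ignore_offline_nodes=True)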

diff --git a/lib/cli.py b/lib/cli.py
index 1db1d4551..b4f36d779 100644
--- a/lib/cli.py
+++ b/lib/cli.py
@@ -83,6 +83,7 @@ __all__ = [
   "IDENTIFY_DEFAULTS_OPT",
   "IGNORE_CONSIST_OPT",
   "IGNORE_FAILURES_OPT",
+  "IGNORE_OFFLINE_OPT",
   "IGNORE_REMOVE_FAILURES_OPT",
   "IGNORE_SECONDARIES_OPT",
   "IGNORE_SIZE_OPT",
@@ -585,6 +586,11 @@ FORCE_OPT = cli_option("-f", "--force", dest="force", action="store_true",
 CONFIRM_OPT = cli_option("--yes", dest="confirm", action="store_true",
                          default=False, help="Do not require confirmation")
 
+IGNORE_OFFLINE_OPT = cli_option("--ignore-offline", dest="ignore_offline",
+                                  action="store_true", default=False,
+                                  help=("Ignore offline nodes and do as much"
+                                        " as possible"))
+
 TAG_SRC_OPT = cli_option("--from", dest="tags_source",
                          default=None, help="File with tag names")
 
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index 95f753843..b47e764cb 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -73,6 +73,8 @@ _PForce = ("force", False, ht.TBool)
 #: a required instance name (for single-instance LUs)
 _PInstanceName = ("instance_name", ht.NoDefault, ht.TNonEmptyString)
 
+#: Whether to ignore offline nodes
+_PIgnoreOfflineNodes = ("ignore_offline_nodes", False, ht.TBool)
 
 #: a required node name (for single-node LUs)
 _PNodeName = ("node_name", ht.NoDefault, ht.TNonEmptyString)
@@ -4413,6 +4415,7 @@ class LUStartupInstance(LogicalUnit):
   _OP_PARAMS = [
     _PInstanceName,
     _PForce,
+    _PIgnoreOfflineNodes,
     ("hvparams", ht.EmptyDict, ht.TDict),
     ("beparams", ht.EmptyDict, ht.TDict),
     ]
@@ -4461,21 +4464,30 @@ class LUStartupInstance(LogicalUnit):
       hv_type.CheckParameterSyntax(filled_hvp)
       _CheckHVParams(self, instance.all_nodes, instance.hypervisor, filled_hvp)
 
-    _CheckNodeOnline(self, instance.primary_node)
+    self.primary_offline = self.cfg.GetNodeInfo(instance.primary_node).offline
 
-    bep = self.cfg.GetClusterInfo().FillBE(instance)
-    # check bridges existence
-    _CheckInstanceBridgesExist(self, instance)
+    if self.primary_offline and self.op.ignore_offline_nodes:
+      self.proc.LogWarning("Ignoring offline primary node")
+
+      if self.op.hvparams or self.op.beparams:
+        self.proc.LogWarning("Overridden parameters are ignored")
+    else:
+      _CheckNodeOnline(self, instance.primary_node)
+
+      bep = self.cfg.GetClusterInfo().FillBE(instance)
 
-    remote_info = self.rpc.call_instance_info(instance.primary_node,
-                                              instance.name,
-                                              instance.hypervisor)
-    remote_info.Raise("Error checking node %s" % instance.primary_node,
-                      prereq=True, ecode=errors.ECODE_ENVIRON)
-    if not remote_info.payload: # not running already
-      _CheckNodeFreeMemory(self, instance.primary_node,
-                           "starting instance %s" % instance.name,
-                           bep[constants.BE_MEMORY], instance.hypervisor)
+      # check bridges existence
+      _CheckInstanceBridgesExist(self, instance)
+
+      remote_info = self.rpc.call_instance_info(instance.primary_node,
+                                                instance.name,
+                                                instance.hypervisor)
+      remote_info.Raise("Error checking node %s" % instance.primary_node,
+                        prereq=True, ecode=errors.ECODE_ENVIRON)
+      if not remote_info.payload: # not running already
+        _CheckNodeFreeMemory(self, instance.primary_node,
+                             "starting instance %s" % instance.name,
+                             bep[constants.BE_MEMORY], instance.hypervisor)
 
   def Exec(self, feedback_fn):
     """Start the instance.
@@ -4486,16 +4498,20 @@ class LUStartupInstance(LogicalUnit):
 
     self.cfg.MarkInstanceUp(instance.name)
 
-    node_current = instance.primary_node
+    if self.primary_offline:
+      assert self.op.ignore_offline_nodes
+      self.proc.LogInfo("Primary node offline, marked instance as started")
+    else:
+      node_current = instance.primary_node
 
-    _StartInstanceDisks(self, instance, force)
+      _StartInstanceDisks(self, instance, force)
 
-    result = self.rpc.call_instance_start(node_current, instance,
-                                          self.op.hvparams, self.op.beparams)
-    msg = result.fail_msg
-    if msg:
-      _ShutdownInstanceDisks(self, instance)
-      raise errors.OpExecError("Could not start instance: %s" % msg)
+      result = self.rpc.call_instance_start(node_current, instance,
+                                            self.op.hvparams, self.op.beparams)
+      msg = result.fail_msg
+      if msg:
+        _ShutdownInstanceDisks(self, instance)
+        raise errors.OpExecError("Could not start instance: %s" % msg)
 
 
 class LURebootInstance(LogicalUnit):
@@ -4587,6 +4603,7 @@ class LUShutdownInstance(LogicalUnit):
   HTYPE = constants.HTYPE_INSTANCE
   _OP_PARAMS = [
     _PInstanceName,
+    _PIgnoreOfflineNodes,
     ("timeout", constants.DEFAULT_SHUTDOWN_TIMEOUT, ht.TPositiveInt),
     ]
   REQ_BGL = False
@@ -4614,7 +4631,14 @@ class LUShutdownInstance(LogicalUnit):
     self.instance = self.cfg.GetInstanceInfo(self.op.instance_name)
     assert self.instance is not None, \
       "Cannot retrieve locked instance %s" % self.op.instance_name
-    _CheckNodeOnline(self, self.instance.primary_node)
+
+    self.primary_offline = \
+      self.cfg.GetNodeInfo(self.instance.primary_node).offline
+
+    if self.primary_offline and self.op.ignore_offline_nodes:
+      self.proc.LogWarning("Ignoring offline primary node")
+    else:
+      _CheckNodeOnline(self, self.instance.primary_node)
 
   def Exec(self, feedback_fn):
     """Shutdown the instance.
@@ -4623,13 +4647,19 @@ class LUShutdownInstance(LogicalUnit):
     instance = self.instance
     node_current = instance.primary_node
     timeout = self.op.timeout
+
     self.cfg.MarkInstanceDown(instance.name)
-    result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
-    msg = result.fail_msg
-    if msg:
-      self.proc.LogWarning("Could not shutdown instance: %s" % msg)
 
-    _ShutdownInstanceDisks(self, instance)
+    if self.primary_offline:
+      assert self.op.ignore_offline_nodes
+      self.proc.LogInfo("Primary node offline, marked instance as stopped")
+    else:
+      result = self.rpc.call_instance_shutdown(node_current, instance, timeout)
+      msg = result.fail_msg
+      if msg:
+        self.proc.LogWarning("Could not shutdown instance: %s" % msg)
+
+      _ShutdownInstanceDisks(self, instance)
 
 
 class LUReinstallInstance(LogicalUnit):
diff --git a/lib/opcodes.py b/lib/opcodes.py
index f9c002ad0..48677063c 100644
--- a/lib/opcodes.py
+++ b/lib/opcodes.py
@@ -519,7 +519,7 @@ class OpStartupInstance(OpCode):
   OP_ID = "OP_INSTANCE_STARTUP"
   OP_DSC_FIELD = "instance_name"
   __slots__ = [
-    "instance_name", "force", "hvparams", "beparams",
+    "instance_name", "force", "hvparams", "beparams", "ignore_offline_nodes",
     ]
 
 
@@ -527,7 +527,9 @@ class OpShutdownInstance(OpCode):
   """Shutdown an instance."""
   OP_ID = "OP_INSTANCE_SHUTDOWN"
   OP_DSC_FIELD = "instance_name"
-  __slots__ = ["instance_name", "timeout"]
+  __slots__ = [
+    "instance_name", "timeout", "ignore_offline_nodes",
+    ]
 
 
 class OpRebootInstance(OpCode):
diff --git a/man/gnt-instance.sgml b/man/gnt-instance.sgml
index bddd8d24a..5a4c2e6f6 100644
--- a/man/gnt-instance.sgml
+++ b/man/gnt-instance.sgml
@@ -1741,6 +1741,7 @@ instance5: 11225
           <command>startup</command>
           <sbr>
           <arg>--force</arg>
+          <arg>--ignore-offline</arg>
           <sbr>
           <arg>--force-multiple</arg>
           <sbr>
@@ -1848,7 +1849,9 @@ instance5: 11225
 
         <para>
           Use <option>--force</option> to start even if secondary disks are
-          failing.
+          failing. <option>--ignore-offline</option> can be used to ignore
+          offline primary nodes and mark the instance as started even if
+          the primary is not available.
         </para>
 
         <para>
@@ -1904,6 +1907,7 @@ instance5: 11225
           <arg>--timeout=<replaceable>N</replaceable></arg>
           <sbr>
           <arg>--force-multiple</arg>
+          <arg>--ignore-offline</arg>
           <sbr>
           <group choice="opt">
             <arg>--instance</arg>
@@ -1954,6 +1958,12 @@ instance5: 11225
           <command>gnt-job info</command>.
         </para>
 
+        <para>
+          <option>--ignore-offline</option> can be used to ignore offline
+          primary nodes and force the instance to be marked as stopped. This
+          option should be used with care as it can lead to an
+          inconsistent cluster state.
+        </para>
 
         <para>
           Example:
diff --git a/scripts/gnt-instance b/scripts/gnt-instance
index 1c62463df..bb236e888 100755
--- a/scripts/gnt-instance
+++ b/scripts/gnt-instance
@@ -744,7 +744,8 @@ def _StartupInstance(name, opts):
 
   """
   op = opcodes.OpStartupInstance(instance_name=name,
-                                 force=opts.force)
+                                 force=opts.force,
+                                 ignore_offline_nodes=opts.ignore_offline)
   # do not add these parameters to the opcode unless they're defined
   if opts.hvparams:
     op.hvparams = opts.hvparams
@@ -782,7 +783,8 @@ def _ShutdownInstance(name, opts):
 
   """
   return opcodes.OpShutdownInstance(instance_name=name,
-                                    timeout=opts.timeout)
+                                    timeout=opts.timeout,
+                                    ignore_offline_nodes=opts.ignore_offline)
 
 
 def ReplaceDisks(opts, args):
@@ -1487,14 +1489,14 @@ commands = {
     [m_node_opt, m_pri_node_opt, m_sec_node_opt, m_clust_opt,
      m_node_tags_opt, m_pri_node_tags_opt, m_sec_node_tags_opt,
      m_inst_tags_opt, m_inst_opt, m_force_multi, TIMEOUT_OPT, SUBMIT_OPT,
-     DRY_RUN_OPT, PRIORITY_OPT],
+     DRY_RUN_OPT, PRIORITY_OPT, IGNORE_OFFLINE_OPT],
     "<instance>", "Stops an instance"),
   'startup': (
     GenericManyOps("startup", _StartupInstance), [ArgInstance()],
     [FORCE_OPT, m_force_multi, m_node_opt, m_pri_node_opt, m_sec_node_opt,
      m_node_tags_opt, m_pri_node_tags_opt, m_sec_node_tags_opt,
      m_inst_tags_opt, m_clust_opt, m_inst_opt, SUBMIT_OPT, HVOPTS_OPT,
-     BACKEND_OPT, DRY_RUN_OPT, PRIORITY_OPT],
+     BACKEND_OPT, DRY_RUN_OPT, PRIORITY_OPT, IGNORE_OFFLINE_OPT],
     "<instance>", "Starts an instance"),
   'reboot': (
     GenericManyOps("reboot", _RebootInstance), [ArgInstance()],
-- 
GitLab