From 7ea7bcf6d21e8ebe6d24f0cc6f7852a04a28e202 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Tue, 9 Feb 2010 09:57:50 +0100
Subject: [PATCH] Add an early release lock/storage for disk replace

This patch adds an early_release parameter to the OpReplaceDisks and
OpEvacuateNode opcodes, allowing earlier release of storage and, more
importantly, of internal Ganeti locks.

With early release, the locks and storage on all secondary nodes are
released early. This is valid for change secondary (where we remove
the storage on the old secondary, and release the locks on the old
and new secondary) and replace on secondary (where we remove the old
storage and release the lock on the secondary node).

Using this, on a three-node setup with:

- instance1 on nodes A:B
- instance2 on nodes C:B

it is possible to run replace-disks -s (replace on secondary) for
instances 1 and 2 in parallel.

Replace on primary will remove the storage, but not the locks, as we
use the primary node later in the LU to check consistency.

It is debatable whether to also remove the locks on the primary node,
and thus make replace-disks hold zero locks during the sync. While
this would allow greatly enhanced parallelism, let's first see how
the removal of secondary locks works out.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>
---
 lib/cli.py            |  7 +++++++
 lib/cmdlib.py         | 48 +++++++++++++++++++++++++++++++++++--------
 lib/opcodes.py        |  3 ++-
 man/gnt-instance.sgml | 17 +++++++++++++++
 man/gnt-node.sgml     | 14 +++++++++++++
 scripts/gnt-instance  |  5 +++--
 tools/burnin          |  7 +++++--
 7 files changed, 87 insertions(+), 14 deletions(-)

diff --git a/lib/cli.py b/lib/cli.py
index d9c2dbb26..523cab7b6 100644
--- a/lib/cli.py
+++ b/lib/cli.py
@@ -56,6 +56,7 @@ __all__ = [
   "DISK_OPT",
   "DISK_TEMPLATE_OPT",
   "DRAINED_OPT",
+  "EARLY_RELEASE_OPT",
   "ENABLED_HV_OPT",
   "ERROR_CODES_OPT",
   "FIELDS_OPT",
@@ -837,6 +838,12 @@ SHUTDOWN_TIMEOUT_OPT = cli_option("--shutdown-timeout",
                           default=constants.DEFAULT_SHUTDOWN_TIMEOUT,
                           help="Maximum time to wait for instance shutdown")
 
+EARLY_RELEASE_OPT = cli_option("--early-release",
+                               dest="early_release", default=False,
+                               action="store_true",
+                               help="Release the locks on the secondary"
+                               " node(s) early")
+
 
 def _ParseArgs(argv, commands, aliases):
   """Parser for the command line arguments.
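
For a caller, the new flag is just one more opcode attribute. A
minimal sketch of how a client would request an early-release replace
of a secondary's disks (mirroring the scripts/gnt-instance and
tools/burnin hunks below; cli.SubmitOpCode is assumed here as the
plain submission helper, while gnt-instance itself uses SubmitOrSend):

    from ganeti import cli
    from ganeti import constants
    from ganeti import opcodes

    # Replace disk 0 of instance1 on its secondary node, i.e. the
    # equivalent of:
    #   gnt-instance replace-disks -s --disks 0 --early-release instance1
    # With early_release=True, the old storage and the secondary's
    # node lock are given up before the resync finishes.
    op = opcodes.OpReplaceDisks(instance_name="instance1",
                                mode=constants.REPLACE_DISK_SEC,
                                disks=[0], remote_node=None,
                                iallocator=None, early_release=True)
    cli.SubmitOpCode(op)

Submitting such an opcode for instance2 (whose secondary is also node
B) at the same time then works as described above: the long resync
phase no longer holds node B's lock, so the two jobs overlap instead
of running back to back.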
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index cb922eb0d..5cf76ad4a 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -6332,6 +6332,8 @@ class LUReplaceDisks(LogicalUnit):
       self.op.remote_node = None
     if not hasattr(self.op, "iallocator"):
       self.op.iallocator = None
+    if not hasattr(self.op, "early_release"):
+      self.op.early_release = False
 
     TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
                                   self.op.iallocator)
@@ -6363,7 +6365,7 @@ class LUReplaceDisks(LogicalUnit):
 
     self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
                                    self.op.iallocator, self.op.remote_node,
-                                   self.op.disks, False)
+                                   self.op.disks, False, self.op.early_release)
 
     self.tasklets = [self.replacer]
 
@@ -6410,6 +6412,8 @@ class LUEvacuateNode(LogicalUnit):
       self.op.remote_node = None
     if not hasattr(self.op, "iallocator"):
       self.op.iallocator = None
+    if not hasattr(self.op, "early_release"):
+      self.op.early_release = False
 
     TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG,
                                   self.op.remote_node,
@@ -6456,7 +6460,7 @@ class LUEvacuateNode(LogicalUnit):
 
       replacer = TLReplaceDisks(self, inst.name, constants.REPLACE_DISK_CHG,
                                 self.op.iallocator, self.op.remote_node, [],
-                                True)
+                                True, self.op.early_release)
       tasklets.append(replacer)
 
     self.tasklets = tasklets
@@ -6498,7 +6502,7 @@ class TLReplaceDisks(Tasklet):
 
   """
   def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
-               disks, delay_iallocator):
+               disks, delay_iallocator, early_release):
     """Initializes this class.
 
     """
@@ -6511,6 +6515,7 @@ class TLReplaceDisks(Tasklet):
     self.remote_node = remote_node
     self.disks = disks
     self.delay_iallocator = delay_iallocator
+    self.early_release = early_release
 
     # Runtime data
     self.instance = None
@@ -6853,6 +6858,10 @@ class TLReplaceDisks(Tasklet):
         self.lu.LogWarning("Can't remove old LV: %s" % msg,
                            hint="remove unused LVs manually")
 
+  def _ReleaseNodeLock(self, node_name):
+    """Releases the lock for a given node."""
+    self.lu.context.glm.release(locking.LEVEL_NODE, node_name)
+
   def _ExecDrbd8DiskOnly(self, feedback_fn):
     """Replace a disk on the primary or secondary for DRBD 8.
 
@@ -6963,18 +6972,31 @@ class TLReplaceDisks(Tasklet):
 
     self.cfg.Update(self.instance, feedback_fn)
 
+    cstep = 5
+    if self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
+      # only release the lock if we're doing secondary replace, since
+      # we use the primary node later in the LU to check consistency
+      if self.target_node != self.instance.primary_node:
+        self._ReleaseNodeLock(self.target_node)
+
     # Wait for sync
     # This can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its return value
-    self.lu.LogStep(5, steps_total, "Sync devices")
+    self.lu.LogStep(cstep, steps_total, "Sync devices")
+    cstep += 1
     _WaitForSync(self.lu, self.instance)
 
     # Check all devices manually
     self._CheckDevices(self.instance.primary_node, iv_names)
 
     # Step: remove old storage
-    self.lu.LogStep(6, steps_total, "Removing old storage")
-    self._RemoveOldStorage(self.target_node, iv_names)
+    if not self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
 
   def _ExecDrbd8Secondary(self, feedback_fn):
     """Replace the secondary node for DRBD 8.
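
The net effect of the hunk above, distilled into a standalone sketch
(the helper names are hypothetical stand-ins for the tasklet methods,
injected as callables so the snippet is self-contained; it is not the
verbatim LU code):

    def ReplaceDiskEpilogue(early_release, target_node, primary_node,
                            steps_total, log_step, remove_storage,
                            release_lock, wait_for_sync, check_devices):
      """Hypothetical distillation of the reordered epilogue above."""
      # Step numbering is now dynamic: step 5 is either "remove old
      # storage" (early release) or "sync devices" (default order).
      cstep = 5
      if early_release:
        log_step(cstep, steps_total, "Removing old storage")
        cstep += 1
        remove_storage(target_node)
        # Release the node lock only for a secondary replace; the
        # primary node is used again below for the device checks.
        if target_node != primary_node:
          release_lock(target_node)
      log_step(cstep, steps_total, "Sync devices")
      cstep += 1
      wait_for_sync()  # may report degraded disks; result not checked
      check_devices(primary_node)
      if not early_release:
        # Default behaviour: remove the old storage only after a
        # successful resync, keeping it around for recovery.
        log_step(cstep, steps_total, "Removing old storage")
        remove_storage(target_node)

The default path is deliberately untouched: without the flag, the
step ordering is exactly as before the patch.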
@@ -7108,19 +7130,27 @@ class TLReplaceDisks(Tasklet):
                                 to_node, msg,
                                 hint=("please do a gnt-instance info to see the"
                                       " status of disks"))
 
+    cstep = 5
+    if self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      cstep += 1
+      self._RemoveOldStorage(self.target_node, iv_names)
+      self._ReleaseNodeLock([self.target_node, self.new_node])
     # Wait for sync
     # This can fail as the old devices are degraded and _WaitForSync
     # does a combined result over all disks, so we don't check its return value
-    self.lu.LogStep(5, steps_total, "Sync devices")
+    self.lu.LogStep(cstep, steps_total, "Sync devices")
+    cstep += 1
     _WaitForSync(self.lu, self.instance)
 
     # Check all devices manually
     self._CheckDevices(self.instance.primary_node, iv_names)
 
     # Step: remove old storage
-    self.lu.LogStep(6, steps_total, "Removing old storage")
-    self._RemoveOldStorage(self.target_node, iv_names)
+    if not self.early_release:
+      self.lu.LogStep(cstep, steps_total, "Removing old storage")
+      self._RemoveOldStorage(self.target_node, iv_names)
 
 
 class LURepairNodeStorage(NoHooksLU):

diff --git a/lib/opcodes.py b/lib/opcodes.py
index 906542e14..3aed41e0c 100644
--- a/lib/opcodes.py
+++ b/lib/opcodes.py
@@ -419,7 +419,7 @@ class OpEvacuateNode(OpCode):
   OP_ID = "OP_NODE_EVACUATE"
   OP_DSC_FIELD = "node_name"
   __slots__ = [
-    "node_name", "remote_node", "iallocator",
+    "node_name", "remote_node", "iallocator", "early_release",
     ]
 
 
@@ -509,6 +509,7 @@ class OpReplaceDisks(OpCode):
   OP_DSC_FIELD = "instance_name"
   __slots__ = [
     "instance_name", "remote_node", "mode", "disks", "iallocator",
+    "early_release",
     ]
 

diff --git a/man/gnt-instance.sgml b/man/gnt-instance.sgml
index 2cbb67ef6..64ec96105 100644
--- a/man/gnt-instance.sgml
+++ b/man/gnt-instance.sgml
@@ -1828,6 +1828,7 @@ instance5: 11225
       <cmdsynopsis>
         <command>replace-disks</command>
         <arg>--submit</arg>
+        <arg>--early-release</arg>
         <arg choice="req">-p</arg>
         <arg>--disks <replaceable>idx</replaceable></arg>
         <arg choice="req"><replaceable>instance</replaceable></arg>
@@ -1836,6 +1837,7 @@ instance5: 11225
       <cmdsynopsis>
         <command>replace-disks</command>
         <arg>--submit</arg>
+        <arg>--early-release</arg>
         <arg choice="req">-s</arg>
         <arg>--disks <replaceable>idx</replaceable></arg>
         <arg choice="req"><replaceable>instance</replaceable></arg>
@@ -1844,6 +1846,7 @@ instance5: 11225
       <cmdsynopsis>
         <command>replace-disks</command>
         <arg>--submit</arg>
+        <arg>--early-release</arg>
         <group choice="req">
           <arg>--iallocator <replaceable>name</replaceable></arg>
           <arg>--new-secondary <replaceable>NODE</replaceable></arg>
@@ -1855,6 +1858,7 @@ instance5: 11225
       <cmdsynopsis>
         <command>replace-disks</command>
         <arg>--submit</arg>
+        <arg>--early-release</arg>
         <arg choice="req">--auto</arg>
         <arg choice="req"><replaceable>instance</replaceable></arg>
       </cmdsynopsis>
@@ -1905,6 +1909,19 @@ instance5: 11225
         <command>gnt-job info</command>.
       </para>
 
+      <para>
+        The <option>--early-release</option> option changes the
+        behaviour so that the old storage on the secondary node(s) is
+        removed early (before the resync is completed) and the
+        internal Ganeti locks for the current (and new, if any)
+        secondary node are also released, thus allowing more
+        parallelism in the cluster operation. This should be used
+        only when recovering from a disk failure on the current
+        secondary (thus the old storage is already broken) or when the
+        storage on the primary node is known to be fine (thus we won't
+        need the old storage for potential recovery).
+      </para>
+
       <para>
         Note that it is not possible to select an offline or drained
         node as a new secondary.

diff --git a/man/gnt-node.sgml b/man/gnt-node.sgml
index b16a4a71c..c1980f23d 100644
--- a/man/gnt-node.sgml
+++ b/man/gnt-node.sgml
@@ -143,6 +143,7 @@
     <cmdsynopsis>
       <command>evacuate</command>
       <arg>-f</arg>
+      <arg>--early-release</arg>
       <group>
         <arg>--iallocator <replaceable>NAME</replaceable></arg>
         <arg>--new-secondary <replaceable>destination_node</replaceable></arg>
@@ -172,6 +173,19 @@
       </itemizedlist>
     </para>
 
+    <para>
+      The <option>--early-release</option> option changes the
+      behaviour so that the old storage on the node being
+      evacuated is removed early (before the resync is completed)
+      and the internal Ganeti locks are also released for both the
+      current secondary and the new secondary, thus allowing more
+      parallelism in the cluster operation. This should be used
+      only when recovering from a disk failure on the current
+      secondary (thus the old storage is already broken) or when
+      the storage on the primary node is known to be fine (thus we
+      won't need the old storage for potential recovery).
+    </para>
+
     <para>
       Example:
       <screen>

diff --git a/scripts/gnt-instance b/scripts/gnt-instance
index 0c02b9e18..51f892a90 100755
--- a/scripts/gnt-instance
+++ b/scripts/gnt-instance
@@ -807,7 +807,8 @@ def ReplaceDisks(opts, args):
   op = opcodes.OpReplaceDisks(instance_name=args[0], disks=disks,
                               remote_node=new_2ndary, mode=mode,
-                              iallocator=iallocator)
+                              iallocator=iallocator,
+                              early_release=opts.early_release)
   SubmitOrSend(op, opts)
   return 0
 
@@ -1400,7 +1401,7 @@ commands = {
     "<instance> <new_name>", "Rename the instance"),
   'replace-disks': (
     ReplaceDisks, ARGS_ONE_INSTANCE,
-    [AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT,
+    [AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT, EARLY_RELEASE_OPT,
      NEW_SECONDARY_OPT, ON_PRIMARY_OPT, ON_SECONDARY_OPT, SUBMIT_OPT],
     "[-s|-p|-n NODE|-I NAME] <instance>",
     "Replaces all disks for the instance"),

diff --git a/tools/burnin b/tools/burnin
index f0e4113a9..8bd428135 100755
--- a/tools/burnin
+++ b/tools/burnin
@@ -120,6 +120,7 @@ OPTIONS = [
   cli.VERBOSE_OPT,
   cli.NOIPCHECK_OPT,
   cli.NONAMECHECK_OPT,
+  cli.EARLY_RELEASE_OPT,
   cli.cli_option("--no-replace1", dest="do_replace1",
                  help="Skip disk replacement with the same secondary",
                  action="store_false", default=True),
@@ -544,7 +545,8 @@ class Burner(object):
     for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
       op = opcodes.OpReplaceDisks(instance_name=instance,
                                   mode=mode,
-                                  disks=[i for i in range(self.disk_count)])
+                                  disks=[i for i in range(self.disk_count)],
+                                  early_release=self.opts.early_release)
       Log("run %s" % mode, indent=2)
       ops.append(op)
     self.ExecOrQueue(instance, *ops) # pylint: disable-msg=W0142
@@ -568,7 +570,8 @@ class Burner(object):
                                 mode=mode,
                                 remote_node=tnode,
                                 iallocator=self.opts.iallocator,
-                                disks=[])
+                                disks=[],
+                                early_release=self.opts.early_release)
       Log("run %s %s" % (mode, msg), indent=2)
       self.ExecOrQueue(instance, op)
-- 
GitLab
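
With the plumbing above in place, node evacuation opts into the same
behaviour through the new opcode slot. A minimal sketch (again
assuming cli.SubmitOpCode as the submission helper, with a
placeholder iallocator name) of the equivalent of gnt-node evacuate
--early-release:

    from ganeti import cli
    from ganeti import opcodes

    # Move all secondary instances off node-B; with early_release set,
    # each per-instance tasklet drops the locks on both the old and
    # the new secondary before its resync completes (the
    # change-secondary path above releases [target_node, new_node]).
    op = opcodes.OpEvacuateNode(node_name="node-B",
                                remote_node=None,
                                iallocator="my-allocator",  # placeholder
                                early_release=True)
    cli.SubmitOpCode(op)

Since the locks are dropped before the long sync phase, evacuations
and replaces touching the same nodes overlap instead of queueing on
each other for the full duration of the resync.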