diff --git a/lib/cli.py b/lib/cli.py index d9c2dbb26ddafdf113b17e2ce84fa88f254ef511..523cab7b6ff38037480bb7cee45c3509e9bce965 100644 --- a/lib/cli.py +++ b/lib/cli.py @@ -56,6 +56,7 @@ __all__ = [ "DISK_OPT", "DISK_TEMPLATE_OPT", "DRAINED_OPT", + "EARLY_RELEASE_OPT", "ENABLED_HV_OPT", "ERROR_CODES_OPT", "FIELDS_OPT", @@ -837,6 +838,12 @@ SHUTDOWN_TIMEOUT_OPT = cli_option("--shutdown-timeout", default=constants.DEFAULT_SHUTDOWN_TIMEOUT, help="Maximum time to wait for instance shutdown") +EARLY_RELEASE_OPT = cli_option("--early-release", + dest="early_release", default=False, + action="store_true", + help="Release the locks on the secondary" + " node(s) early") + def _ParseArgs(argv, commands, aliases): """Parser for the command line arguments. diff --git a/lib/cmdlib.py b/lib/cmdlib.py index cb922eb0dbe87164c2391b334d06dcc349641379..5cf76ad4a6624b4c9517ca000e79cb7f95eb458a 100644 --- a/lib/cmdlib.py +++ b/lib/cmdlib.py @@ -6332,6 +6332,8 @@ class LUReplaceDisks(LogicalUnit): self.op.remote_node = None if not hasattr(self.op, "iallocator"): self.op.iallocator = None + if not hasattr(self.op, "early_release"): + self.op.early_release = False TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node, self.op.iallocator) @@ -6363,7 +6365,7 @@ class LUReplaceDisks(LogicalUnit): self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode, self.op.iallocator, self.op.remote_node, - self.op.disks, False) + self.op.disks, False, self.op.early_release) self.tasklets = [self.replacer] @@ -6410,6 +6412,8 @@ class LUEvacuateNode(LogicalUnit): self.op.remote_node = None if not hasattr(self.op, "iallocator"): self.op.iallocator = None + if not hasattr(self.op, "early_release"): + self.op.early_release = False TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG, self.op.remote_node, @@ -6456,7 +6460,7 @@ class LUEvacuateNode(LogicalUnit): replacer = TLReplaceDisks(self, inst.name, constants.REPLACE_DISK_CHG, self.op.iallocator, self.op.remote_node, [], - 
True) + True, self.op.early_release) tasklets.append(replacer) self.tasklets = tasklets @@ -6498,7 +6502,7 @@ class TLReplaceDisks(Tasklet): """ def __init__(self, lu, instance_name, mode, iallocator_name, remote_node, - disks, delay_iallocator): + disks, delay_iallocator, early_release): """Initializes this class. """ @@ -6511,6 +6515,7 @@ class TLReplaceDisks(Tasklet): self.remote_node = remote_node self.disks = disks self.delay_iallocator = delay_iallocator + self.early_release = early_release # Runtime data self.instance = None @@ -6853,6 +6858,10 @@ class TLReplaceDisks(Tasklet): self.lu.LogWarning("Can't remove old LV: %s" % msg, hint="remove unused LVs manually") + def _ReleaseNodeLock(self, node_name): + """Releases the lock for a given node.""" + self.lu.context.glm.release(locking.LEVEL_NODE, node_name) + def _ExecDrbd8DiskOnly(self, feedback_fn): """Replace a disk on the primary or secondary for DRBD 8. @@ -6963,18 +6972,31 @@ class TLReplaceDisks(Tasklet): self.cfg.Update(self.instance, feedback_fn) + cstep = 5 + if self.early_release: + self.lu.LogStep(cstep, steps_total, "Removing old storage") + cstep += 1 + self._RemoveOldStorage(self.target_node, iv_names) + # only release the lock if we're doing secondary replace, since + # we use the primary node later + if self.target_node != self.instance.primary_node: + self._ReleaseNodeLock(self.target_node) + # Wait for sync # This can fail as the old devices are degraded and _WaitForSync # does a combined result over all disks, so we don't check its return value - self.lu.LogStep(5, steps_total, "Sync devices") + self.lu.LogStep(cstep, steps_total, "Sync devices") + cstep += 1 _WaitForSync(self.lu, self.instance) # Check all devices manually self._CheckDevices(self.instance.primary_node, iv_names) # Step: remove old storage - self.lu.LogStep(6, steps_total, "Removing old storage") - self._RemoveOldStorage(self.target_node, iv_names) + if not self.early_release: + self.lu.LogStep(cstep, steps_total, 
"Removing old storage") + cstep += 1 + self._RemoveOldStorage(self.target_node, iv_names) def _ExecDrbd8Secondary(self, feedback_fn): """Replace the secondary node for DRBD 8. @@ -7108,19 +7130,27 @@ class TLReplaceDisks(Tasklet): to_node, msg, hint=("please do a gnt-instance info to see the" " status of disks")) + cstep = 5 + if self.early_release: + self.lu.LogStep(cstep, steps_total, "Removing old storage") + cstep += 1 + self._RemoveOldStorage(self.target_node, iv_names) + self._ReleaseNodeLock([self.target_node, self.new_node]) # Wait for sync # This can fail as the old devices are degraded and _WaitForSync # does a combined result over all disks, so we don't check its return value - self.lu.LogStep(5, steps_total, "Sync devices") + self.lu.LogStep(cstep, steps_total, "Sync devices") + cstep += 1 _WaitForSync(self.lu, self.instance) # Check all devices manually self._CheckDevices(self.instance.primary_node, iv_names) # Step: remove old storage - self.lu.LogStep(6, steps_total, "Removing old storage") - self._RemoveOldStorage(self.target_node, iv_names) + if not self.early_release: + self.lu.LogStep(cstep, steps_total, "Removing old storage") + self._RemoveOldStorage(self.target_node, iv_names) class LURepairNodeStorage(NoHooksLU): diff --git a/lib/opcodes.py b/lib/opcodes.py index 906542e143ca68ceb5aaae9e9c7fe620510e2e54..3aed41e0cd75e2bb41958bbfd9b37e8f7fc75da8 100644 --- a/lib/opcodes.py +++ b/lib/opcodes.py @@ -419,7 +419,7 @@ class OpEvacuateNode(OpCode): OP_ID = "OP_NODE_EVACUATE" OP_DSC_FIELD = "node_name" __slots__ = [ - "node_name", "remote_node", "iallocator", + "node_name", "remote_node", "iallocator", "early_release", ] @@ -509,6 +509,7 @@ class OpReplaceDisks(OpCode): OP_DSC_FIELD = "instance_name" __slots__ = [ "instance_name", "remote_node", "mode", "disks", "iallocator", + "early_release", ] diff --git a/man/gnt-instance.sgml b/man/gnt-instance.sgml index 2cbb67ef61bd39ed93a0aa9da6d1a7cdcf02c4ea..64ec9610504e75ba98343a9df5f886e0b99e2f16 100644 
--- a/man/gnt-instance.sgml +++ b/man/gnt-instance.sgml @@ -1828,6 +1828,7 @@ instance5: 11225 <cmdsynopsis> <command>replace-disks</command> <arg>--submit</arg> + <arg>--early-release</arg> <arg choice="req">-p</arg> <arg>--disks <replaceable>idx</replaceable></arg> <arg choice="req"><replaceable>instance</replaceable></arg> @@ -1836,6 +1837,7 @@ instance5: 11225 <cmdsynopsis> <command>replace-disks</command> <arg>--submit</arg> + <arg>--early-release</arg> <arg choice="req">-s</arg> <arg>--disks <replaceable>idx</replaceable></arg> <arg choice="req"><replaceable>instance</replaceable></arg> @@ -1844,6 +1846,7 @@ instance5: 11225 <cmdsynopsis> <command>replace-disks</command> <arg>--submit</arg> + <arg>--early-release</arg> <group choice="req"> <arg>--iallocator <replaceable>name</replaceable></arg> <arg>--new-secondary <replaceable>NODE</replaceable></arg> @@ -1855,6 +1858,7 @@ instance5: 11225 <cmdsynopsis> <command>replace-disks</command> <arg>--submit</arg> + <arg>--early-release</arg> <arg choice="req">--auto</arg> <arg choice="req"><replaceable>instance</replaceable></arg> </cmdsynopsis> @@ -1905,6 +1909,19 @@ instance5: 11225 <command>gnt-job info</command>. </para> + <para> + The <option>--early-release</option> changes the code so + that the old storage on secondary node(s) is removed early + (before the resync is completed) and the internal Ganeti + locks for the current (and new, if any) secondary node are + also released, thus allowing more parallelism in the cluster + operation. This should be used only when recovering from a + disk failure on the current secondary (thus the old storage + is already broken) or when the storage on the primary node + is known to be fine (thus we won't need the old storage for + potential recovery). + </para> + <para> Note that it is not possible to select an offline or drained node as a new secondary. 
diff --git a/man/gnt-node.sgml b/man/gnt-node.sgml index b16a4a71c9eb15ad42790924c7cb46efd83820d4..c1980f23d706d1cae307dd5bd2ef5bd423de15f1 100644 --- a/man/gnt-node.sgml +++ b/man/gnt-node.sgml @@ -143,6 +143,7 @@ <cmdsynopsis> <command>evacuate</command> <arg>-f</arg> + <arg>--early-release</arg> <group> <arg>--iallocator <replaceable>NAME</replaceable></arg> <arg>--new-secondary <replaceable>destination_node</replaceable></arg> @@ -172,6 +173,19 @@ </itemizedlist> </para> + <para> + The <option>--early-release</option> changes the code so that + the old storage on the node being evacuated is removed early + (before the resync is completed) and the internal Ganeti locks + are also released for both the current secondary and the new + secondary, thus allowing more parallelism in the cluster + operation. This should be used only when recovering from a + disk failure on the current secondary (thus the old storage is + already broken) or when the storage on the primary node is + known to be fine (thus we won't need the old storage for + potential recovery). 
+ </para> + <para> Example: <screen> diff --git a/scripts/gnt-instance b/scripts/gnt-instance index 0c02b9e18f3169fd783e0777ebf61a43258ac9a5..51f892a9031f1c418d61f8d5f132c161fadd7def 100755 --- a/scripts/gnt-instance +++ b/scripts/gnt-instance @@ -807,7 +807,8 @@ def ReplaceDisks(opts, args): op = opcodes.OpReplaceDisks(instance_name=args[0], disks=disks, remote_node=new_2ndary, mode=mode, - iallocator=iallocator) + iallocator=iallocator, + early_release=opts.early_release) SubmitOrSend(op, opts) return 0 @@ -1400,7 +1401,7 @@ commands = { "<instance> <new_name>", "Rename the instance"), 'replace-disks': ( ReplaceDisks, ARGS_ONE_INSTANCE, - [AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT, + [AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT, EARLY_RELEASE_OPT, NEW_SECONDARY_OPT, ON_PRIMARY_OPT, ON_SECONDARY_OPT, SUBMIT_OPT], "[-s|-p|-n NODE|-I NAME] <instance>", "Replaces all disks for the instance"), diff --git a/tools/burnin b/tools/burnin index f0e4113a9ad6db39fdbe07da5797ca9a02195e69..8bd4281352957f6be35a6d62bda1470ed164c27d 100755 --- a/tools/burnin +++ b/tools/burnin @@ -120,6 +120,7 @@ OPTIONS = [ cli.VERBOSE_OPT, cli.NOIPCHECK_OPT, cli.NONAMECHECK_OPT, + cli.EARLY_RELEASE_OPT, cli.cli_option("--no-replace1", dest="do_replace1", help="Skip disk replacement with the same secondary", action="store_false", default=True), @@ -544,7 +545,8 @@ class Burner(object): for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI: op = opcodes.OpReplaceDisks(instance_name=instance, mode=mode, - disks=[i for i in range(self.disk_count)]) + disks=[i for i in range(self.disk_count)], + early_release=self.opts.early_release) Log("run %s" % mode, indent=2) ops.append(op) self.ExecOrQueue(instance, *ops) # pylint: disable-msg=W0142 @@ -568,7 +570,8 @@ class Burner(object): mode=mode, remote_node=tnode, iallocator=self.opts.iallocator, - disks=[]) + disks=[], + early_release=self.opts.early_release) Log("run %s %s" % (mode, msg), indent=2) self.ExecOrQueue(instance, op)