Commit 7ea7bcf6 authored by Iustin Pop

Add an early release lock/storage for disk replace



This patch adds an early_release parameter to the OpReplaceDisks and
OpEvacuateNode opcodes, allowing earlier release of storage and, more
importantly, of internal Ganeti locks.

The behaviour of the early release is that any locks and storage on all
secondary nodes are released early. This is valid for change secondary
(where we remove the storage on the old secondary, and release the locks
on the old and new secondary) and for replace on secondary (where we
remove the old storage and release the lock on the secondary node).

Using this, on a three-node setup:

- instance1 on nodes A:B
- instance2 on nodes C:B

It is possible to run replace-disks -s (replace on secondary) for
instances 1 and 2 in parallel (see the sketch below).
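
For illustration, a hedged sketch of this scenario, assuming the
ganeti.cli helpers GetClient() and SendJob() from this codebase era
(SendJob submits a job without waiting for it, so the two jobs can
proceed concurrently once node B's lock is dropped early):

from ganeti import cli, constants, opcodes

# Submit both secondary replaces without blocking; with early_release
# each job drops its lock on node B as soon as the old storage is gone.
cl = cli.GetClient()
for name in ("instance1", "instance2"):
    op = opcodes.OpReplaceDisks(instance_name=name,
                                mode=constants.REPLACE_DISK_SEC,
                                disks=[], remote_node=None,
                                iallocator=None, early_release=True)
    cli.SendJob([op], cl=cl)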

Replace on primary will remove the storage, but not the locks, as we use
the primary node later in the LU to check consistency.

It is debatable whether to also remove the locks on the primary node,
thus making replace-disks hold zero locks during the sync. While this
would allow greatly enhanced parallelism, let's first see how the
removal of secondary locks works.
Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>
parent 4c06a2d0
@@ -56,6 +56,7 @@ __all__ = [
"DISK_OPT",
"DISK_TEMPLATE_OPT",
"DRAINED_OPT",
"EARLY_RELEASE_OPT",
"ENABLED_HV_OPT",
"ERROR_CODES_OPT",
"FIELDS_OPT",
@@ -837,6 +838,12 @@ SHUTDOWN_TIMEOUT_OPT = cli_option("--shutdown-timeout",
default=constants.DEFAULT_SHUTDOWN_TIMEOUT,
help="Maximum time to wait for instance shutdown")
EARLY_RELEASE_OPT = cli_option("--early-release",
dest="early_release", default=False,
action="store_true",
help="Release the locks on the secondary"
" node(s) early")
def _ParseArgs(argv, commands, aliases):
"""Parser for the command line arguments.
......
@@ -6332,6 +6332,8 @@ class LUReplaceDisks(LogicalUnit):
self.op.remote_node = None
if not hasattr(self.op, "iallocator"):
self.op.iallocator = None
if not hasattr(self.op, "early_release"):
self.op.early_release = False
TLReplaceDisks.CheckArguments(self.op.mode, self.op.remote_node,
self.op.iallocator)
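
The hasattr() defaulting above keeps jobs submitted by older clients
working; a toy sketch of why it is needed (OpToy is a hypothetical
class, not a real Ganeti opcode): opcodes declare their fields via
__slots__, and an opcode serialized before this patch simply lacks the
new attribute.

class OpToy(object):
    __slots__ = ["instance_name", "early_release"]

op = OpToy()
op.instance_name = "instance1"  # an old client never sets early_release

# The same defaulting pattern as in the LU above:
if not hasattr(op, "early_release"):
    op.early_release = False

assert op.early_release is False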
@@ -6363,7 +6365,7 @@ class LUReplaceDisks(LogicalUnit):
self.replacer = TLReplaceDisks(self, self.op.instance_name, self.op.mode,
self.op.iallocator, self.op.remote_node,
self.op.disks, False)
self.op.disks, False, self.op.early_release)
self.tasklets = [self.replacer]
@@ -6410,6 +6412,8 @@ class LUEvacuateNode(LogicalUnit):
self.op.remote_node = None
if not hasattr(self.op, "iallocator"):
self.op.iallocator = None
if not hasattr(self.op, "early_release"):
self.op.early_release = False
TLReplaceDisks.CheckArguments(constants.REPLACE_DISK_CHG,
self.op.remote_node,
@@ -6456,7 +6460,7 @@ class LUEvacuateNode(LogicalUnit):
replacer = TLReplaceDisks(self, inst.name, constants.REPLACE_DISK_CHG,
self.op.iallocator, self.op.remote_node, [],
True)
True, self.op.early_release)
tasklets.append(replacer)
self.tasklets = tasklets
@@ -6498,7 +6502,7 @@ class TLReplaceDisks(Tasklet):
"""
def __init__(self, lu, instance_name, mode, iallocator_name, remote_node,
disks, delay_iallocator):
disks, delay_iallocator, early_release):
"""Initializes this class.
"""
@@ -6511,6 +6515,7 @@ class TLReplaceDisks(Tasklet):
self.remote_node = remote_node
self.disks = disks
self.delay_iallocator = delay_iallocator
self.early_release = early_release
# Runtime data
self.instance = None
@@ -6853,6 +6858,10 @@ class TLReplaceDisks(Tasklet):
self.lu.LogWarning("Can't remove old LV: %s" % msg,
hint="remove unused LVs manually")
def _ReleaseNodeLock(self, node_name):
"""Releases the lock for a given node."""
self.lu.context.glm.release(locking.LEVEL_NODE, node_name)
def _ExecDrbd8DiskOnly(self, feedback_fn):
"""Replace a disk on the primary or secondary for DRBD 8.
@@ -6963,18 +6972,31 @@
self.cfg.Update(self.instance, feedback_fn)
cstep = 5
if self.early_release:
self.lu.LogStep(cstep, steps_total, "Removing old storage")
cstep += 1
self._RemoveOldStorage(self.target_node, iv_names)
# only release the lock if we're doing secondary replace, since
# we use the primary node later
if self.target_node != self.instance.primary_node:
self._ReleaseNodeLock(self.target_node)
# Wait for sync
# This can fail as the old devices are degraded and _WaitForSync
# does a combined result over all disks, so we don't check its return value
self.lu.LogStep(5, steps_total, "Sync devices")
self.lu.LogStep(cstep, steps_total, "Sync devices")
cstep += 1
_WaitForSync(self.lu, self.instance)
# Check all devices manually
self._CheckDevices(self.instance.primary_node, iv_names)
# Step: remove old storage
self.lu.LogStep(6, steps_total, "Removing old storage")
self._RemoveOldStorage(self.target_node, iv_names)
if not self.early_release:
self.lu.LogStep(cstep, steps_total, "Removing old storage")
cstep += 1
self._RemoveOldStorage(self.target_node, iv_names)
def _ExecDrbd8Secondary(self, feedback_fn):
"""Replace the secondary node for DRBD 8.
@@ -7108,19 +7130,27 @@
to_node, msg,
hint=("please do a gnt-instance info to see the"
" status of disks"))
cstep = 5
if self.early_release:
self.lu.LogStep(cstep, steps_total, "Removing old storage")
cstep += 1
self._RemoveOldStorage(self.target_node, iv_names)
self._ReleaseNodeLock([self.target_node, self.new_node])
# Wait for sync
# This can fail as the old devices are degraded and _WaitForSync
# does a combined result over all disks, so we don't check its return value
self.lu.LogStep(5, steps_total, "Sync devices")
self.lu.LogStep(cstep, steps_total, "Sync devices")
cstep += 1
_WaitForSync(self.lu, self.instance)
# Check all devices manually
self._CheckDevices(self.instance.primary_node, iv_names)
# Step: remove old storage
self.lu.LogStep(6, steps_total, "Removing old storage")
self._RemoveOldStorage(self.target_node, iv_names)
if not self.early_release:
self.lu.LogStep(cstep, steps_total, "Removing old storage")
self._RemoveOldStorage(self.target_node, iv_names)
class LURepairNodeStorage(NoHooksLU):
......
@@ -419,7 +419,7 @@ class OpEvacuateNode(OpCode):
OP_ID = "OP_NODE_EVACUATE"
OP_DSC_FIELD = "node_name"
__slots__ = [
"node_name", "remote_node", "iallocator",
"node_name", "remote_node", "iallocator", "early_release",
]
@@ -509,6 +509,7 @@ class OpReplaceDisks(OpCode):
OP_DSC_FIELD = "instance_name"
__slots__ = [
"instance_name", "remote_node", "mode", "disks", "iallocator",
"early_release",
]
......
@@ -1828,6 +1828,7 @@ instance5: 11225
<cmdsynopsis>
<command>replace-disks</command>
<arg>--submit</arg>
<arg>--early-release</arg>
<arg choice="req">-p</arg>
<arg>--disks <replaceable>idx</replaceable></arg>
<arg choice="req"><replaceable>instance</replaceable></arg>
@@ -1836,6 +1837,7 @@ instance5: 11225
<cmdsynopsis>
<command>replace-disks</command>
<arg>--submit</arg>
<arg>--early-release</arg>
<arg choice="req">-s</arg>
<arg>--disks <replaceable>idx</replaceable></arg>
<arg choice="req"><replaceable>instance</replaceable></arg>
@@ -1844,6 +1846,7 @@ instance5: 11225
<cmdsynopsis>
<command>replace-disks</command>
<arg>--submit</arg>
<arg>--early-release</arg>
<group choice="req">
<arg>--iallocator <replaceable>name</replaceable></arg>
<arg>--new-secondary <replaceable>NODE</replaceable></arg>
@@ -1855,6 +1858,7 @@ instance5: 11225
<cmdsynopsis>
<command>replace-disks</command>
<arg>--submit</arg>
<arg>--early-release</arg>
<arg choice="req">--auto</arg>
<arg choice="req"><replaceable>instance</replaceable></arg>
</cmdsynopsis>
@@ -1905,6 +1909,19 @@ instance5: 11225
<command>gnt-job info</command>.
</para>
<para>
The <option>--early-release</option> option changes the behaviour
so that the old storage on the secondary node(s) is removed early
(before the resync is completed) and the internal Ganeti
locks for the current (and new, if any) secondary node are
also released, thus allowing more parallelism in the cluster
operation. This should be used only when recovering from a
disk failure on the current secondary (thus the old storage
is already broken) or when the storage on the primary node
is known to be fine (thus we won't need the old storage for
potential recovery).
</para>
<para>
Note that it is not possible to select an offline or drained
node as a new secondary.
......
@@ -143,6 +143,7 @@
<cmdsynopsis>
<command>evacuate</command>
<arg>-f</arg>
<arg>--early-release</arg>
<group>
<arg>--iallocator <replaceable>NAME</replaceable></arg>
<arg>--new-secondary <replaceable>destination_node</replaceable></arg>
@@ -172,6 +173,19 @@
</itemizedlist>
</para>
<para>
The <option>--early-release</option> option changes the behaviour so
that the old storage on the node being evacuated is removed early
(before the resync is completed) and the internal Ganeti locks
are also released for both the current secondary and the new
secondary, thus allowing more parallelism in the cluster
operation. This should be used only when recovering from a
disk failure on the current secondary (thus the old storage is
already broken) or when the storage on the primary node is
known to be fine (thus we won't need the old storage for
potential recovery).
</para>
<para>
Example:
<screen>
......
@@ -807,7 +807,8 @@ def ReplaceDisks(opts, args):
op = opcodes.OpReplaceDisks(instance_name=args[0], disks=disks,
remote_node=new_2ndary, mode=mode,
iallocator=iallocator)
iallocator=iallocator,
early_release=opts.early_release)
SubmitOrSend(op, opts)
return 0
@@ -1400,7 +1401,7 @@ commands = {
"<instance> <new_name>", "Rename the instance"),
'replace-disks': (
ReplaceDisks, ARGS_ONE_INSTANCE,
[AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT,
[AUTO_REPLACE_OPT, DISKIDX_OPT, IALLOCATOR_OPT, EARLY_RELEASE_OPT,
NEW_SECONDARY_OPT, ON_PRIMARY_OPT, ON_SECONDARY_OPT, SUBMIT_OPT],
"[-s|-p|-n NODE|-I NAME] <instance>",
"Replaces all disks for the instance"),
......
@@ -120,6 +120,7 @@ OPTIONS = [
cli.VERBOSE_OPT,
cli.NOIPCHECK_OPT,
cli.NONAMECHECK_OPT,
cli.EARLY_RELEASE_OPT,
cli.cli_option("--no-replace1", dest="do_replace1",
help="Skip disk replacement with the same secondary",
action="store_false", default=True),
@@ -544,7 +545,8 @@ class Burner(object):
for mode in constants.REPLACE_DISK_SEC, constants.REPLACE_DISK_PRI:
op = opcodes.OpReplaceDisks(instance_name=instance,
mode=mode,
disks=[i for i in range(self.disk_count)])
disks=[i for i in range(self.disk_count)],
early_release=self.opts.early_release)
Log("run %s" % mode, indent=2)
ops.append(op)
self.ExecOrQueue(instance, *ops) # pylint: disable-msg=W0142
@@ -568,7 +570,8 @@ class Burner(object):
mode=mode,
remote_node=tnode,
iallocator=self.opts.iallocator,
disks=[])
disks=[],
early_release=self.opts.early_release)
Log("run %s %s" % (mode, msg), indent=2)
self.ExecOrQueue(instance, op)
......