Commit 0834c866 authored by Iustin Pop's avatar Iustin Pop
Browse files

Enhance secondary node replace for drbd8

This (big) patch does two things:
  - add "local disk status" to the block device checks
    (BlockDevice.GetSyncStatus and the rpc calls that call this
    function, and therefore cmdlib._CheckDiskConsistency)
  - improve the drbd8 secondary replace operation using the above
    functionality

The "local disk status" adds a new variable to the result of
GetSyncStatus that shows the degradation of the local storage of the
device. Of course, not all devices support this - for now, we only modify
LogicalVolumes and DRBD8 to return degraded in some cases; other devices
always return non-degraded. This variable should be a subset of
is_degraded - whenever this variable is true, is_degraded should
also be true.

The drbd8 secondary replace uses this variable as we don't care if the
primary drbd device is network-degraded, only if it has good local disk
data (ldisk is False).

The patch also increases the protocol version (due to rpc changes).

Reviewed-by: imsnah
parent e3c826ec
......@@ -895,8 +895,7 @@ def FindBlockDevice(disk):
rbd = _RecursiveFindBD(disk)
if rbd is None:
return rbd
sync_p, est_t, is_degr = rbd.GetSyncStatus()
return rbd.dev_path, rbd.major, rbd.minor, sync_p, est_t, is_degr
return (rbd.dev_path, rbd.major, rbd.minor) + rbd.GetSyncStatus()
def UploadFile(file_name, data, mode, uid, gid, atime, mtime):
......
......@@ -220,17 +220,23 @@ class BlockDev(object):
status of the mirror.
Returns:
(sync_percent, estimated_time, is_degraded)
(sync_percent, estimated_time, is_degraded, ldisk)
If sync_percent is None, it means the device is not syncing.
If sync_percent is None, it means all is ok
If estimated_time is None, it means we can't estimate
the time needed, otherwise it's the time left in seconds
the time needed, otherwise it's the time left in seconds.
If is_degraded is True, it means the device is missing
redundancy. This is usually a sign that something went wrong in
the device setup, if sync_percent is None.
The ldisk parameter represents the degradation of the local
data. This is only valid for some devices, the rest will always
return False (not degraded).
"""
return None, None, False
return None, None, False, False
def CombinedSyncStatus(self):
......@@ -241,10 +247,10 @@ class BlockDev(object):
children.
"""
min_percent, max_time, is_degraded = self.GetSyncStatus()
min_percent, max_time, is_degraded, ldisk = self.GetSyncStatus()
if self._children:
for child in self._children:
c_percent, c_time, c_degraded = child.GetSyncStatus()
c_percent, c_time, c_degraded, c_ldisk = child.GetSyncStatus()
if min_percent is None:
min_percent = c_percent
elif c_percent is not None:
......@@ -254,7 +260,8 @@ class BlockDev(object):
elif c_time is not None:
max_time = max(max_time, c_time)
is_degraded = is_degraded or c_degraded
return min_percent, max_time, is_degraded
ldisk = ldisk or c_ldisk
return min_percent, max_time, is_degraded, ldisk
def SetInfo(self, text):
......@@ -458,30 +465,32 @@ class LogicalVolume(BlockDev):
status of the mirror.
Returns:
(sync_percent, estimated_time, is_degraded)
(sync_percent, estimated_time, is_degraded, ldisk)
For logical volumes, sync_percent and estimated_time are always
None (no recovery in progress, as we don't handle the mirrored LV
case).
case). The is_degraded parameter is the inverse of the ldisk
parameter.
For the is_degraded parameter, we check if the logical volume has
the 'virtual' type, which means it's not backed by existing
storage anymore (read from it return I/O error). This happens
after a physical disk failure and subsequent 'vgreduce
--removemissing' on the volume group.
For the ldisk parameter, we check if the logical volume has the
'virtual' type, which means it's not backed by existing storage
anymore (read from it return I/O error). This happens after a
physical disk failure and subsequent 'vgreduce --removemissing' on
the volume group.
"""
result = utils.RunCmd(["lvs", "--noheadings", "-olv_attr", self.dev_path])
if result.failed:
logger.Error("Can't display lv: %s" % result.fail_reason)
return None, None, True
return None, None, True, True
out = result.stdout.strip()
# format: type/permissions/alloc/fixed_minor/state/open
if len(out) != 6:
return None, None, True
is_degraded = out[0] == 'v' # virtual volume, i.e. doesn't have
logger.Debug("Error in lvs output: attrs=%s, len != 6" % out)
return None, None, True, True
ldisk = out[0] == 'v' # virtual volume, i.e. doesn't have
# backing storage
return None, None, is_degraded
return None, None, ldisk, ldisk
def Open(self, force=False):
"""Make the device ready for I/O.
......@@ -898,11 +907,13 @@ class MDRaid1(BlockDev):
"""Returns the sync status of the device.
Returns:
(sync_percent, estimated_time, is_degraded)
(sync_percent, estimated_time, is_degraded, ldisk)
If sync_percent is None, it means all is ok
If estimated_time is None, it means we can't esimate
the time needed, otherwise it's the time left in seconds
the time needed, otherwise it's the time left in seconds.
The ldisk parameter is always true for MD devices.
"""
if self.minor is None and not self.Attach():
......@@ -916,12 +927,12 @@ class MDRaid1(BlockDev):
sync_status = f.readline().strip()
f.close()
if sync_status == "idle":
return None, None, not is_clean
return None, None, not is_clean, False
f = file(sys_path + "sync_completed")
sync_completed = f.readline().strip().split(" / ")
f.close()
if len(sync_completed) != 2:
return 0, None, not is_clean
return 0, None, not is_clean, False
sync_done, sync_total = [float(i) for i in sync_completed]
sync_percent = 100.0*sync_done/sync_total
f = file(sys_path + "sync_speed")
......@@ -930,7 +941,7 @@ class MDRaid1(BlockDev):
time_est = None
else:
time_est = (sync_total - sync_done) / 2 / sync_speed_k
return sync_percent, time_est, not is_clean
return sync_percent, time_est, not is_clean, False
def Open(self, force=False):
"""Make the device ready for I/O.
......@@ -1476,11 +1487,14 @@ class DRBDev(BaseDRBD):
"""Returns the sync status of the device.
Returns:
(sync_percent, estimated_time, is_degraded)
(sync_percent, estimated_time, is_degraded, ldisk)
If sync_percent is None, it means all is ok
If estimated_time is None, it means we can't esimate
the time needed, otherwise it's the time left in seconds
the time needed, otherwise it's the time left in seconds.
The ldisk parameter will be returned as True, since the DRBD7
devices have not been converted.
"""
if self.minor is None and not self.Attach():
......@@ -1507,7 +1521,7 @@ class DRBDev(BaseDRBD):
self.minor)
client_state = match.group(1)
is_degraded = client_state != "Connected"
return sync_percent, est_time, is_degraded
return sync_percent, est_time, is_degraded, False
def GetStatus(self):
"""Compute the status of the DRBD device
......@@ -1953,7 +1967,14 @@ class DRBD8(BaseDRBD):
If sync_percent is None, it means all is ok
If estimated_time is None, it means we can't esimate
the time needed, otherwise it's the time left in seconds
the time needed, otherwise it's the time left in seconds.
We set the is_degraded parameter to True on two conditions:
network not connected or local disk missing.
We compute the ldisk parameter based on wheter we have a local
disk or not.
"""
if self.minor is None and not self.Attach():
......@@ -1980,9 +2001,9 @@ class DRBD8(BaseDRBD):
self.minor)
client_state = match.group(1)
local_disk_state = match.group(2)
is_degraded = (client_state != "Connected" or
local_disk_state != "UpToDate")
return sync_percent, est_time, is_degraded
ldisk = local_disk_state != "UpToDate"
is_degraded = client_state != "Connected"
return sync_percent, est_time, is_degraded or ldisk, ldisk
def GetStatus(self):
"""Compute the status of the DRBD device
......
......@@ -1051,7 +1051,8 @@ def _WaitForSync(cfgw, instance, oneshot=False, unlock=False):
logger.ToStderr("Can't compute data for node %s/%s" %
(node, instance.disks[i].iv_name))
continue
perc_done, est_time, is_degraded = mstat
# we ignore the ldisk parameter
perc_done, est_time, is_degraded, _ = mstat
cumul_degraded = cumul_degraded or (is_degraded and perc_done is None)
if perc_done is not None:
done = False
......@@ -1078,11 +1079,19 @@ def _WaitForSync(cfgw, instance, oneshot=False, unlock=False):
return not cumul_degraded
def _CheckDiskConsistency(cfgw, dev, node, on_primary):
def _CheckDiskConsistency(cfgw, dev, node, on_primary, ldisk=False):
"""Check that mirrors are not degraded.
The ldisk parameter, if True, will change the test from the
is_degraded attribute (which represents overall non-ok status for
the device(s)) to the ldisk (representing the local storage status).
"""
cfgw.SetDiskID(dev, node)
if ldisk:
idx = 6
else:
idx = 5
result = True
if on_primary or dev.AssembleOnSecondary():
......@@ -1091,7 +1100,7 @@ def _CheckDiskConsistency(cfgw, dev, node, on_primary):
logger.ToStderr("Can't get any data from node %s" % node)
result = False
else:
result = result and (not rstats[5])
result = result and (not rstats[idx])
if dev.children:
for child in dev.children:
result = result and _CheckDiskConsistency(cfgw, child, node, on_primary)
......@@ -3360,8 +3369,12 @@ class LUReplaceDisks(LogicalUnit):
"OLD_SECONDARY": self.instance.secondary_nodes[0],
}
env.update(_BuildInstanceHookEnvByObject(self.instance))
nl = [self.sstore.GetMasterNode(),
self.instance.primary_node] + list(self.instance.secondary_nodes)
nl = [
self.sstore.GetMasterNode(),
self.instance.primary_node,
]
if self.op.remote_node is not None:
nl.append(self.op.remote_node)
return env, nl, nl
def CheckPrereq(self):
......@@ -3401,8 +3414,13 @@ class LUReplaceDisks(LogicalUnit):
raise errors.OpPrereqError("The specified node is the primary node of"
" the instance.")
elif remote_node == self.sec_node:
if self.op.mode == constants.REPLACE_DISK_SEC:
# this is for DRBD8, where we can't execute the same mode of
# replacement as for drbd7 (no different port allocated)
raise errors.OpPrereqError("Same secondary given, cannot execute"
" replacement")
# the user gave the current secondary, switch to
# 'no-replace-secondary' mode
# 'no-replace-secondary' mode for drbd7
remote_node = None
if (instance.disk_template == constants.DT_REMOTE_RAID1 and
self.op.mode != constants.REPLACE_DISK_ALL):
......@@ -3717,7 +3735,10 @@ class LUReplaceDisks(LogicalUnit):
- remove all disks from the old secondary
Failures are not very well handled.
"""
steps_total = 6
warning, info = (self.processor.LogWarning, self.processor.LogInfo)
instance = self.instance
iv_names = {}
vgname = self.cfg.GetVGName()
......@@ -3726,10 +3747,44 @@ class LUReplaceDisks(LogicalUnit):
old_node = self.tgt_node
new_node = self.new_node
pri_node = instance.primary_node
# Step: check device activation
self.processor.LogStep(1, steps_total, "check device existence")
info("checking volume groups")
my_vg = cfg.GetVGName()
results = rpc.call_vg_list([pri_node, new_node])
if not results:
raise errors.OpExecError("Can't list volume groups on the nodes")
for node in pri_node, new_node:
res = results.get(node, False)
if not res or my_vg not in res:
raise errors.OpExecError("Volume group '%s' not found on %s" %
(my_vg, node))
for dev in instance.disks:
if not dev.iv_name in self.op.disks:
continue
info("checking %s on %s" % (dev.iv_name, pri_node))
cfg.SetDiskID(dev, pri_node)
if not rpc.call_blockdev_find(pri_node, dev):
raise errors.OpExecError("Can't find device %s on node %s" %
(dev.iv_name, pri_node))
# Step: check other node consistency
self.processor.LogStep(2, steps_total, "check peer consistency")
for dev in instance.disks:
if not dev.iv_name in self.op.disks:
continue
info("checking %s consistency on %s" % (dev.iv_name, pri_node))
if not _CheckDiskConsistency(self.cfg, dev, pri_node, True, ldisk=True):
raise errors.OpExecError("Primary node (%s) has degraded storage,"
" unsafe to replace the secondary" %
pri_node)
# Step: create new storage
self.processor.LogStep(3, steps_total, "allocate new storage")
for dev in instance.disks:
size = dev.size
logger.Info("adding new local storage on %s for %s" %
(new_node, dev.iv_name))
info("adding new local storage on %s for %s" % (new_node, dev.iv_name))
# since we *always* want to create this LV, we use the
# _Create...OnPrimary (which forces the creation), even if we
# are talking about the secondary node
......@@ -3740,6 +3795,12 @@ class LUReplaceDisks(LogicalUnit):
" node '%s'" %
(new_lv.logical_id[1], new_node))
iv_names[dev.iv_name] = (dev, dev.children)
self.processor.LogStep(4, steps_total, "changing drbd configuration")
for dev in instance.disks:
size = dev.size
info("activating a new drbd on %s for %s" % (new_node, dev.iv_name))
# create new devices on new_node
new_drbd = objects.Disk(dev_type=constants.LD_DRBD8,
logical_id=(pri_node, new_node,
......@@ -3751,31 +3812,34 @@ class LUReplaceDisks(LogicalUnit):
raise errors.OpExecError("Failed to create new DRBD on"
" node '%s'" % new_node)
for dev in instance.disks:
# we have new devices, shutdown the drbd on the old secondary
info("shutting down drbd for %s on old node" % dev.iv_name)
cfg.SetDiskID(dev, old_node)
if not rpc.call_blockdev_shutdown(old_node, dev):
raise errors.OpExecError("Failed to shutdown DRBD on old node")
warning("Failed to shutdown drbd for %s on old node" % dev.iv_name,
"Please cleanup this device manuall as soon as possible")
# we have new storage, we 'rename' the network on the primary
info("switching primary drbd for %s to new secondary node" % dev.iv_name)
cfg.SetDiskID(dev, pri_node)
# rename to the ip of the new node
new_uid = list(dev.physical_id)
new_uid[2] = self.remote_node_info.secondary_ip
rlist = [(dev, tuple(new_uid))]
if not rpc.call_blockdev_rename(pri_node, rlist):
raise errors.OpExecError("Can't detach re-attach drbd %s on node"
raise errors.OpExecError("Can't detach & re-attach drbd %s on node"
" %s from %s to %s" %
(dev.iv_name, pri_node, old_node, new_node))
dev.logical_id = (pri_node, new_node, dev.logical_id[2])
cfg.SetDiskID(dev, pri_node)
cfg.Update(instance)
iv_names[dev.iv_name] = (dev, dev.children)
# this can fail as the old devices are degraded and _WaitForSync
# does a combined result over all disks, so we don't check its
# return value
logger.Info("Done changing drbd configs, waiting for sync")
self.processor.LogStep(5, steps_total, "sync devices")
_WaitForSync(cfg, instance, unlock=True)
# so check manually all the devices
......@@ -3785,14 +3849,14 @@ class LUReplaceDisks(LogicalUnit):
if is_degr:
raise errors.OpExecError("DRBD device %s is degraded!" % name)
self.processor.LogStep(6, steps_total, "removing old storage")
for name, (dev, old_lvs) in iv_names.iteritems():
logger.Info("remove logical volumes for %s" % name)
info("remove logical volumes for %s" % name)
for lv in old_lvs:
cfg.SetDiskID(lv, old_node)
if not rpc.call_blockdev_remove(old_node, lv):
logger.Error("Can't cleanup child device, skipping. You need to"
" fix manually!")
continue
warning("Can't remove LV on old secondary",
"Cleanup stale volumes by hand")
def Exec(self, feedback_fn):
"""Execute disk replacement.
......
......@@ -25,7 +25,7 @@ from ganeti import _autoconf
# various versions
CONFIG_VERSION = 3
PROTOCOL_VERSION = 6
PROTOCOL_VERSION = 7
RELEASE_VERSION = _autoconf.PACKAGE_VERSION
OS_API_VERSION = 5
EXPORT_VERSION = 0
......
......@@ -523,7 +523,7 @@ def _FormatBlockDevInfo(buf, dev, indent_level):
if not status:
buf.write("not active\n")
else:
(path, major, minor, syncp, estt, degr) = status
(path, major, minor, syncp, estt, degr, ldisk) = status
buf.write("%s (%d:%d)" % (path, major, minor))
if dtype in (constants.LD_MD_R1, constants.LD_DRBD7, constants.LD_DRBD8):
if syncp is not None:
......@@ -538,13 +538,17 @@ def _FormatBlockDevInfo(buf, dev, indent_level):
degr_text = "*DEGRADED*"
else:
degr_text = "ok"
buf.write(" %s, status %s" % (sync_text, degr_text))
if ldisk:
ldisk_text = " *MISSING DISK*"
else:
ldisk_text = ""
buf.write(" %s, status %s%s" % (sync_text, degr_text, ldisk_text))
elif dtype == constants.LD_LV:
if degr:
degr_text = " *DEGRADED* (failed drive?)"
if ldisk:
ldisk_text = " *FAILED* (failed drive?)"
else:
degr_text = ""
buf.write(degr_text)
ldisk_text = ""
buf.write(ldisk_text)
buf.write("\n")
if dev["iv_name"] is not None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment