Commit b63ed789 authored by Iustin Pop
Browse files

Improve verify-disks: broken/missing LV detection

This patch improves the ‘gnt-cluster verify-disks’ command by adding
support for detecting broken volume groups and missing logical volumes.

As such, we no longer try to activate disks for instances whose
activation is not likely to succeed anyway, and instead report them.

Reviewed-by: schreiberal
parent 5574047a
......@@ -223,7 +223,7 @@ def GetVolumeList(vg_name):
if result.failed:
logger.Error("Failed to list logical volumes, lvs output: %s" %
return lvs
return result.output
for line in result.stdout.splitlines():
line = line.strip().rstrip(sep)
......@@ -840,12 +840,17 @@ class LUVerifyCluster(NoHooksLU):
# node_volume
volumeinfo = all_volumeinfo[node]
if type(volumeinfo) != dict:
if isinstance(volumeinfo, basestring):
feedback_fn(" - ERROR: LVM problem on node %s: %s" %
(node, volumeinfo[-400:].encode('string_escape')))
bad = True
node_volume[node] = {}
elif not isinstance(volumeinfo, dict):
feedback_fn(" - ERROR: connection to %s failed" % (node,))
bad = True
node_volume[node] = volumeinfo
node_volume[node] = volumeinfo
# node_instance
nodeinstance = all_instanceinfo[node]
......@@ -899,7 +904,7 @@ class LUVerifyDisks(NoHooksLU):
"""Verify integrity of cluster disks.
result = res_nodes, res_instances = [], []
result = res_nodes, res_nlvm, res_instances, res_missing = [], {}, [], {}
vg_name = self.cfg.GetVGName()
nodes = utils.NiceSort(self.cfg.GetNodeList())
......@@ -928,18 +933,28 @@ class LUVerifyDisks(NoHooksLU):
# node_volume
lvs = node_lvs[node]
if not isinstance(lvs, dict):
if isinstance(lvs, basestring):
logger.Info("error enumerating LVs on node %s: %s" % (node, lvs))
res_nlvm[node] = lvs
elif not isinstance(lvs, dict):
logger.Info("connection to node %s failed or invalid data returned" %
for lv_name, (_, lv_inactive, lv_online) in lvs.iteritems():
if not lv_online:
inst = nv_dict.get((node, lv_name), None)
if inst is not None and not in res_instances:
inst = nv_dict.pop((node, lv_name), None)
if (not lv_online and inst is not None
and not in res_instances):
# any leftover items in nv_dict are missing LVs, let's arrange the
# data better
for key, inst in nv_dict.iteritems():
if not in res_missing:
res_missing[] = []
return result
......@@ -25,7 +25,7 @@ from ganeti import _autoconf
# various versions
......@@ -93,12 +93,15 @@ class OpVerifyDisks(OpCode):
Result: a tuple of four elements:
- list of node names with bad data returned (unreachable, etc.)
- dict of node names with broken volume groups (values: error msg)
- list of instances with degraded disks (that should be activated)
- dict of instances with missing logical volumes (values: (node, vol)
pairs with details about the missing volumes)
In normal operation, both lists should be empty. A non-empty
instance list is still ok (errors were fixed) but non-empty node
list means some node is down, and probably there are unfixable drbd
In normal operation, all lists should be empty. A non-empty instance
list (3rd element of the result) is still ok (errors were fixed) but
non-empty node list means some node is down, and probably there are
unfixable drbd errors.
Note that only instances that are drbd-based are taken into
consideration. This might need to be revisited in the future.
......@@ -27,6 +27,7 @@ from ganeti.cli import *
from ganeti import opcodes
from ganeti import constants
from ganeti import errors
from ganeti import utils
def InitCluster(opts, args):
......@@ -191,17 +192,28 @@ def VerifyDisks(opts, args):
op = opcodes.OpVerifyDisks()
result = SubmitOpCode(op)
if not isinstance(result, tuple) or len(result) != 2:
if not isinstance(result, tuple) or len(result) != 4:
raise errors.ProgrammerError("Unknown result type for OpVerifyDisks")
nodes, instances = result
nodes, nlvm, instances, missing = result
if nodes:
print "Nodes unreachable or with bad data:"
for name in nodes:
print "\t%s" % name
retcode = constants.EXIT_SUCCESS
if nlvm:
for node, text in nlvm.iteritems():
print ("Error on node %s: LVM error: %s" %
(node, text[-400:].encode('string_escape')))
retcode |= 1
print "You need to fix these nodes first before fixing instances"
if instances:
for iname in instances:
if iname in missing:
op = opcodes.OpActivateInstanceDisks(instance_name=iname)
print "Activating disks for instance '%s'" % iname
......@@ -209,8 +221,26 @@ def VerifyDisks(opts, args):
except errors.GenericError, err:
nret, msg = FormatError(err)
retcode |= nret
print >>sys.stderr, ("Error activating disks for instance %s: %s" %
(iname, msg))
print >> sys.stderr, ("Error activating disks for instance %s: %s" %
(iname, msg))
if missing:
for iname, ival in missing.iteritems():
all_missing = utils.all(ival, lambda x: x[0] in nlvm)
if all_missing:
print ("Instance %s cannot be verified as it lives on"
" broken nodes" % iname)
print "Instance %s has missing logical volumes:" % iname
for node, vol in ival:
if node in nlvm:
print ("\tbroken node %s /dev/xenvg/%s" % (node, vol))
print ("\t%s /dev/xenvg/%s" % (node, vol))
print ("You need to run replace_disks for all the above"
" instances, if this message persist after fixing nodes.")
retcode |= 1
return retcode
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment