Commit b63ed789 authored by Iustin Pop
Browse files

Improve verify-disks: broken/missing LV detection

This patch improves the ‘gnt-cluster verify-disks’ command by adding
support for detecting broken volume groups and missing logical volume
names.

As such, we no longer try to activate disks for instances where the
activation is unlikely to succeed anyway; instead, we report them.

Reviewed-by: schreiberal
parent 5574047a
...@@ -223,7 +223,7 @@ def GetVolumeList(vg_name): ...@@ -223,7 +223,7 @@ def GetVolumeList(vg_name):
if result.failed: if result.failed:
logger.Error("Failed to list logical volumes, lvs output: %s" % logger.Error("Failed to list logical volumes, lvs output: %s" %
result.output) result.output)
return lvs return result.output
for line in result.stdout.splitlines(): for line in result.stdout.splitlines():
line = line.strip().rstrip(sep) line = line.strip().rstrip(sep)
......
...@@ -840,12 +840,17 @@ class LUVerifyCluster(NoHooksLU): ...@@ -840,12 +840,17 @@ class LUVerifyCluster(NoHooksLU):
# node_volume # node_volume
volumeinfo = all_volumeinfo[node] volumeinfo = all_volumeinfo[node]
if type(volumeinfo) != dict: if isinstance(volumeinfo, basestring):
feedback_fn(" - ERROR: LVM problem on node %s: %s" %
(node, volumeinfo[-400:].encode('string_escape')))
bad = True
node_volume[node] = {}
elif not isinstance(volumeinfo, dict):
feedback_fn(" - ERROR: connection to %s failed" % (node,)) feedback_fn(" - ERROR: connection to %s failed" % (node,))
bad = True bad = True
continue continue
else:
node_volume[node] = volumeinfo node_volume[node] = volumeinfo
# node_instance # node_instance
nodeinstance = all_instanceinfo[node] nodeinstance = all_instanceinfo[node]
...@@ -899,7 +904,7 @@ class LUVerifyDisks(NoHooksLU): ...@@ -899,7 +904,7 @@ class LUVerifyDisks(NoHooksLU):
"""Verify integrity of cluster disks. """Verify integrity of cluster disks.
""" """
result = res_nodes, res_instances = [], [] result = res_nodes, res_nlvm, res_instances, res_missing = [], {}, [], {}
vg_name = self.cfg.GetVGName() vg_name = self.cfg.GetVGName()
nodes = utils.NiceSort(self.cfg.GetNodeList()) nodes = utils.NiceSort(self.cfg.GetNodeList())
...@@ -928,18 +933,28 @@ class LUVerifyDisks(NoHooksLU): ...@@ -928,18 +933,28 @@ class LUVerifyDisks(NoHooksLU):
# node_volume # node_volume
lvs = node_lvs[node] lvs = node_lvs[node]
if not isinstance(lvs, dict): if isinstance(lvs, basestring):
logger.Info("error enumerating LVs on node %s: %s" % (node, lvs))
res_nlvm[node] = lvs
elif not isinstance(lvs, dict):
logger.Info("connection to node %s failed or invalid data returned" % logger.Info("connection to node %s failed or invalid data returned" %
(node,)) (node,))
res_nodes.append(node) res_nodes.append(node)
continue continue
for lv_name, (_, lv_inactive, lv_online) in lvs.iteritems(): for lv_name, (_, lv_inactive, lv_online) in lvs.iteritems():
if not lv_online: inst = nv_dict.pop((node, lv_name), None)
inst = nv_dict.get((node, lv_name), None) if (not lv_online and inst is not None
if inst is not None and inst.name not in res_instances: and inst.name not in res_instances):
res_instances.append(inst.name) res_instances.append(inst.name)
# any leftover items in nv_dict are missing LVs, let's arrange the
# data better
for key, inst in nv_dict.iteritems():
if inst.name not in res_missing:
res_missing[inst.name] = []
res_missing[inst.name].append(key)
return result return result
......
...@@ -25,7 +25,7 @@ from ganeti import _autoconf ...@@ -25,7 +25,7 @@ from ganeti import _autoconf
# various versions # various versions
CONFIG_VERSION = 3 CONFIG_VERSION = 3
PROTOCOL_VERSION = 9 PROTOCOL_VERSION = 10
RELEASE_VERSION = _autoconf.PACKAGE_VERSION RELEASE_VERSION = _autoconf.PACKAGE_VERSION
OS_API_VERSION = 5 OS_API_VERSION = 5
EXPORT_VERSION = 0 EXPORT_VERSION = 0
......
...@@ -93,12 +93,15 @@ class OpVerifyDisks(OpCode): ...@@ -93,12 +93,15 @@ class OpVerifyDisks(OpCode):
Result: two lists: Result: two lists:
- list of node names with bad data returned (unreachable, etc.) - list of node names with bad data returned (unreachable, etc.)
- dict of node names with broken volume groups (values: error msg)
- list of instances with degraded disks (that should be activated) - list of instances with degraded disks (that should be activated)
- dict of instances with missing logical volumes (values: (node, vol)
pairs with details about the missing volumes)
In normal operation, both lists should be empty. A non-empty In normal operation, all lists should be empty. A non-empty instance
instance list is still ok (errors were fixed) but non-empty node list (3rd element of the result) is still ok (errors were fixed) but
list means some node is down, and probably there are unfixable drbd non-empty node list means some node is down, and probably there are
errors. unfixable drbd errors.
Note that only instances that are drbd-based are taken into Note that only instances that are drbd-based are taken into
consideration. This might need to be revisited in the future. consideration. This might need to be revisited in the future.
......
...@@ -27,6 +27,7 @@ from ganeti.cli import * ...@@ -27,6 +27,7 @@ from ganeti.cli import *
from ganeti import opcodes from ganeti import opcodes
from ganeti import constants from ganeti import constants
from ganeti import errors from ganeti import errors
from ganeti import utils
def InitCluster(opts, args): def InitCluster(opts, args):
...@@ -191,17 +192,28 @@ def VerifyDisks(opts, args): ...@@ -191,17 +192,28 @@ def VerifyDisks(opts, args):
""" """
op = opcodes.OpVerifyDisks() op = opcodes.OpVerifyDisks()
result = SubmitOpCode(op) result = SubmitOpCode(op)
if not isinstance(result, tuple) or len(result) != 2: if not isinstance(result, tuple) or len(result) != 4:
raise errors.ProgrammerError("Unknown result type for OpVerifyDisks") raise errors.ProgrammerError("Unknown result type for OpVerifyDisks")
nodes, instances = result nodes, nlvm, instances, missing = result
if nodes: if nodes:
print "Nodes unreachable or with bad data:" print "Nodes unreachable or with bad data:"
for name in nodes: for name in nodes:
print "\t%s" % name print "\t%s" % name
retcode = constants.EXIT_SUCCESS retcode = constants.EXIT_SUCCESS
if nlvm:
for node, text in nlvm.iteritems():
print ("Error on node %s: LVM error: %s" %
(node, text[-400:].encode('string_escape')))
retcode |= 1
print "You need to fix these nodes first before fixing instances"
if instances: if instances:
for iname in instances: for iname in instances:
if iname in missing:
continue
op = opcodes.OpActivateInstanceDisks(instance_name=iname) op = opcodes.OpActivateInstanceDisks(instance_name=iname)
try: try:
print "Activating disks for instance '%s'" % iname print "Activating disks for instance '%s'" % iname
...@@ -209,8 +221,26 @@ def VerifyDisks(opts, args): ...@@ -209,8 +221,26 @@ def VerifyDisks(opts, args):
except errors.GenericError, err: except errors.GenericError, err:
nret, msg = FormatError(err) nret, msg = FormatError(err)
retcode |= nret retcode |= nret
print >>sys.stderr, ("Error activating disks for instance %s: %s" % print >> sys.stderr, ("Error activating disks for instance %s: %s" %
(iname, msg)) (iname, msg))
if missing:
for iname, ival in missing.iteritems():
all_missing = utils.all(ival, lambda x: x[0] in nlvm)
if all_missing:
print ("Instance %s cannot be verified as it lives on"
" broken nodes" % iname)
else:
print "Instance %s has missing logical volumes:" % iname
ival.sort()
for node, vol in ival:
if node in nlvm:
print ("\tbroken node %s /dev/xenvg/%s" % (node, vol))
else:
print ("\t%s /dev/xenvg/%s" % (node, vol))
print ("You need to run replace_disks for all the above"
" instances, if this message persist after fixing nodes.")
retcode |= 1
return retcode return retcode
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment