Commit 83e5e26f authored by René Nussbaumer's avatar René Nussbaumer
Browse files

Watcher: Fix endless repair tries for broken secondary



In cases where secondary was offline and not evacuated watcher tried
to activate-disks in an endless manner, but this is useless, as the
secondary is offline and therefore not responding to this approach.

This patch skips activation of the disk if the secondary is bad but
instance up and running.
Signed-off-by: default avatarRené Nussbaumer <rn@google.com>
Reviewed-by: default avatarMichael Hanselmann <hansmi@google.com>
parent 397693d3
......@@ -39,6 +39,7 @@ from optparse import OptionParser
from ganeti import utils
from ganeti import constants
from ganeti import compat
from ganeti import serializer
from ganeti import errors
from ganeti import opcodes
......@@ -390,10 +391,11 @@ class Instance(object):
"""Abstraction for a Virtual Machine instance.
"""
def __init__(self, name, state, autostart):
def __init__(self, name, state, autostart, snodes):
self.name = name
self.state = state
self.autostart = autostart
self.snodes = snodes
def Restart(self):
"""Encapsulates the start of an instance.
......@@ -445,7 +447,7 @@ def GetClusterData():
smap[node] = []
smap[node].append(name)
instances[name] = Instance(name, status, autostart)
instances[name] = Instance(name, status, autostart, snodes)
nodes = dict([(name, (bootid, offline))
for name, bootid, offline in all_results[1]])
......@@ -575,8 +577,20 @@ class Watcher(object):
notepad.RemoveInstance(instance)
logging.info("Restart of %s succeeded", instance.name)
@staticmethod
def VerifyDisks():
def _CheckForOfflineNodes(self, instance):
"""Checks if given instances has any secondary in offline status.
@param instance: The instance object
@return: True if any of the secondary is offline, False otherwise
"""
bootids = []
for node in instance.snodes:
bootids.append(self.bootids[node])
return compat.any(offline for (_, offline) in bootids)
def VerifyDisks(self):
"""Run gnt-cluster verify-disks.
"""
......@@ -587,7 +601,7 @@ class Watcher(object):
if not isinstance(result, (tuple, list)):
logging.error("Can't get a valid result from verify-disks")
return
offline_disk_instances = result[2]
offline_disk_instances = result[1]
if not offline_disk_instances:
# nothing to do
return
......@@ -595,14 +609,23 @@ class Watcher(object):
utils.CommaJoin(offline_disk_instances))
# we submit only one job, and wait for it. not optimal, but spams
# less the job queue
job = [opcodes.OpInstanceActivateDisks(instance_name=name)
for name in offline_disk_instances]
job_id = cli.SendJob(job, cl=client)
job = []
for name in offline_disk_instances:
instance = self.instances[name]
if (instance.state in HELPLESS_STATES or
self._CheckForOfflineNodes(instance)):
logging.info("Skip instance %s because it is in helpless state or has"
" one offline secondary", name)
continue
job.append(opcodes.OpInstanceActivateDisks(instance_name=name))
try:
cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
except Exception: # pylint: disable-msg=W0703
logging.exception("Error while activating disks")
if job:
job_id = cli.SendJob(job, cl=client)
try:
cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
except Exception: # pylint: disable-msg=W0703
logging.exception("Error while activating disks")
def OpenStateFile(path):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment