    Fix the watcher with down nodes · 37b77b18
    Iustin Pop authored
    The watcher didn't handle down nodes; fix this by ignoring (in
    secondary node reboot checks) any node that doesn't return a boot ID.
    
    Reviewed-by: imsnah
ganeti-watcher 12.43 KiB
#!/usr/bin/python
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Tool to restart erronously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot.  Run from cron or similar.

"""

import os
import sys
import time
import logging
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
from ganeti import serializer
from ganeti import ssconf
from ganeti import errors
from ganeti import opcodes
from ganeti import logger
from ganeti import cli


MAXTRIES = 5
BAD_STATES = ['ERROR_down']
HELPLESS_STATES = ['ERROR_nodedown']
NOTICE = 'NOTICE'
ERROR = 'ERROR'
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
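
# Together, the keys above describe the layout of the watcher state file
# (constants.WATCHER_STATEFILE).  The serialized content looks roughly like
# the following illustrative example (host names and values are made up):
#
#   {
#     "node": {"node1.example.com": {"bootid": "8b2f34a1-..."}},
#     "instance": {"instance1.example.com": {"restart_count": 2,
#                                             "restart_when": 1199276362.1}}
#   }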


# Global client object
client = None


class NotMasterError(errors.GenericError):
  """Exception raised when this host is not the master."""


def Indent(s, prefix='| '):
  """Indent a piece of text with a given prefix before each line.

  Args:
    s: The string to indent
    prefix: The string to prepend to each line.

  """
  return "%s%s\n" % (prefix, ('\n' + prefix).join(s.splitlines()))


class WatcherState(object):
  """Interface to a state file recording restart attempts.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening an existing
    # file read/write and creating it if missing.  A plain open() would either
    # truncate an existing file or refuse to create a missing one.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    utils.LockFile(self.statefile.fileno())

    try:
      self._data = serializer.Load(self.statefile.read())
    except Exception, msg:
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}

    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file, then unlock and close it.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
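    # close=False makes WriteFile return the new file's descriptor (already
    # locked via the prewrite hook); wrap it again as our open state file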
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    """
    assert bootid

    ndata = self._data["node"]

    if name not in ndata:
      ndata[name] = {}

    ndata[name][KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.

    """
    idata = self._data["instance"]

    if instance.name in idata:
      return idata[instance.name][KEY_RESTART_COUNT]

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Args:
      instance - the instance being restarted

    """
    idata = self._data["instance"]

    if instance.name not in idata:
      inst = idata[instance.name] = {}
    else:
      inst = idata[instance.name]

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running, i.e. remove record.

    Args:
      instance - the instance to remove from books

    This method removes the record for a named instance.

    """
    idata = self._data["instance"]

    if instance.name in idata:
      del idata[instance.name]


class Instance(object):
  """Abstraction for a Virtual Machine instance.

  Methods:
    Restart(): issue a command to restart the represented machine.

  """
  def __init__(self, name, state, autostart):
    self.name = name
    self.state = state
    self.autostart = autostart

  def Restart(self):
    """Encapsulates the start of an instance.

    """
    op = opcodes.OpStartupInstance(instance_name=self.name,
                                   force=False,
                                   extra_args=None)
    cli.SubmitOpCode(op, cl=client)

  def ActivateDisks(self):
    """Encapsulates the activation of all disks of an instance.

    """
    op = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(op, cl=client)


def GetInstanceList(with_secondaries=None):
  """Get a list of instances on this cluster.

  """
  fields = ["name", "status", "admin_state"]

  if with_secondaries is not None:
    fields.append("snodes")

  result = client.QueryInstances([], fields)

  instances = []
  for fields in result:
    if with_secondaries is not None:
      (name, status, autostart, snodes) = fields

      if not snodes:
        continue

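      # for/else: the else branch runs only when the loop finishes without
      # hitting break, i.e. none of the watched nodes is a secondary here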
      for node in with_secondaries:
        if node in snodes:
          break
      else:
        continue

    else:
      (name, status, autostart) = fields

    instances.append(Instance(name, status, autostart))

  return instances


def GetNodeBootIDs():
  """Get a dict mapping nodes to boot IDs.

  """
  result = client.QueryNodes([], ["name", "bootid"])
  return dict([(name, bootid) for name, bootid in result])


class Watcher(object):
  """Encapsulate the logic for restarting erronously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    self.started_instances = set()

  def Run(self):
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    """
    check_nodes = []
    for name, new_id in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        logging.debug("Node %s missing boot id, skipping secondary checks",
                      name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be activated at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    """
    for instance in self.instances:
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
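          # n == MAXTRIES: record this final attempt so later runs fall into
          # the n > MAXTRIES branch above and stay quiet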
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Erro while restarting instance %s", instance.name)

        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
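        # the instance's node is down, so there is nothing we can do; just
        # drop any stale restart record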
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    """
    op = opcodes.OpVerifyDisks()
    result = cli.SubmitOpCode(op, cl=client)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return
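    # the verify-disks result is a sequence whose third element holds the
    # names of instances with offline disks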
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job and wait for it; not optimal, but it spams the
    # job queue less
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)


def ParseOptions():
  """Parse the command line options.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option("-d", "--debug", dest="debug",
                    help="Write all messages to stderr",
                    default=False, action="store_true")
  options, args = parser.parse_args()
  return options, args


def main():
  """Main function.

  """
  global client

  options, args = ParseOptions()

  logger.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                      stderr_logging=options.debug)

  try:
    client = cli.GetClient()

    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)

    watcher.Run()
  except SystemExit:
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError, err:
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception, err:
    logging.error(str(err), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)


if __name__ == '__main__':
  main()