#!/usr/bin/python
#

# Copyright (C) 2006, 2007, 2008 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot.  Run from cron or similar.

"""
Iustin Pop's avatar
Iustin Pop committed
29 30 31 32

import os
import sys
import time
33
import logging
Iustin Pop's avatar
Iustin Pop committed
34 35 36 37
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
38
from ganeti import serializer
39
from ganeti import errors
40 41
from ganeti import opcodes
from ganeti import cli
Iustin Pop's avatar
Iustin Pop committed
42 43


44
# Maximum number of restart attempts made for a single instance
MAXTRIES = 5
# Instance states for which the watcher attempts a restart
BAD_STATES = ['ERROR_down']
# Instance states for which the watcher does not attempt a restart
HELPLESS_STATES = ['ERROR_nodedown', 'ERROR_nodeoffline']
NOTICE = 'NOTICE'
ERROR = 'ERROR'
# Keys used inside the per-instance and per-node state file records
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object
client = None


58
class NotMasterError(errors.GenericError):
  """Signals that the local host is not the cluster master node."""
Iustin Pop's avatar
Iustin Pop committed
60 61 62 63 64


def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a text.

  @param s: the text to indent
  @param prefix: the marker placed in front of each line
  @return: the indented text, always terminated by a newline

  """
  separator = '\n' + prefix
  body = separator.join(s.splitlines())
  return prefix + body + "\n"


72
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is a dictionary with two top-level keys: "instance", which
  maps instance names to their restart bookkeeping (restart count and time
  of the last attempt), and "node", which maps node names to the last
  recorded boot ID.

  """
  def __init__(self):
    """Open, lock, read and parse the file.

    Raises exception on lock contention.

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')

    utils.LockFile(self.statefile.fileno())

    try:
      self._data = serializer.Load(self.statefile.read())
    except Exception:
      # Ignore errors while loading the file and treat it as empty.  The
      # exception is fetched through sys.exc_info() so that this clause
      # parses on every Python version.
      msg = sys.exc_info()[1]
      self._data = {}
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)

    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}

    # Serialized snapshot used by Save() to detect whether anything changed
    self._orig_data = serializer.Dump(self._data)

  def Save(self):
    """Save state to file.

    If the data did not change since it was loaded, only the timestamps
    of the state file are updated.  Otherwise the new contents are written
    via utils.WriteFile, which is asked to keep the file open, and the
    returned descriptor replaces self.statefile.  The file is not closed
    here; use Close() for that.

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @param name: the node name to look up
    @return: the recorded boot ID, or None if none was recorded

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @param name: the node name
    @param bootid: the boot ID to record; must not be empty

    """
    assert bootid

    ndata = self._data["node"]

    if name not in ndata:
      ndata[name] = {}

    ndata[name][KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance):
    """Returns number of previous restart attempts.

    @type instance: L{Instance}
    @param instance: the instance to look up
    @return: the recorded restart count, or 0 if there is no record

    """
    idata = self._data["instance"]

    if instance.name in idata:
      # A record without a counter is treated as "no attempts yet", the
      # same way RecordRestartAttempt treats it
      return idata[instance.name].get(KEY_RESTART_COUNT, 0)

    return 0

  def RecordRestartAttempt(self, instance):
    """Record a restart attempt.

    Stores the current time and increments the restart counter.

    @type instance: L{Instance}
    @param instance: the instance being restarted

    """
    idata = self._data["instance"]

    if instance.name not in idata:
      inst = idata[instance.name] = {}
    else:
      inst = idata[instance.name]

    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

  def RemoveInstance(self, instance):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances).

    @type instance: L{Instance}
    @param instance: the instance to remove from books

    """
    idata = self._data["instance"]

    if instance.name in idata:
      del idata[instance.name]
Iustin Pop's avatar
Iustin Pop committed
202 203 204 205 206 207


class Instance(object):
  """A virtual machine instance as handled by the watcher.

  Stores the name, state and autostart flag passed in, and offers
  helpers which submit the opcodes acting on the instance.

  """
  def __init__(self, name, state, autostart):
    (self.name, self.state, self.autostart) = (name, state, autostart)

  def Restart(self):
    """Submit the opcode which starts this instance (without force).

    """
    startup = opcodes.OpStartupInstance(instance_name=self.name, force=False)
    cli.SubmitOpCode(startup, cl=client)

  def ActivateDisks(self):
    """Submit the opcode which activates all disks of this instance.

    """
    activate = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activate, cl=client)
Iustin Pop's avatar
Iustin Pop committed
226 227


228 229 230 231
def GetInstanceList(with_secondaries=None):
  """Query the cluster for its instances.

  @param with_secondaries: optional list of node names; when given, only
      instances having at least one of these nodes among their secondary
      nodes are returned
  @return: list of L{Instance} objects

  """
  filter_by_secondary = with_secondaries is not None

  query_fields = ["name", "status", "admin_state"]
  if filter_by_secondary:
    query_fields.append("snodes")

  instances = []
  for row in client.QueryInstances([], query_fields, True):
    if filter_by_secondary:
      (name, status, autostart, snodes) = row

      # Instances without secondary nodes can never match
      if not snodes:
        continue

      # Keep the instance only if a requested node is one of its secondaries
      matched = False
      for node in with_secondaries:
        if node in snodes:
          matched = True
          break
      if not matched:
        continue
    else:
      (name, status, autostart) = row

    instances.append(Instance(name, status, autostart))

  return instances


def GetNodeBootIDs():
  """Map every node name to its boot ID and offline flag.

  @return: dict of node name to a (bootid, offline) tuple

  """
  node_rows = client.QueryNodes([], ["name", "bootid", "offline"], True)

  boot_ids = {}
  for (name, bootid, offline) in node_rows:
    boot_ids[name] = (bootid, offline)
  return boot_ids
Iustin Pop's avatar
Iustin Pop committed
267 268


269
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self, opts, notepad):
    """Collect the cluster data needed for one watcher run.

    @param opts: the parsed command line options; Run() reads opts.job_age
    @type notepad: L{WatcherState}
    @param notepad: the state object used for bookkeeping
    @raise NotMasterError: if the local host is not the master node

    """
    self.notepad = notepad
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
    # Names of the instances for which a restart was submitted in this run
    self.started_instances = set()
    self.opts = opts

  def Run(self):
    """Watcher run sequence.

    Archives old jobs, restarts downed instances, re-activates disks of
    instances whose secondary nodes changed boot ID and finally runs the
    cluster-wide disk verification.

    """
    notepad = self.notepad
    self.ArchiveJobs(self.opts.job_age)
    self.CheckInstances(notepad)
    self.CheckDisks(notepad)
    self.VerifyDisks()

  def ArchiveJobs(self, age):
    """Archive old jobs.

    @param age: the age passed on to the client's AutoArchiveJobs call

    """
    arch_count, left_count = client.AutoArchiveJobs(age)
    # Let the logging module do the formatting, so that nothing is built
    # when debug messages are disabled
    logging.debug("Archived %s jobs, left %s", arch_count, left_count)

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    For every node whose current boot ID differs from the recorded one, the
    disks of the autostart instances having that node as a secondary are
    activated; afterwards the new boot IDs are recorded.

    @type notepad: L{WatcherState}
    @param notepad: the state object holding the recorded boot IDs

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    @type notepad: L{WatcherState}
    @param notepad: the state object recording the restart attempts

    """
    for instance in self.instances:
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Recording one more attempt pushes the counter above MAXTRIES,
          # so that the following runs take the quiet branch above
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # The attempt is counted whether or not the restart succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    The third element of the verify-disks result is taken as the list of
    instance names whose disks need activation; a single job activating
    all of them is submitted and waited for.

    """
    op = opcodes.OpVerifyDisks()
    result = cli.SubmitOpCode(op, cl=client)
    # The result must be a sequence long enough to hold the instance list
    if not isinstance(result, (tuple, list)) or len(result) < 3:
      logging.error("Can't get a valid result from verify-disks")
      return
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
Iustin Pop's avatar
Iustin Pop committed
403 404 405 406 407


def ParseOptions():
  """Parse the command line options.

  The job age option is passed through cli.ParseTimespec before the
  options are returned.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_string = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_string)

  parser.add_option("-d", "--debug", dest="debug", action="store_true",
                    default=False, help="Write all messages to stderr")
  parser.add_option("-A", "--job-age", dest="job_age", default=6*3600,
                    help=("Autoarchive jobs older than this age (default"
                          " 6 hours)"))

  (options, args) = parser.parse_args()
  options.job_age = cli.ParseTimespec(options.job_age)
  return (options, args)


def main():
  """Main function.

  Parses the options, sets up logging, then runs the watcher with the
  state file opened; the state is saved whatever the outcome of the run.
  Failures are mapped to the exit codes defined in the constants module.

  """
  global client

  (options, _) = ParseOptions()

  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                     stderr_logging=options.debug)

  try:
    notepad = WatcherState()
    try:
      client = cli.GetClient()

      try:
        watcher = Watcher(options, notepad)
      except errors.ConfigurationError:
        # Just exit if there's no configuration
        sys.exit(constants.EXIT_SUCCESS)

      watcher.Run()
    finally:
      notepad.Save()
  except SystemExit:
    # Exit requests raised above must not be reported as failures
    raise
  except NotMasterError:
    logging.debug("Not master, exiting")
    sys.exit(constants.EXIT_NOTMASTER)
  except errors.ResolverError:
    resolver_error = sys.exc_info()[1]
    logging.error("Cannot resolve hostname '%s', exiting.",
                  resolver_error.args[0])
    sys.exit(constants.EXIT_NODESETUP_ERROR)
  except Exception:
    failure = sys.exc_info()[1]
    logging.error(str(failure), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
Iustin Pop's avatar
Iustin Pop committed
463

464

Iustin Pop's avatar
Iustin Pop committed
465 466
# Run the watcher only when this file is executed as a script
if __name__ == '__main__':
  main()