ganeti-watcher 12.3 KB
Newer Older
Iustin Pop's avatar
Iustin Pop committed
1
2
3
#!/usr/bin/python
#

4
# Copyright (C) 2006, 2007, 2008 Google Inc.
Iustin Pop's avatar
Iustin Pop committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Tool to restart erroneously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot.  Run from cron or similar.

28
"""
Iustin Pop's avatar
Iustin Pop committed
29
30
31

import os
import sys
32
import re
Iustin Pop's avatar
Iustin Pop committed
33
34
35
import time
import fcntl
import errno
36
import logging
Iustin Pop's avatar
Iustin Pop committed
37
38
39
40
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
41
from ganeti import serializer
42
from ganeti import ssconf
43
from ganeti import errors
44
from ganeti import logger
Iustin Pop's avatar
Iustin Pop committed
45
46


47
48
49
50
51
# Maximum number of restart attempts made for a single instance before the
# watcher gives up on it (see Watcher.CheckInstances).
MAXTRIES = 5
# Instance states for which a restart is attempted.
BAD_STATES = ['stopped']
# Instance states for which no restart is attempted; any recorded restart
# attempts for such an instance are dropped.
HELPLESS_STATES = ['(node down)']
# NOTE(review): NOTICE and ERROR are not referenced in the visible part of
# this file -- presumably leftover message prefixes; confirm before removing.
NOTICE = 'NOTICE'
ERROR = 'ERROR'
52
53
54
# Keys used inside the per-instance and per-node records of the watcher
# state file (see WatcherState).
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
55
56


57
class NotMasterError(errors.GenericError):
  """Signals that the watcher was started on a node other than the master.

  Raised by Watcher.__init__ when the configured master node name differs
  from the name of the local host.

  """
Iustin Pop's avatar
Iustin Pop committed
59
60
61
62
63
64
65
66


def Indent(s, prefix='| '):
  """Put a prefix in front of every line of a piece of text.

  Args:
    s: the text to indent
    prefix: the string placed in front of each line

  Returns:
    The prefixed lines joined by newlines, followed by a final newline.
    An empty input yields just the prefix and a newline.

  """
  line_separator = '\n' + prefix
  indented = line_separator.join(s.splitlines())
  return "%s%s\n" % (prefix, indented)


def DoCmd(cmd):
  """Run a command through utils.RunCmd and insist that it succeeds.

  Args:
    cmd: the command to run, handed unchanged to utils.RunCmd.

  Returns:
    The result object produced by utils.RunCmd.

  Raises:
    errors.CommandError: if the result is flagged as failed; the message
      contains the command, the failure reason, stdout and stderr.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return result

  # Build a verbose, indented report so the log shows what went wrong
  details = (repr(cmd),
             Indent(result.fail_reason),
             Indent(result.stdout),
             Indent(result.stderr))
  raise errors.CommandError("Command %s failed:\n%s\nstdout:\n%sstderr:\n%s" %
                            details)


94
95
96
97
98
99
100
101
def LockFile(fd):
  """Locks a file using POSIX locks.

  """
  try:
    fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
  except IOError, err:
    if err.errno == errno.EAGAIN:
102
      raise errors.LockError("File already locked")
103
104
105
    raise


106
class WatcherState(object):
Iustin Pop's avatar
Iustin Pop committed
107
108
109
110
  """Interface to a state file recording restart attempts.

  """
  def __init__(self):
111
112
    """Open, lock, read and parse the file.

113
    Raises exception on lock contention.
114
115

    """
Iustin Pop's avatar
Iustin Pop committed
116
117
118
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
119
120
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')
Iustin Pop's avatar
Iustin Pop committed
121

122
    LockFile(self.statefile.fileno())
Iustin Pop's avatar
Iustin Pop committed
123

124
    try:
125
      self._data = serializer.Load(self.statefile.read())
126
127
    except Exception, msg:
      # Ignore errors while loading the file and treat it as empty
128
      self._data = {}
129
130
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)
131

132
133
134
135
    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}
136

137
138
    self._orig_data = self._data.copy()

139
140
  def Save(self):
    """Save state to file, then unlock and close it.
141
142

    """
143
144
    assert self.statefile

145
146
147
148
149
    if self._orig_data == self._data:
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

150
151
152
    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
153
                         data=serializer.Dump(self._data),
154
155
                         prewrite=LockFile, close=False)
    self.statefile = os.fdopen(fd, 'w+')
156

157
  def Close(self):
158
159
160
161
162
    """Unlock configuration file and close it.

    """
    assert self.statefile

163
    # Files are automatically unlocked when closing them
164
165
166
167
168
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.
Iustin Pop's avatar
Iustin Pop committed
169

170
    """
171
    ndata = self._data["node"]
172

173
174
    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
175
176
177
178
179
180
181
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    """
    assert bootid
Iustin Pop's avatar
Iustin Pop committed
182

183
    ndata = self._data["node"]
Iustin Pop's avatar
Iustin Pop committed
184

185
186
187
    if name not in ndata:
      ndata[name] = {}

188
    ndata[name][KEY_BOOT_ID] = bootid
189
190

  def NumberOfRestartAttempts(self, instance):
Iustin Pop's avatar
Iustin Pop committed
191
192
193
194
    """Returns number of previous restart attempts.

    Args:
      instance - the instance to look up.
195

Iustin Pop's avatar
Iustin Pop committed
196
    """
197
    idata = self._data["instance"]
Iustin Pop's avatar
Iustin Pop committed
198

199
    if instance.name in idata:
200
      return idata[instance.name][KEY_RESTART_COUNT]
Iustin Pop's avatar
Iustin Pop committed
201
202
203

    return 0

204
  def RecordRestartAttempt(self, instance):
Iustin Pop's avatar
Iustin Pop committed
205
206
207
208
    """Record a restart attempt.

    Args:
      instance - the instance being restarted
209

Iustin Pop's avatar
Iustin Pop committed
210
    """
211
    idata = self._data["instance"]
Iustin Pop's avatar
Iustin Pop committed
212

213
214
215
216
    if instance.name not in idata:
      inst = idata[instance.name] = {}
    else:
      inst = idata[instance.name]
Iustin Pop's avatar
Iustin Pop committed
217

218
219
    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
Iustin Pop's avatar
Iustin Pop committed
220

221
  def RemoveInstance(self, instance):
222
    """Update state to reflect that a machine is running, i.e. remove record.
Iustin Pop's avatar
Iustin Pop committed
223
224
225
226

    Args:
      instance - the instance to remove from books

227
228
    This method removes the record for a named instance.

Iustin Pop's avatar
Iustin Pop committed
229
    """
230
    idata = self._data["instance"]
Iustin Pop's avatar
Iustin Pop committed
231

232
233
    if instance.name in idata:
      del idata[instance.name]
Iustin Pop's avatar
Iustin Pop committed
234
235
236
237
238
239
240


class Instance(object):
  """A virtual machine instance as seen by the watcher.

  Attributes:
    name: the instance name
    state: the state string reported for the instance
    autostart: whether the instance should be handled automatically

  Methods:
    Restart(): issue a command to restart the represented machine.
    ActivateDisks(): issue a command to activate the disks of the machine.

  """
  def __init__(self, name, state, autostart):
    """Store the attributes describing the instance.

    """
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Start the instance by running "gnt-instance startup".

    """
    startup_cmd = ['gnt-instance', 'startup', '--lock-retries=15', self.name]
    DoCmd(startup_cmd)

  def ActivateDisks(self):
    """Activate all disks of the instance via "gnt-instance activate-disks".

    """
    activate_cmd = ['gnt-instance', 'activate-disks', '--lock-retries=15',
                    self.name]
    DoCmd(activate_cmd)

Iustin Pop's avatar
Iustin Pop committed
260

261
262
def _RunListCmd(cmd):
  """Run a command and yield each line of its output split into fields.

  Args:
    cmd: the command to run through DoCmd

  Yields:
    One list of strings per line of standard output, obtained by splitting
    the line on ':'.

  """
  output = DoCmd(cmd).stdout
  for row in output.splitlines():
    yield row.split(':')
Iustin Pop's avatar
Iustin Pop committed
267
268


269
270
271
272
273
274
275
276
def GetInstanceList(with_secondaries=None):
  """Return the instances of this cluster as Instance objects.

  Args:
    with_secondaries: if not None, a collection of node names; only
      instances having at least one of these nodes among their secondary
      nodes are returned.

  Returns:
    A list of Instance objects built from the output of
    "gnt-instance list".  An instance is marked autostart unless its
    admin_state field is "no".

  """
  filter_by_snode = with_secondaries is not None

  columns = 'name,oper_state,admin_state'
  if filter_by_snode:
    columns += ',snodes'

  cmd = ['gnt-instance', 'list', '--lock-retries=15', '--no-headers',
         '--separator=:', '-o', columns]

  instances = []
  for row in _RunListCmd(cmd):
    if filter_by_snode:
      (name, status, autostart, snodes) = row

      # "-" is presumably how an empty secondary node list is printed;
      # such instances can never match
      if snodes == "-":
        continue

      secondaries = snodes.split(',')
      matched = False
      for node in with_secondaries:
        if node in secondaries:
          matched = True
          break
      if not matched:
        continue
    else:
      (name, status, autostart) = row

    instances.append(Instance(name, status, autostart != "no"))

  return instances


def GetNodeBootIDs():
  """Query the boot ID of every node.

  Returns:
    A dict mapping each node name to its boot ID, as printed by
    "gnt-node list".

  """
  list_cmd = ['gnt-node', 'list', '--lock-retries=15', '--no-headers',
              '--separator=:', '-o', 'name,bootid']

  boot_ids = {}
  for (node_name, boot_id) in _RunListCmd(list_cmd):
    boot_ids[node_name] = boot_id

  return boot_ids
Iustin Pop's avatar
Iustin Pop committed
319
320


321
class Watcher(object):
Iustin Pop's avatar
Iustin Pop committed
322
323
324
325
326
  """Encapsulate the logic for restarting erronously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.
327

Iustin Pop's avatar
Iustin Pop committed
328
329
  """
  def __init__(self):
330
331
    sstore = ssconf.SimpleStore()
    master = sstore.GetMasterNode()
332
    if master != utils.HostInfo().name:
333
      raise NotMasterError("This is not the master node")
334
335
    self.instances = GetInstanceList()
    self.bootids = GetNodeBootIDs()
336
    self.started_instances = set()
Iustin Pop's avatar
Iustin Pop committed
337
338

  def Run(self):
339
    notepad = WatcherState()
340
341
342
343
344
345
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      notepad.Save()
346
347
348

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.
349

Iustin Pop's avatar
Iustin Pop committed
350
    """
351
352
353
354
355
356
357
358
359
360
361
    check_nodes = []
    for name, id in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if old != id:
        # Node's boot ID has changed, proably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
362
        if not instance.autostart:
363
364
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
365
          continue
366
367
368
369
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
370
        try:
371
          logging.info("Activating disks for instance %s", instance.name)
372
          instance.ActivateDisks()
373
374
        except Error, err:
          logging.error(str(err), exc_info=True)
375
376
377
378

      # Keep changed boot IDs
      for name in check_nodes:
        notepad.SetNodeBootID(name, self.bootids[name])
Iustin Pop's avatar
Iustin Pop committed
379

380
381
382
383
  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    """
Iustin Pop's avatar
Iustin Pop committed
384
    for instance in self.instances:
385
386
387
388
      # Don't care about manually stopped instances
      if not instance.autostart:
        continue

Iustin Pop's avatar
Iustin Pop committed
389
      if instance.state in BAD_STATES:
390
        n = notepad.NumberOfRestartAttempts(instance)
Iustin Pop's avatar
Iustin Pop committed
391
392
393
394
395
396
397

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
398
          notepad.RecordRestartAttempt(instance)
399
400
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
Iustin Pop's avatar
Iustin Pop committed
401
402
          continue
        try:
403
404
          logging.info("Restarting %s%s",
                        instance.name, last)
Iustin Pop's avatar
Iustin Pop committed
405
          instance.Restart()
406
          self.started_instances.add(instance.name)
407
408
        except Error, err:
          logging.error(str(err), exc_info=True)
Iustin Pop's avatar
Iustin Pop committed
409

410
        notepad.RecordRestartAttempt(instance)
Iustin Pop's avatar
Iustin Pop committed
411
      elif instance.state in HELPLESS_STATES:
412
413
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
Iustin Pop's avatar
Iustin Pop committed
414
      else:
415
416
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
417
          logging.info("Restart of %s succeeded", instance.name)
Iustin Pop's avatar
Iustin Pop committed
418

419
420
421
422
423
424
  def VerifyDisks(self):
    """Run gnt-cluster verify-disks.

    """
    result = DoCmd(['gnt-cluster', 'verify-disks', '--lock-retries=15'])
    if result.output:
425
      logging.info(result.output)
Iustin Pop's avatar
Iustin Pop committed
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440


def ParseOptions():
  """Parse the command line of the watcher.

  The only option defined is -d/--debug, a flag stored in options.debug
  that defaults to False.

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)
  parser.add_option("-d", "--debug", dest="debug",
                    action="store_true", default=False,
                    help="Write all messages to stderr")

  (options, args) = parser.parse_args()
  return (options, args)


def main():
  """Main function.

  """
  options, args = ParseOptions()

453
  logger.SetupDaemon(constants.LOG_WATCHER, debug=options.debug)
Iustin Pop's avatar
Iustin Pop committed
454
455

  try:
456
457
458
459
460
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)
461
    watcher.Run()
462
463
  except SystemExit:
    raise
464
  except NotMasterError:
465
    logging.debug("Not master, exiting")
466
    sys.exit(constants.EXIT_NOTMASTER)
467
  except errors.ResolverError, err:
468
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
469
    sys.exit(constants.EXIT_NODESETUP_ERROR)
470
471
472
  except Exception, err:
    logging.error(str(err), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
Iustin Pop's avatar
Iustin Pop committed
473

474

Iustin Pop's avatar
Iustin Pop committed
475
476
# Run the watcher only when executed as a script, not when imported
if __name__ == '__main__':
  main()