ganeti-watcher 12.5 KB
Newer Older
Iustin Pop's avatar
Iustin Pop committed
1
2
3
#!/usr/bin/python
#

4
# Copyright (C) 2006, 2007, 2008 Google Inc.
Iustin Pop's avatar
Iustin Pop committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Tool to restart erronously downed virtual machines.

This program and set of classes implement a watchdog to restart
virtual machines in a Ganeti cluster that have crashed or been killed
by a node reboot.  Run from cron or similar.

28
"""
Iustin Pop's avatar
Iustin Pop committed
29
30
31
32

import os
import sys
import time
33
import logging
Iustin Pop's avatar
Iustin Pop committed
34
35
36
37
from optparse import OptionParser

from ganeti import utils
from ganeti import constants
38
from ganeti import serializer
39
from ganeti import errors
40
41
from ganeti import opcodes
from ganeti import cli
Iustin Pop's avatar
Iustin Pop committed
42
43


44
# Number of restart attempts made for a down instance before giving up
MAXTRIES = 5

# Instance states that trigger a restart attempt
BAD_STATES = ["ERROR_down"]
# Instance states for which no restart is attempted (see CheckInstances)
HELPLESS_STATES = ["ERROR_nodedown", "ERROR_nodeoffline"]

NOTICE = "NOTICE"
ERROR = "ERROR"

# Keys used inside the per-instance and per-node records of the state file
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"


# Global client object; assigned in main() before any query is made
client = None


58
class NotMasterError(errors.GenericError):
  """Signals that the watcher was started on a node that is not the master.

  """
Iustin Pop's avatar
Iustin Pop committed
60
61
62
63
64


def Indent(s, prefix='| '):
  """Prefix every line of a text and terminate it with a newline.

  @param s: the text whose lines should be prefixed
  @param prefix: the string put in front of each line
  @return: the prefixed text, always ending in a single newline

  """
  # Joining with "newline + prefix" prefixes every line but the first one,
  # which gets its prefix when the final string is assembled.
  separator = '\n' + prefix
  body = separator.join(s.splitlines())
  return "%s%s\n" % (prefix, body)


72
class WatcherState(object):
Iustin Pop's avatar
Iustin Pop committed
73
74
75
76
  """Interface to a state file recording restart attempts.

  """
  def __init__(self):
77
78
    """Open, lock, read and parse the file.

79
    Raises exception on lock contention.
80
81

    """
Iustin Pop's avatar
Iustin Pop committed
82
83
84
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing.  Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
85
86
    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
    self.statefile = os.fdopen(fd, 'w+')
Iustin Pop's avatar
Iustin Pop committed
87

88
    utils.LockFile(self.statefile.fileno())
Iustin Pop's avatar
Iustin Pop committed
89

90
    try:
91
      self._data = serializer.Load(self.statefile.read())
92
93
    except Exception, msg:
      # Ignore errors while loading the file and treat it as empty
94
      self._data = {}
95
96
      logging.warning(("Empty or invalid state file. Using defaults."
                       " Error message: %s"), msg)
97

98
99
100
101
    if "instance" not in self._data:
      self._data["instance"] = {}
    if "node" not in self._data:
      self._data["node"] = {}
102

Iustin Pop's avatar
Iustin Pop committed
103
    self._orig_data = serializer.Dump(self._data)
104

105
106
  def Save(self):
    """Save state to file, then unlock and close it.
107
108

    """
109
110
    assert self.statefile

Iustin Pop's avatar
Iustin Pop committed
111
112
    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
113
114
115
116
      logging.debug("Data didn't change, just touching status file")
      os.utime(constants.WATCHER_STATEFILE, None)
      return

117
118
119
    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(constants.WATCHER_STATEFILE,
Iustin Pop's avatar
Iustin Pop committed
120
                         data=serialized_form,
121
                         prewrite=utils.LockFile, close=False)
122
    self.statefile = os.fdopen(fd, 'w+')
123

124
  def Close(self):
125
126
127
128
129
    """Unlock configuration file and close it.

    """
    assert self.statefile

130
    # Files are automatically unlocked when closing them
131
132
133
134
135
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.
Iustin Pop's avatar
Iustin Pop committed
136

137
    """
138
    ndata = self._data["node"]
139

140
141
    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
142
143
144
145
146
147
148
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    """
    assert bootid
Iustin Pop's avatar
Iustin Pop committed
149

150
    ndata = self._data["node"]
Iustin Pop's avatar
Iustin Pop committed
151

152
153
154
    if name not in ndata:
      ndata[name] = {}

155
    ndata[name][KEY_BOOT_ID] = bootid
156
157

  def NumberOfRestartAttempts(self, instance):
Iustin Pop's avatar
Iustin Pop committed
158
159
    """Returns number of previous restart attempts.

Iustin Pop's avatar
Iustin Pop committed
160
161
    @type instance: L{Instance}
    @param instance: the instance to look up
162

Iustin Pop's avatar
Iustin Pop committed
163
    """
164
    idata = self._data["instance"]
Iustin Pop's avatar
Iustin Pop committed
165

166
    if instance.name in idata:
167
      return idata[instance.name][KEY_RESTART_COUNT]
Iustin Pop's avatar
Iustin Pop committed
168
169
170

    return 0

171
  def RecordRestartAttempt(self, instance):
Iustin Pop's avatar
Iustin Pop committed
172
173
    """Record a restart attempt.

Iustin Pop's avatar
Iustin Pop committed
174
175
    @type instance: L{Instance}
    @param instance: the instance being restarted
176

Iustin Pop's avatar
Iustin Pop committed
177
    """
178
    idata = self._data["instance"]
Iustin Pop's avatar
Iustin Pop committed
179

180
181
182
183
    if instance.name not in idata:
      inst = idata[instance.name] = {}
    else:
      inst = idata[instance.name]
Iustin Pop's avatar
Iustin Pop committed
184

185
186
    inst[KEY_RESTART_WHEN] = time.time()
    inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1
Iustin Pop's avatar
Iustin Pop committed
187

188
  def RemoveInstance(self, instance):
Iustin Pop's avatar
Iustin Pop committed
189
    """Update state to reflect that a machine is running.
Iustin Pop's avatar
Iustin Pop committed
190

Iustin Pop's avatar
Iustin Pop committed
191
192
    This method removes the record for a named instance (as we only
    track down instances).
Iustin Pop's avatar
Iustin Pop committed
193

Iustin Pop's avatar
Iustin Pop committed
194
195
    @type instance: L{Instance}
    @param instance: the instance to remove from books
196

Iustin Pop's avatar
Iustin Pop committed
197
    """
198
    idata = self._data["instance"]
Iustin Pop's avatar
Iustin Pop committed
199

200
201
    if instance.name in idata:
      del idata[instance.name]
Iustin Pop's avatar
Iustin Pop committed
202
203
204
205
206
207


class Instance(object):
  """Lightweight handle on one virtual machine instance.

  """
  def __init__(self, name, state, autostart):
    """Remember the instance's name, status and autostart flag.

    """
    self.name, self.state, self.autostart = name, state, autostart

  def Restart(self):
    """Submit a (non-forced) startup opcode for this instance.

    """
    startup = opcodes.OpStartupInstance(instance_name=self.name,
                                        force=False,
                                        extra_args=None)
    cli.SubmitOpCode(startup, cl=client)

  def ActivateDisks(self):
    """Submit a disk activation opcode for this instance.

    """
    activation = opcodes.OpActivateInstanceDisks(instance_name=self.name)
    cli.SubmitOpCode(activation, cl=client)
Iustin Pop's avatar
Iustin Pop committed
228
229


230
231
232
233
def GetInstanceList(with_secondaries=None):
  """Query the cluster and wrap each instance in an L{Instance} object.

  @param with_secondaries: if not None, a list of node names; only
      instances having at least one of these nodes among their
      secondary nodes are returned
  @return: list of L{Instance} objects

  """
  filter_by_secondary = with_secondaries is not None

  query_fields = ["name", "status", "admin_state"]
  if filter_by_secondary:
    query_fields.append("snodes")

  found = []
  for row in client.QueryInstances([], query_fields):
    if filter_by_secondary:
      (name, status, autostart, snodes) = row

      # Instances without secondaries can never match
      if not snodes:
        continue

      # Keep the instance only if a requested node is one of its secondaries
      matching = [node for node in with_secondaries if node in snodes]
      if not matching:
        continue
    else:
      (name, status, autostart) = row

    found.append(Instance(name, status, autostart))

  return found


def GetNodeBootIDs():
  """Query all nodes for their boot ID and offline flag.

  @return: dict mapping each node name to a C{(bootid, offline)} tuple

  """
  boot_ids = {}
  for (name, bootid, offline) in client.QueryNodes([], ["name", "bootid",
                                                        "offline"]):
    boot_ids[name] = (bootid, offline)
  return boot_ids
Iustin Pop's avatar
Iustin Pop committed
269
270


271
class Watcher(object):
  """Encapsulate the logic for restarting erroneously halted virtual machines.

  The calling program should periodically instantiate me and call Run().
  This will traverse the list of instances, and make up to MAXTRIES attempts
  to restart machines that are down.

  """
  def __init__(self):
    """Verify we run on the master node and fetch the cluster state.

    @raise NotMasterError: if the configured master node differs from
        the name reported by utils.HostInfo()

    """
    master = client.QueryConfigValues(["master_node"])[0]
    if master != utils.HostInfo().name:
      raise NotMasterError("This is not the master node")

    self.instances = GetInstanceList()
    # Maps node name to a (bootid, offline) tuple
    self.bootids = GetNodeBootIDs()
    # Names of instances for which a restart was submitted in this run
    self.started_instances = set()

  def Run(self):
    """Run all checks; the state file is saved even if a check fails.

    """
    notepad = WatcherState()
    try:
      self.CheckInstances(notepad)
      self.CheckDisks(notepad)
      self.VerifyDisks()
    finally:
      notepad.Save()

  def CheckDisks(self, notepad):
    """Check all nodes for restarted ones.

    A node whose current boot ID differs from the recorded one is
    considered restarted; disks are then activated for the autostart
    instances having such a node as secondary, and the new boot ID is
    recorded.

    @type notepad: L{WatcherState}
    @param notepad: the state used to look up and record boot IDs

    """
    check_nodes = []
    for name, (new_id, offline) in self.bootids.iteritems():
      old = notepad.GetNodeBootID(name)
      if new_id is None:
        # Bad node, not returning a boot id
        if not offline:
          logging.debug("Node %s missing boot id, skipping secondary checks",
                        name)
        continue
      if old != new_id:
        # Node's boot ID has changed, probably through a reboot.
        check_nodes.append(name)

    if check_nodes:
      # Activate disks for all instances with any of the checked nodes as a
      # secondary node.
      for instance in GetInstanceList(with_secondaries=check_nodes):
        if not instance.autostart:
          logging.info(("Skipping disk activation for non-autostart"
                        " instance %s"), instance.name)
          continue
        if instance.name in self.started_instances:
          # we already tried to start the instance, which should have
          # activated its drives (if they can be at all)
          continue
        try:
          logging.info("Activating disks for instance %s", instance.name)
          instance.ActivateDisks()
        except Exception:
          logging.exception("Error while activating disks for instance %s",
                            instance.name)

      # Keep changed boot IDs
      for name in check_nodes:
        # self.bootids holds (bootid, offline) tuples; only the boot ID
        # itself may be stored, as it is compared against the bare boot ID
        # above.  Storing the whole tuple would make every node look
        # rebooted on each run.
        notepad.SetNodeBootID(name, self.bootids[name][0])

  def CheckInstances(self, notepad):
    """Make a pass over the list of instances, restarting downed ones.

    @type notepad: L{WatcherState}
    @param notepad: the state used to track restart attempts

    """
    for instance in self.instances:
      if instance.state in BAD_STATES:
        n = notepad.NumberOfRestartAttempts(instance)

        if n > MAXTRIES:
          # stay quiet.
          continue
        elif n < MAXTRIES:
          last = " (Attempt #%d)" % (n + 1)
        else:
          # Exactly MAXTRIES attempts so far: bump the counter once more so
          # that later runs take the quiet branch above, and report failure
          notepad.RecordRestartAttempt(instance)
          logging.error("Could not restart %s after %d attempts, giving up",
                        instance.name, MAXTRIES)
          continue
        try:
          logging.info("Restarting %s%s",
                        instance.name, last)
          instance.Restart()
          self.started_instances.add(instance.name)
        except Exception:
          logging.exception("Error while restarting instance %s",
                            instance.name)

        # The attempt is counted whether or not the restart succeeded
        notepad.RecordRestartAttempt(instance)
      elif instance.state in HELPLESS_STATES:
        # No restart is attempted in these states; just forget old attempts
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
      else:
        if notepad.NumberOfRestartAttempts(instance):
          notepad.RemoveInstance(instance)
          logging.info("Restart of %s succeeded", instance.name)

  @staticmethod
  def VerifyDisks():
    """Run gnt-cluster verify-disks.

    Submits a verify-disks opcode and, if it reports instances with
    offline disks, activates the disks of all of them in a single job.

    """
    op = opcodes.OpVerifyDisks()
    result = cli.SubmitOpCode(op, cl=client)
    if not isinstance(result, (tuple, list)):
      logging.error("Can't get a valid result from verify-disks")
      return
    # NOTE(review): the third result element is taken to be the list of
    # instance names with offline disks -- confirm against OpVerifyDisks
    offline_disk_instances = result[2]
    if not offline_disk_instances:
      # nothing to do
      return
    logging.debug("Will activate disks for instances %s",
                  ", ".join(offline_disk_instances))
    # we submit only one job, and wait for it. not optimal, but spams
    # less the job queue
    job = [opcodes.OpActivateInstanceDisks(instance_name=name)
           for name in offline_disk_instances]
    job_id = cli.SendJob(job, cl=client)

    cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
Iustin Pop's avatar
Iustin Pop committed
394
395
396
397
398


def ParseOptions():
  """Build the option parser and parse the command line.

  @return: (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti cluster watcher",
                        usage="%prog [-d]",
                        version=version_text)

  parser.add_option("-d", "--debug",
                    action="store_true", dest="debug", default=False,
                    help="Write all messages to stderr")

  (parsed_options, remaining_args) = parser.parse_args()
  return parsed_options, remaining_args


def main():
  """Main function.

  """
418
419
  global client

Iustin Pop's avatar
Iustin Pop committed
420
421
  options, args = ParseOptions()

Iustin Pop's avatar
Iustin Pop committed
422
423
  utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
                     stderr_logging=options.debug)
Iustin Pop's avatar
Iustin Pop committed
424
425

  try:
426
427
    client = cli.GetClient()

428
429
430
431
432
    try:
      watcher = Watcher()
    except errors.ConfigurationError:
      # Just exit if there's no configuration
      sys.exit(constants.EXIT_SUCCESS)
433

434
    watcher.Run()
435
436
  except SystemExit:
    raise
437
  except NotMasterError:
438
    logging.debug("Not master, exiting")
439
    sys.exit(constants.EXIT_NOTMASTER)
440
  except errors.ResolverError, err:
441
    logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
442
    sys.exit(constants.EXIT_NODESETUP_ERROR)
443
444
445
  except Exception, err:
    logging.error(str(err), exc_info=True)
    sys.exit(constants.EXIT_FAILURE)
Iustin Pop's avatar
Iustin Pop committed
446

447

Iustin Pop's avatar
Iustin Pop committed
448
449
# Run the watcher only when executed as a script, not when imported
if __name__ == "__main__":
  main()