#!/usr/bin/python -u
#

# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Master daemon program.

Some classes deviate from the standard style guide since the
inheritance from parent classes requires it.

"""


import os
import errno
import sys
import SocketServer
import time
import collections
import Queue
import random
import signal
import logging

from cStringIO import StringIO
from optparse import OptionParser

from ganeti import config
from ganeti import constants
from ganeti import mcpu
from ganeti import opcodes
from ganeti import jqueue
from ganeti import locking
from ganeti import luxi
from ganeti import utils
from ganeti import errors
from ganeti import ssconf
from ganeti import workerpool
from ganeti import rpc
from ganeti import bootstrap
from ganeti import serializer


# Size of the worker pool that serves client requests (see
# IOServer.setup_queue)
CLIENT_REQUEST_WORKERS = 16

# Exit codes, re-exported from ganeti.constants under local names
EXIT_NOTMASTER = constants.EXIT_NOTMASTER
EXIT_NODESETUP_ERROR = constants.EXIT_NODESETUP_ERROR


class ClientRequestWorker(workerpool.BaseWorker):
  """Worker that processes client requests on behalf of a server.

  """
  def RunTask(self, server, request, client_address):
    """Process the request.

    This is copied from the code in ThreadingMixIn.

    @param server: the server object; its finish_request, close_request
        and handle_error methods are called
    @param request: the request object passed on to the server methods
    @param client_address: the client address passed on to the server
        methods

    """
    try:
      server.finish_request(request, client_address)
      server.close_request(request)
    except:
      # Any failure, including one raised by close_request above, is
      # handed to the server's error handler; the request is then closed.
      server.handle_error(request, client_address)
      server.close_request(request)


class IOServer(SocketServer.UnixStreamServer):
  """IO thread class.

  This class takes care of initializing the other threads, setting
  signal handlers (which are processed only in this thread), and doing
  cleanup at shutdown.

  """
  def __init__(self, address, rqhandler):
    """IOServer constructor

    @param address: the address to bind this IOServer to
    @param rqhandler: RequestHandler type object

    """
    SocketServer.UnixStreamServer.__init__(self, address, rqhandler)

    # We'll only start threads once we've forked.
    self.context = None
    self.request_workers = None

  def setup_queue(self):
    """Create the shared context and the client request worker pool.

    Until this is called both attributes are None, and process_request
    cannot be used.

    """
    self.context = GanetiContext()
    self.request_workers = workerpool.WorkerPool(CLIENT_REQUEST_WORKERS,
                                                 ClientRequestWorker)

  def process_request(self, request, client_address):
    """Add task to workerpool to process request.

    @param request: the request object to be processed
    @param client_address: the address of the client

    """
    self.request_workers.AddTask(self, request, client_address)

  def serve_forever(self):
    """Handle one request at a time until told to quit.

    The loop ends once the handler installed for SIGINT and SIGTERM
    reports that it has been called; the handler is always reset on exit.

    """
    sighandler = utils.SignalHandler([signal.SIGINT, signal.SIGTERM])
    try:
      while not sighandler.called:
        self.handle_request()
    finally:
      sighandler.Reset()

  def server_cleanup(self):
    """Cleanup the server.

    This involves shutting down the processor threads and the master
    socket.

    Every step is attempted even if an earlier one raises; the exception
    is propagated once the remaining steps have run.

    """
    try:
      self.server_close()
    finally:
      try:
        if self.request_workers:
          self.request_workers.TerminateWorkers()
      finally:
        # Must not be skipped when terminating the workers fails
        if self.context:
          self.context.jobqueue.Shutdown()


class ClientRqHandler(SocketServer.BaseRequestHandler):
  """Client handler"""
  # Marker terminating every message on the wire
  EOM = '\3'
  # Maximum number of bytes requested from the socket per recv call
  READ_SIZE = 4096

  def setup(self):
    """Initialize the per-connection state.

    """
    # Data received after the last complete message
    self._buffer = ""
    # Complete messages which were received but not yet handled
    self._msgs = collections.deque()
    self._ops = ClientOps(self.server)

  def handle(self):
    """Serve requests until the client disconnects or misbehaves.

    Each message is decoded, executed via ClientOps.handle_request and
    answered with a dictionary holding the success flag and the result.

    """
    while True:
      msg = self.read_message()
      if msg is None:
        logging.debug("client closed connection")
        break

      request = serializer.LoadJson(msg)
      logging.debug("request: %s", request)
      if not isinstance(request, dict):
        logging.error("wrong request received: %s", msg)
        break

      method = request.get(luxi.KEY_METHOD, None)
      args = request.get(luxi.KEY_ARGS, None)
      if method is None or args is None:
        logging.error("no method or args in request")
        break

      # Only a call which returns normally counts as a success
      success = False
      try:
        result = self._ops.handle_request(method, args)
        success = True
      except errors.GenericError:
        # The exception is fetched via sys.exc_info so that the clause
        # does not depend on version-specific "except" syntax
        err = sys.exc_info()[1]
        result = (err.__class__.__name__, err.args)
      except:
        # Anything else is logged and reported to the client as a failure
        logging.error("Unexpected exception", exc_info=True)
        err = sys.exc_info()
        result = "Caught exception: %s" % str(err[1])

      response = {
        luxi.KEY_SUCCESS: success,
        luxi.KEY_RESULT: result,
        }
      logging.debug("response: %s", response)
      self.send_message(serializer.DumpJson(response))

  def read_message(self):
    """Return the next complete message sent by the client.

    @return: the message without its terminator, or None if the client
        closed the connection before a complete message was available

    """
    while not self._msgs:
      data = self.request.recv(self.READ_SIZE)
      if not data:
        return None
      new_msgs = (self._buffer + data).split(self.EOM)
      # The last element is the (possibly empty) start of the next message
      self._buffer = new_msgs.pop()
      self._msgs.extend(new_msgs)
    return self._msgs.popleft()

  def send_message(self, msg):
    """Send a message, followed by the terminator, to the client.

    @param msg: the already serialized message

    """
    self.request.sendall(msg + self.EOM)


class ClientOps:
  """Class holding high-level client operations."""
  def __init__(self, server):
    """Remember the server on whose behalf requests are executed.

    @param server: the server object; its C{context} attribute is read
        when requests are handled

    """
    self.server = server

  def handle_request(self, method, args):
    """Execute a single client request.

    @param method: the request type, compared against the C{luxi.REQ_*}
        values
    @param args: the request arguments; their structure depends on the
        method
    @return: the value returned by the job queue call or by the query
    @raise errors.OpPrereqError: if a query asks for locking
    @raise ValueError: if the method is not known

    """
    queue = self.server.context.jobqueue

    # TODO: Parameter validation

    if method == luxi.REQ_SUBMIT_JOB:
      logging.info("Received new job")
      ops = [opcodes.OpCode.LoadOpCode(state) for state in args]
      return queue.SubmitJob(ops)

    elif method == luxi.REQ_CANCEL_JOB:
      job_id = args
      logging.info("Received job cancel request for %s", job_id)
      return queue.CancelJob(job_id)

    elif method == luxi.REQ_ARCHIVE_JOB:
      job_id = args
      logging.info("Received job archive request for %s", job_id)
      return queue.ArchiveJob(job_id)

    elif method == luxi.REQ_AUTOARCHIVE_JOBS:
      (age, timeout) = args
      logging.info("Received job autoarchive request for age %s, timeout %s",
                   age, timeout)
      return queue.AutoArchiveJobs(age, timeout)

    elif method == luxi.REQ_WAIT_FOR_JOB_CHANGE:
      (job_id, fields, prev_job_info, prev_log_serial, timeout) = args
      logging.info("Received job poll request for %s", job_id)
      return queue.WaitForJobChanges(job_id, fields, prev_job_info,
                                     prev_log_serial, timeout)

    elif method == luxi.REQ_QUERY_JOBS:
      (job_ids, fields) = args
      if isinstance(job_ids, (tuple, list)) and job_ids:
        # The IDs are converted explicitly: joining non-string values
        # raises TypeError, and logging must never make the query fail
        msg = ", ".join([str(job_id) for job_id in job_ids])
      else:
        msg = str(job_ids)
      logging.info("Received job query request for %s", msg)
      return queue.QueryJobs(job_ids, fields)

    elif method == luxi.REQ_QUERY_INSTANCES:
      (names, fields, use_locking) = args
      logging.info("Received instance query request for %s", names)
      if use_locking:
        raise errors.OpPrereqError("Sync queries are not allowed")
      op = opcodes.OpQueryInstances(names=names, output_fields=fields,
                                    use_locking=use_locking)
      return self._Query(op)

    elif method == luxi.REQ_QUERY_NODES:
      (names, fields, use_locking) = args
      logging.info("Received node query request for %s", names)
      if use_locking:
        raise errors.OpPrereqError("Sync queries are not allowed")
      op = opcodes.OpQueryNodes(names=names, output_fields=fields,
                                use_locking=use_locking)
      return self._Query(op)

    elif method == luxi.REQ_QUERY_EXPORTS:
      nodes, use_locking = args
      if use_locking:
        raise errors.OpPrereqError("Sync queries are not allowed")
      logging.info("Received exports query request")
      op = opcodes.OpQueryExports(nodes=nodes, use_locking=use_locking)
      return self._Query(op)

    elif method == luxi.REQ_QUERY_CONFIG_VALUES:
      fields = args
      logging.info("Received config values query request for %s", fields)
      op = opcodes.OpQueryConfigValues(output_fields=fields)
      return self._Query(op)

    elif method == luxi.REQ_QUERY_CLUSTER_INFO:
      logging.info("Received cluster info query request")
      op = opcodes.OpQueryClusterInfo()
      return self._Query(op)

    elif method == luxi.REQ_QUEUE_SET_DRAIN_FLAG:
      drain_flag = args
      logging.info("Received queue drain flag change request to %s",
                   drain_flag)
      return queue.SetDrainFlag(drain_flag)

    else:
      logging.info("Received invalid request '%s'", method)
      raise ValueError("Invalid operation '%s'" % method)

  def _DummyLog(self, *args):
    """Log callback which discards everything passed to it.

    """
    pass

  def _Query(self, op):
    """Runs the specified opcode and returns the result.

    @param op: the opcode to execute
    @return: the value returned by the processor's ExecOpCode

    """
    proc = mcpu.Processor(self.server.context)
    # TODO: Where should log messages go?
    return proc.ExecOpCode(op, self._DummyLog, None)


class GanetiContext(object):
  """Context common to all ganeti threads.

  This class creates and holds common objects shared by all threads.

  """
  # The one existing instance, or None while no instance has been
  # completely initialized
  _instance = None

  def __init__(self):
    """Constructs a new GanetiContext object.

    There should be only a GanetiContext object at any time, so this
    function raises an error if this is not the case.

    """
    assert self.__class__._instance is None, "double GanetiContext instance"

    # Create global configuration object
    self.cfg = config.ConfigWriter()

    # Locking manager
    self.glm = locking.GanetiLockManager(
                self.cfg.GetNodeList(),
                self.cfg.GetInstanceList())

    # Job queue
    self.jobqueue = jqueue.JobQueue(self)

    # setting this also locks the class against attribute modifications
    self.__class__._instance = self

  def __setattr__(self, name, value):
    """Setting GanetiContext attributes is forbidden after initialization.

    @param name: the attribute name
    @param value: the new attribute value

    """
    assert self.__class__._instance is None, "Attempt to modify Ganeti Context"
    object.__setattr__(self, name, value)

  def AddNode(self, node):
    """Adds a node to the configuration and lock manager.

    @param node: the node object; its C{name} attribute is used as the
        name of the new lock

    """
    # Add it to the configuration
    self.cfg.AddNode(node)

    # If preseeding fails it'll not be added
    self.jobqueue.AddNode(node)

    # Add the new node to the Ganeti Lock Manager
    self.glm.add(locking.LEVEL_NODE, node.name)

  def ReaddNode(self, node):
    """Updates a node that's already in the configuration

    @param node: the node object passed to the job queue

    """
    # Synchronize the queue again
    self.jobqueue.AddNode(node)

  def RemoveNode(self, name):
    """Removes a node from the configuration and lock manager.

    @param name: the name of the node to remove

    """
    # Remove node from configuration
    self.cfg.RemoveNode(name)

    # Notify job queue
    self.jobqueue.RemoveNode(name)

    # Remove the node from the Ganeti Lock Manager
    self.glm.remove(locking.LEVEL_NODE, name)

def ParseOptions():
  """Parse the command line options.

  The options are read from sys.argv. The resulting options object has
  the attributes C{fork} (cleared by -f), C{debug} (set by -d) and
  C{no_voting} (set by --no-voting).

  @return: (options, args) as from OptionParser.parse_args()

  """
  parser = OptionParser(description="Ganeti master daemon",
                        usage="%prog [-f] [-d] [--no-voting]",
                        version="%%prog (ganeti) %s" %
                        constants.RELEASE_VERSION)

  parser.add_option("-f", "--foreground", dest="fork",
                    help="Don't detach from the current terminal",
                    default=True, action="store_false")
  parser.add_option("-d", "--debug", dest="debug",
                    help="Enable some debug messages",
                    default=False, action="store_true")
  parser.add_option("--no-voting", dest="no_voting",
                    help="Do not check that the nodes agree on this node"
                    " being the master and start the daemon unconditionally",
                    default=False, action="store_true")
  options, args = parser.parse_args()
  return options, args


def CheckAgreement(retries=6, retry_delay=10):
  """Check the agreement on who is the master.

  The function uses a very simple algorithm: we must get more positive
  than negative answers. Since in most of the cases we are the master,
  we'll use our own config file for getting the node list. In the
  future we could collect the current node list from our (possibly
  obsolete) known nodes.

  In order to account for cold-start of all nodes, we retry for up to
  a minute until we get a real answer as the top-voted one. If the
  nodes are more out-of-sync, for now manual startup of the master
  should be attempted.

  Note that for an even number of nodes cluster, we need at least half
  of the nodes (beside ourselves) to vote for us. This creates a
  problem on two-node clusters, since in this case we require the
  other node to be up too to confirm our status.

  @param retries: how many times the votes are gathered before giving up
  @param retry_delay: seconds to wait between two attempts
  @return: True if this node may act as master (or if no votes were
      returned at all), False otherwise

  """
  myself = utils.HostInfo().name
  # temporary instantiation of a config writer, used only to get the
  # node list
  cfg = config.ConfigWriter()
  node_list = cfg.GetNodeList()
  del cfg

  for attempt in range(retries):
    votes = bootstrap.GatherMasterVotes(node_list)
    if not votes:
      # empty node list, this is a one node cluster
      return True
    if votes[0][0] is not None:
      # a real node is at the top of the list
      break
    # Waiting is pointless once the last attempt has failed
    if attempt < retries - 1:
      time.sleep(retry_delay)
  else:
    logging.critical("Cluster inconsistent, most of the nodes didn't answer"
                     " after multiple retries. Aborting startup")
    return False

  all_votes = sum(item[1] for item in votes)
  top_node, top_votes = votes[0]
  result = False
  if top_node != myself:
    logging.critical("It seems we are not the master (top-voted node"
                     " is %s with %d out of %d votes)", top_node, top_votes,
                     all_votes)
  elif top_votes < all_votes - top_votes:
    logging.critical("It seems we are not the master (%d votes for,"
                     " %d votes against)", top_votes, all_votes - top_votes)
  else:
    result = True

  return result


def main():
  """Main function

  Checks that this node is the master (asking the other nodes unless
  voting was disabled on the command line), creates the master socket,
  optionally daemonizes and then serves client requests until the
  server loop ends. The PID file and the master socket are removed on
  exit.

  """

  options, args = ParseOptions()
  utils.debug = options.debug
  utils.no_fork = True

  if options.fork:
    utils.CloseFDs()

  rpc.Init()
  try:
    ssconf.CheckMaster(options.debug)

    # we believe we are the master, let's ask the other nodes...
    if options.no_voting:
      sys.stdout.write("The 'no voting' option has been selected.\n")
      sys.stdout.write("This is dangerous, please confirm by"
                       " typing uppercase 'yes': ")
      sys.stdout.flush()
      confirmation = sys.stdin.readline().strip()
      if confirmation != "YES":
        sys.stdout.write("Aborting.\n")
        return
    elif not CheckAgreement():
      return

    dirs = [(constants.RUN_GANETI_DIR, constants.RUN_DIRS_MODE),
            (constants.SOCKET_DIR, constants.SOCKET_DIR_MODE),
           ]
    utils.EnsureDirs(dirs)

    # This is safe to do as the pid file guarantees against
    # concurrent execution.
    utils.RemoveFile(constants.MASTER_SOCKET)

    master = IOServer(constants.MASTER_SOCKET, ClientRqHandler)
  finally:
    rpc.Shutdown()

  # become a daemon
  if options.fork:
    utils.Daemonize(logfile=constants.LOG_MASTERDAEMON)

  utils.WritePidFile(constants.MASTERD_PID)
  try:
    utils.SetupLogging(constants.LOG_MASTERDAEMON, debug=options.debug,
                       stderr_logging=not options.fork, multithreaded=True)

    logging.info("Ganeti master daemon startup")

    # RPC was shut down above, before daemonizing
    rpc.Init()
    try:
      # activate ip
      master_node = ssconf.SimpleConfigReader().GetMasterNode()
      if not rpc.RpcRunner.call_node_start_master(master_node, False):
        logging.error("Can't activate master IP address")

      master.setup_queue()
      try:
        master.serve_forever()
      finally:
        master.server_cleanup()
    finally:
      rpc.Shutdown()
  finally:
    utils.RemovePidFile(constants.MASTERD_PID)
    utils.RemoveFile(constants.MASTER_SOCKET)


# Start the daemon only when this file is executed as a script
if __name__ == "__main__":
  main()