ganeti-masterd 12.7 KB
Newer Older
1
#!/usr/bin/python -u
Iustin Pop's avatar
Iustin Pop committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#

# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Master daemon program.

Some classes deviate from the standard style guide since the
inheritance from parent classes requires it.

"""


30
import sys
Iustin Pop's avatar
Iustin Pop committed
31
32
33
34
35
36
37
import SocketServer
import time
import collections
import Queue
import random
import signal
import simplejson
38
import logging
Iustin Pop's avatar
Iustin Pop committed
39
40

from cStringIO import StringIO
41
from optparse import OptionParser
Iustin Pop's avatar
Iustin Pop committed
42

43
from ganeti import config
Iustin Pop's avatar
Iustin Pop committed
44
45
46
47
from ganeti import constants
from ganeti import mcpu
from ganeti import opcodes
from ganeti import jqueue
48
from ganeti import locking
Iustin Pop's avatar
Iustin Pop committed
49
50
from ganeti import luxi
from ganeti import utils
51
52
from ganeti import errors
from ganeti import ssconf
53
from ganeti import logger
54
from ganeti import workerpool
55
from ganeti import rpc
56
57


58
59
# Size handed to the workerpool.WorkerPool of ClientRequestWorker instances
# that IOServer.setup_queue creates
CLIENT_REQUEST_WORKERS = 16

60
61
# Module-level aliases for the exit codes defined in ganeti.constants
EXIT_NOTMASTER = constants.EXIT_NOTMASTER
EXIT_NODESETUP_ERROR = constants.EXIT_NODESETUP_ERROR
Iustin Pop's avatar
Iustin Pop committed
62
63


64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class ClientRequestWorker(workerpool.BaseWorker):
  """Worker pool member that serves a single client request per task.

  """
  def RunTask(self, server, request, client_address):
    """Process the request.

    This is copied from the code in ThreadingMixIn.

    Args:
      server: object whose finish_request, close_request and handle_error
              methods are used to serve the request
      request: the request object, passed unchanged to the server methods
      client_address: the client address, passed unchanged to the server
                      methods

    """
    try:
      server.finish_request(request, client_address)
      server.close_request(request)
    except:
      # Any failure (including one raised while closing the request) is
      # reported through the server's error hook, after which the request
      # is closed
      server.handle_error(request, client_address)
      server.close_request(request)


Iustin Pop's avatar
Iustin Pop committed
79
80
81
82
83
84
85
86
class IOServer(SocketServer.UnixStreamServer):
  """IO thread class.

  This class takes care of initializing the other threads, setting
  signal handlers (which are processed only in this thread), and doing
  cleanup at shutdown.

  """
  def __init__(self, address, rqhandler):
    """IOServer constructor

    Args:
      address: the address to bind this IOServer to
      rqhandler: RequestHandler type object

    """
    SocketServer.UnixStreamServer.__init__(self, address, rqhandler)

    # We'll only start threads once we've forked.
    self.context = None
    self.request_workers = None

  def setup_queue(self):
    """Create the shared context and the client request worker pool.

    This is kept out of the constructor so that it can be called after
    the process has forked.

    """
    self.context = GanetiContext()
    self.request_workers = workerpool.WorkerPool(CLIENT_REQUEST_WORKERS,
                                                 ClientRequestWorker)

  def process_request(self, request, client_address):
    """Add task to workerpool to process request.

    Args:
      request: the request object handed over by the server machinery
      client_address: the address of the client

    """
    self.request_workers.AddTask(self, request, client_address)

  def serve_forever(self):
    """Handle one request at a time until told to quit.

    The loop ends once SIGINT or SIGTERM has been recorded by the signal
    handler; the handler is reset on the way out in all cases.

    """
    sighandler = utils.SignalHandler([signal.SIGINT, signal.SIGTERM])
    try:
      while not sighandler.called:
        self.handle_request()
    finally:
      sighandler.Reset()

  def server_cleanup(self):
    """Cleanup the server.

    This involves shutting down the processor threads and the master
    socket. Every step is attempted even if an earlier one raises; the
    exception is still propagated to the caller afterwards.

    """
    try:
      self.server_close()
    finally:
      try:
        # Both attributes are still None if setup_queue was never called
        if self.request_workers:
          self.request_workers.TerminateWorkers()
      finally:
        # The job queue must be shut down even if the workers could not
        # be terminated cleanly
        if self.context:
          self.context.jobqueue.Shutdown()
Iustin Pop's avatar
Iustin Pop committed
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150


class ClientRqHandler(SocketServer.BaseRequestHandler):
  """Client handler

  Reads EOM-terminated messages from the client socket, decodes each of
  them as a JSON request, passes it to ClientOps and sends back a JSON
  response terminated by EOM.

  """
  # Message terminator used in both directions
  EOM = '\3'
  # Maximum number of bytes requested from the socket per recv call
  READ_SIZE = 4096

  def setup(self):
    """Initialize the per-connection state.

    """
    # Data received after the last complete message
    self._buffer = ""
    # Complete messages which have not been handed out yet
    self._msgs = collections.deque()
    self._ops = ClientOps(self.server)

  def handle(self):
    """Serve requests until the client disconnects or misbehaves.

    A request which can't be decoded, is not a dictionary or lacks the
    method or the arguments ends the conversation. Exceptions raised
    while executing a request are reported back to the client as a
    failed response and the conversation goes on.

    """
    while True:
      msg = self.read_message()
      if msg is None:
        logging.info("client closed connection")
        break

      try:
        request = simplejson.loads(msg)
      except ValueError:
        # Undecodable data is handled like any other malformed request
        # instead of escaping as an unhandled exception
        logging.error("wrong request received: %s", msg)
        break

      logging.debug("request: %s", request)
      if not isinstance(request, dict):
        logging.error("wrong request received: %s", msg)
        break

      method = request.get(luxi.KEY_METHOD, None)
      args = request.get(luxi.KEY_ARGS, None)
      if method is None or args is None:
        logging.error("no method or args in request")
        break

      success = False
      try:
        result = self._ops.handle_request(method, args)
        success = True
      except:
        # Whatever went wrong is logged here and reported to the client,
        # the connection itself is kept open
        logging.error("Unexpected exception", exc_info=True)
        err = sys.exc_info()
        result = "Caught exception: %s" % str(err[1])

      response = {
        luxi.KEY_SUCCESS: success,
        luxi.KEY_RESULT: result,
        }
      logging.debug("response: %s", response)
      self.send_message(simplejson.dumps(response))

  def read_message(self):
    """Return the next complete message sent by the client.

    Returns:
      the message without its terminator, or None if the client closed
      the connection (a trailing incomplete message is discarded)

    """
    while not self._msgs:
      data = self.request.recv(self.READ_SIZE)
      if not data:
        return None
      new_msgs = (self._buffer + data).split(self.EOM)
      # The last element is the (possibly empty) start of the next message
      self._buffer = new_msgs.pop()
      self._msgs.extend(new_msgs)
    return self._msgs.popleft()

  def send_message(self, msg):
    """Send a message followed by the terminator to the client.

    Args:
      msg: the already encoded message

    """
    self.request.sendall(msg + self.EOM)


class ClientOps:
  """Class holding high-level client operations."""
  def __init__(self, server):
    """ClientOps constructor

    Args:
      server: the server object; its context attribute is used to reach
              the job queue and to run queries

    """
    self.server = server

  def handle_request(self, method, args):
    """Execute a single client request.

    Args:
      method: one of the luxi.REQ_* request identifiers
      args: the arguments of the request, their layout depends on the
            method

    Returns:
      the value returned by the job queue or by the executed query opcode

    Raises:
      ValueError: if the method is not known

    """
    queue = self.server.context.jobqueue

    # TODO: Parameter validation

    if method == luxi.REQ_SUBMIT_JOB:
      ops = [opcodes.OpCode.LoadOpCode(state) for state in args]
      return queue.SubmitJob(ops)

    elif method == luxi.REQ_CANCEL_JOB:
      job_id = args
      return queue.CancelJob(job_id)

    elif method == luxi.REQ_ARCHIVE_JOB:
      job_id = args
      return queue.ArchiveJob(job_id)

    elif method == luxi.REQ_AUTOARCHIVE_JOBS:
      age = args
      return queue.AutoArchiveJobs(age)

    elif method == luxi.REQ_WAIT_FOR_JOB_CHANGE:
      (job_id, fields, prev_job_info, prev_log_serial, timeout) = args
      return queue.WaitForJobChanges(job_id, fields, prev_job_info,
                                     prev_log_serial, timeout)

    elif method == luxi.REQ_QUERY_JOBS:
      (job_ids, fields) = args
      return queue.QueryJobs(job_ids, fields)

    elif method == luxi.REQ_QUERY_INSTANCES:
      (names, fields) = args
      op = opcodes.OpQueryInstances(names=names, output_fields=fields)
      return self._Query(op)

    elif method == luxi.REQ_QUERY_NODES:
      (names, fields) = args
      op = opcodes.OpQueryNodes(names=names, output_fields=fields)
      return self._Query(op)

    elif method == luxi.REQ_QUERY_EXPORTS:
      nodes = args
      op = opcodes.OpQueryExports(nodes=nodes)
      return self._Query(op)

    elif method == luxi.REQ_QUERY_CONFIG_VALUES:
      fields = args
      op = opcodes.OpQueryConfigValues(output_fields=fields)
      return self._Query(op)

    else:
      raise ValueError("Invalid operation")

  def _DummyLog(self, *args):
    """Feedback function which discards everything it is given.

    """
    pass

  def _Query(self, op):
    """Runs the specified opcode and returns the result.

    Args:
      op: the opcode to execute

    Returns:
      whatever the processor returns for the opcode

    """
    proc = mcpu.Processor(self.server.context)
    # TODO: Where should log messages go?
    return proc.ExecOpCode(op, self._DummyLog, None)
265

Iustin Pop's avatar
Iustin Pop committed
266

267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
class GanetiContext(object):
  """Context common to all ganeti threads.

  This class creates and holds common objects shared by all threads.

  """
  # The one and only instance; while it is None attributes may still be set
  _instance = None

  def __init__(self):
    """Constructs a new GanetiContext object.

    There should be only a GanetiContext object at any time, so this
    function raises an error if this is not the case.

    """
    assert self.__class__._instance is None, "double GanetiContext instance"

    # Create global configuration object
    self.cfg = config.ConfigWriter()

    # Locking manager
    self.glm = locking.GanetiLockManager(
                self.cfg.GetNodeList(),
                self.cfg.GetInstanceList())

    # Job queue
    self.jobqueue = jqueue.JobQueue(self)

    # setting this also locks the class against attribute modifications
    self.__class__._instance = self

  def __setattr__(self, name, value):
    """Setting GanetiContext attributes is forbidden after initialization.

    Args:
      name: the name of the attribute
      value: the new value of the attribute

    """
    assert self.__class__._instance is None, "Attempt to modify Ganeti Context"
    object.__setattr__(self, name, value)

  def AddNode(self, node):
    """Adds a node to the configuration and lock manager.

    Args:
      node: the node object; its name is used for the job queue and for
            the lock manager

    """
    # Add it to the configuration
    self.cfg.AddNode(node)

    # If preseeding fails it'll not be added
    self.jobqueue.AddNode(node.name)

    # Add the new node to the Ganeti Lock Manager
    self.glm.add(locking.LEVEL_NODE, node.name)

  def ReaddNode(self, node):
    """Updates a node that's already in the configuration

    Args:
      node: the node object; only its name is used

    """
    # Synchronize the queue again
    self.jobqueue.AddNode(node.name)

  def RemoveNode(self, name):
    """Removes a node from the configuration and lock manager.

    Args:
      name: the name of the node

    """
    # Remove node from configuration
    self.cfg.RemoveNode(name)

    # Notify job queue
    self.jobqueue.RemoveNode(name)

    # Remove the node from the Ganeti Lock Manager
    self.glm.remove(locking.LEVEL_NODE, name)

338

339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
def ParseOptions():
  """Build the option parser of the daemon and run it on the command line.

  Two flags are known: -f/--foreground, which turns the "fork" option
  off (it is on by default), and -d/--debug, which turns the "debug"
  option on (it is off by default).

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(usage="%prog [-f] [-d]",
                        description="Ganeti master daemon",
                        version=version_text)

  parser.add_option("-f", "--foreground",
                    action="store_false", dest="fork", default=True,
                    help="Don't detach from the current terminal")
  parser.add_option("-d", "--debug",
                    action="store_true", dest="debug", default=False,
                    help="Enable some debug messages")

  # parse_args already returns the (options, args) pair
  return parser.parse_args()


361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
def CheckAgreement():
  """Check the agreement on who is the master.

  The function uses a very simple algorithm: we must get more positive
  than negative answers. Since in most of the cases we are the master,
  we'll use our own config file for getting the node list. In the
  future we could collect the current node list from our (possibly
  obsolete) known nodes.

  Returns:
    True if the startup of the master daemon can go on, False otherwise

  """
  myself = utils.HostInfo().name
  # temporary instantiation of a config writer, used only to get the
  # node list
  cfg = config.ConfigWriter()
  node_list = cfg.GetNodeList()
  del cfg
  # We don't ask ourselves; a membership test is used because removing a
  # missing element from a list raises ValueError (and not KeyError)
  if myself in node_list:
    node_list.remove(myself)
  if not node_list:
    # either single node cluster, or a misconfiguration, but I won't
    # break any other node, so I can proceed
    return True
  results = rpc.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return False
  positive = negative = 0
  # Number of votes received by every other master candidate
  other_masters = {}
  for node in results:
    answer = results[node]
    if not isinstance(answer, (tuple, list)) or len(answer) < 3:
      # Nodes without a usable answer don't vote at all
      logging.warning("Can't contact node %s", node)
      continue
    master_node = answer[2]
    if master_node == myself:
      positive += 1
    else:
      negative += 1
      other_masters[master_node] = other_masters.get(master_node, 0) + 1
  if positive <= negative:
    # bad!
    logging.critical("It seems we are not the master (%d votes for,"
                     " %d votes against)", positive, negative)
    if len(other_masters) > 1:
      logging.critical("The other nodes do not agree on a single master")
    elif other_masters:
      # TODO: resync my files from the master
      logging.critical("It seems the real master is %s",
                       list(other_masters)[0])
    else:
      logging.critical("Can't contact any node for data, aborting startup")
    return False
  return True


Iustin Pop's avatar
Iustin Pop committed
419
420
421
def main():
  """Main function

  Parses the command line, runs the ssconf master check, asks the other
  nodes whether they agree that we are the master, binds the master
  socket, optionally becomes a daemon and then serves client requests
  until the server loop ends.

  """
  options, args = ParseOptions()
  utils.debug = options.debug
  utils.no_fork = True

  ssconf.CheckMaster(options.debug)

  # we believe we are the master, let's ask the other nodes...
  if not CheckAgreement():
    return

  master = IOServer(constants.MASTER_SOCKET, ClientRqHandler)

  # become a daemon
  if options.fork:
    # The already bound master socket has to survive the daemonization
    utils.Daemonize(logfile=constants.LOG_MASTERDAEMON,
                    noclose_fds=[master.fileno()])

  utils.WritePidFile(constants.MASTERD_PID)
  try:
    logger.SetupLogging(constants.LOG_MASTERDAEMON, debug=options.debug,
                        stderr_logging=not options.fork)

    logging.info("ganeti master daemon startup")

    # activate ip
    master_node = ssconf.SimpleConfigReader().GetMasterNode()
    if not rpc.call_node_start_master(master_node, False):
      logging.error("Can't activate master IP address")

    master.setup_queue()
    try:
      master.serve_forever()
    finally:
      master.server_cleanup()
  finally:
    # Whatever happened after the pid file was written (including a
    # failure during startup or cleanup), don't leave a stale file behind
    utils.RemovePidFile(constants.MASTERD_PID)
457

Iustin Pop's avatar
Iustin Pop committed
458
459
460

# Start the daemon only when this file is executed as a script
if __name__ == "__main__":
  main()