#
# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
"""Master daemon program.
Some classes deviate from the standard style guide since the
inheritance from parent classes requires it.
"""
import collections
import logging
import Queue
import random
import signal
import SocketServer
import sys
import threading
import time
from optparse import OptionParser

import simplejson

from ganeti import config
from ganeti import constants
from ganeti import mcpu
from ganeti import opcodes
from ganeti import jqueue
from ganeti import locking
from ganeti import luxi
from ganeti import utils
from ganeti import errors
from ganeti import ssconf
from ganeti import logger
# Local aliases for the process exit codes defined in ganeti.constants;
# CheckMaster passes them to sys.exit().
EXIT_NOTMASTER = constants.EXIT_NOTMASTER
EXIT_NODESETUP_ERROR = constants.EXIT_NODESETUP_ERROR
class IOServer(SocketServer.UnixStreamServer):
    """IO thread class.

    This class takes care of initializing the other threads, setting
    signal handlers (which are processed only in this thread), and doing
    cleanup at shutdown.

    """

    def __init__(self, address, rqhandler, context):
        """IOServer constructor

        Args:
          address: the address to bind this IOServer to
          rqhandler: RequestHandler type object
          context: Context Object common to all worker threads

        """
        SocketServer.UnixStreamServer.__init__(self, address, rqhandler)
        self.do_quit = False
        self.context = context

        # We'll only start threads once we've forked.
        self.jobqueue = None

        signal.signal(signal.SIGINT, self.handle_quit_signals)
        signal.signal(signal.SIGTERM, self.handle_quit_signals)

    def setup_queue(self):
        """Create the job queue, bound to this server's context."""
        self.jobqueue = jqueue.JobQueue(self.context)

    def process_request_thread(self, request, client_address):
        """Process the request.

        This is copied from the code in ThreadingMixIn. It is the target
        of the thread started by process_request.

        """
        try:
            self.finish_request(request, client_address)
            self.close_request(request)
        except:
            # Deliberately broad, as in ThreadingMixIn: every failure of a
            # single request is reported through handle_error.
            self.handle_error(request, client_address)
            self.close_request(request)

    def process_request(self, request, client_address):
        """Start a new thread to process the request.

        This is copied from the code in ThreadingMixIn.

        """
        t = threading.Thread(target=self.process_request_thread,
                             args=(request, client_address))
        t.start()

    def handle_quit_signals(self, signum, frame):
        """Signal handler: record that the server has been asked to quit.

        Args:
          signum: number of the received signal
          frame: stack frame that was interrupted by the signal

        """
        print("received %s in %s" % (signum, frame))
        self.do_quit = True

    def serve_forever(self):
        """Handle one request at a time until told to quit."""
        # The quit flag is only looked at between two requests.
        while not self.do_quit:
            self.handle_request()
            print("served request, quit=%s" % (self.do_quit))

    def server_cleanup(self):
        """Cleanup the server.

        This involves shutting down the processor threads and the master
        socket.

        """
        try:
            self.server_close()
            utils.RemoveFile(constants.MASTER_SOCKET)
        finally:
            # The queue only exists once setup_queue has been called.
            if self.jobqueue:
                self.jobqueue.Shutdown()
class ClientRqHandler(SocketServer.BaseRequestHandler):
    """Client handler

    Messages on the connection are separated by the EOM character; each
    response is a JSON-encoded dictionary followed by EOM.

    """
    # Message terminator and the chunk size used when reading the socket.
    EOM = '\3'
    READ_SIZE = 4096

    def setup(self):
        """Initialise the per-connection buffers and operation dispatcher."""
        self._buffer = ""
        self._msgs = collections.deque()
        self._ops = ClientOps(self.server)

    def handle(self):
        """Serve requests from one client until it disconnects.

        The loop ends when the peer closes the connection or sends a
        request that cannot be understood. Errors raised while executing
        a well-formed request are reported back to the client instead.

        """
        while True:
            msg = self.read_message()
            if msg is None:
                logging.info("client closed connection")
                break

            # NOTE(review): the decoding step and the loop exits were absent
            # from the reviewed text and have been reconstructed -- confirm
            # against the upstream file.
            try:
                request = simplejson.loads(msg)
            except ValueError:
                request = None
            if not isinstance(request, dict):
                logging.error("wrong request received: %s", msg)
                break

            method = request.get(luxi.KEY_METHOD, None)
            args = request.get(luxi.KEY_ARGS, None)
            if method is None or args is None:
                logging.error("no method or args in request")
                break

            success = False
            try:
                result = self._ops.handle_request(method, args)
                success = True
            except:
                # Request boundary: whatever went wrong is logged and
                # returned to the client as the (unsuccessful) result.
                logging.error("Unexpected exception", exc_info=True)
                err = sys.exc_info()
                result = "Caught exception: %s" % str(err[1])

            response = {
                luxi.KEY_SUCCESS: success,
                luxi.KEY_RESULT: result,
            }
            logging.debug("response: %s", response)
            self.send_message(simplejson.dumps(response))

    def read_message(self):
        """Return the next complete message, or None at end of stream.

        Data is read in READ_SIZE chunks until at least one EOM-terminated
        message is available; an unterminated tail is kept in the buffer
        for the next call.

        """
        while not self._msgs:
            data = self.request.recv(self.READ_SIZE)
            if not data:
                return None
            new_msgs = (self._buffer + data).split(self.EOM)
            # The last element is the (possibly empty) incomplete tail.
            self._buffer = new_msgs.pop()
            self._msgs.extend(new_msgs)
        return self._msgs.popleft()

    def send_message(self, msg):
        """Send msg to the client, followed by the EOM terminator."""
        self.request.sendall(msg + self.EOM)
class ClientOps:
    """Class holding high-level client operations."""

    def __init__(self, server):
        self.server = server

    def handle_request(self, method, args):
        """Run one client request against the server's job queue.

        Args:
          method: one of the luxi.REQ_* request identifiers
          args: the arguments of the request; their layout depends on
                the method

        Returns:
          whatever the corresponding job queue call returns

        Raises:
          ValueError: if the method is not one of the known requests

        """
        jobs = self.server.jobqueue

        # TODO: Parameter validation

        if method == luxi.REQ_SUBMIT_JOB:
            loaded = []
            for state in args:
                loaded.append(opcodes.OpCode.LoadOpCode(state))
            return jobs.SubmitJob(loaded)

        if method == luxi.REQ_CANCEL_JOB:
            (job_id, ) = args
            return jobs.CancelJob(job_id)

        if method == luxi.REQ_ARCHIVE_JOB:
            (job_id, ) = args
            return jobs.ArchiveJob(job_id)

        if method == luxi.REQ_QUERY_JOBS:
            (job_ids, fields) = args
            return jobs.QueryJobs(job_ids, fields)

        raise ValueError("Invalid operation")
def JobRunner(proc, job, context):
    """Execute every opcode of a job on the given processor.

    The job is marked as running, each opcode is executed in order with
    its own status and result recorded, and finally the job is marked as
    failed if any opcode failed, or successful otherwise. An opcode
    failure does not interrupt the loop.

    Args:
      proc: Ganeti Processor to run the job on
      job: The job to run (unserialized format)
      context: Ganeti shared context (not used by this function)

    """
    job.SetStatus(opcodes.Job.STATUS_RUNNING)

    any_failed = False
    for pos, opcode in enumerate(job.data.op_list):
        job.data.op_status[pos] = opcodes.Job.STATUS_RUNNING
        try:
            outcome = proc.ExecOpCode(opcode)
            job.data.op_result[pos] = outcome
            job.data.op_status[pos] = opcodes.Job.STATUS_SUCCESS
        except (errors.OpPrereqError, errors.OpExecError) as err:
            any_failed = True
            job.data.op_result[pos] = str(err)
            job.data.op_status[pos] = opcodes.Job.STATUS_FAIL

    if any_failed:
        final_status = opcodes.Job.STATUS_FAIL
    else:
        final_status = opcodes.Job.STATUS_SUCCESS
    job.SetStatus(final_status)
def PoolWorker(worker_id, incoming_queue, context):
    """A worker thread function.

    This is the actual processor of a single thread of Job execution.
    Jobs are taken from the queue one at a time; a None item tells the
    worker to exit. A job whose execution raises is marked as failed and
    the worker goes on with the next one.

    Args:
      worker_id: the unique id for this worker
      incoming_queue: a queue to get jobs from
      context: the common server context, containing all shared data and
               synchronization structures.

    """
    # NOTE(review): the loop header and the "try:" were absent from the
    # reviewed text and have been reconstructed -- confirm against the
    # upstream file.
    while True:
        logging.debug("worker %s sleeping", worker_id)
        item = incoming_queue.get(True)
        if item is None:
            break
        logging.debug("worker %s processing job %s", worker_id,
                      item.data.job_id)
        try:
            # Processor creation is covered as well, so that a failure
            # there fails the job instead of killing the worker.
            proc = mcpu.Processor(context, feedback=lambda x: None)
            JobRunner(proc, item, context)
        except errors.GenericError as err:
            msg = "ganeti exception"
            logging.error(msg, exc_info=err)
            item.SetStatus(opcodes.Job.STATUS_FAIL, result=[msg])
        except Exception as err:
            msg = "unhandled exception"
            logging.error(msg, exc_info=err)
            item.SetStatus(opcodes.Job.STATUS_FAIL, result=[msg])
        except:
            # Anything that is not an Exception subclass.
            msg = "unhandled unknown exception"
            logging.error(msg, exc_info=True)
            item.SetStatus(opcodes.Job.STATUS_FAIL, result=[msg])
        logging.debug("worker %s finish job %s", worker_id,
                      item.data.job_id)
    logging.debug("worker %s exiting", worker_id)
class GanetiContext(object):
    """Context common to all ganeti threads.

    This class creates and holds common objects shared by all threads.

    """
    # The one live instance, or None while none has been fully constructed.
    _instance = None

    def __init__(self):
        """Constructs a new GanetiContext object.

        There should be only a GanetiContext object at any time, so this
        function raises an error if this is not the case.

        """
        assert self.__class__._instance is None, \
            "double GanetiContext instance"

        # Create a ConfigWriter...
        self.cfg = config.ConfigWriter()

        # And a lock manager covering the configured nodes and instances.
        # NOTE(review): the opening line of this call was absent from the
        # reviewed text; the attribute and class names are reconstructed --
        # confirm against ganeti.locking.
        self.glm = locking.GanetiLockManager(
            self.cfg.GetNodeList(),
            self.cfg.GetInstanceList())

        # setting this also locks the class against attribute modifications
        self.__class__._instance = self

    def __setattr__(self, name, value):
        """Setting GanetiContext attributes is forbidden after initialization.

        """
        assert self.__class__._instance is None, \
            "Attempt to modify Ganeti Context"
        object.__setattr__(self, name, value)
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
def CheckMaster(debug):
    """Verify that this node is the cluster master.

    Returns normally when the local host name equals the configured
    master node name. In every other case the process exits: with
    EXIT_NODESETUP_ERROR when the configuration cannot be read or the
    local name cannot be resolved, with EXIT_NOTMASTER when this node is
    not the master.

    Args:
      debug: when true, a notice is written to stderr before exiting
             because this node is not the master

    """
    try:
        store = ssconf.SimpleStore()
        master_name = store.GetMasterNode()
    except errors.ConfigurationError as err:
        print("Cluster configuration incomplete: '%s'" % str(err))
        sys.exit(EXIT_NODESETUP_ERROR)

    try:
        local_host = utils.HostInfo()
    except errors.ResolverError as err:
        sys.stderr.write("Cannot resolve my own name (%s)\n" % err.args[0])
        sys.exit(EXIT_NODESETUP_ERROR)

    if local_host.name == master_name:
        return

    if debug:
        sys.stderr.write("Not master, exiting.\n")
    sys.exit(EXIT_NOTMASTER)
def ParseOptions():
    """Build the option parser and parse the command line.

    Two flags are recognised: -f/--foreground stores False in
    options.fork (default True) and -d/--debug stores True in
    options.debug (default False).

    Returns:
      (options, args) as from OptionParser.parse_args()

    """
    version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
    parser = OptionParser(description="Ganeti master daemon",
                          usage="%prog [-f] [-d]",
                          version=version_text)

    parser.add_option("-f", "--foreground",
                      dest="fork", action="store_false", default=True,
                      help="Don't detach from the current terminal")
    parser.add_option("-d", "--debug",
                      dest="debug", action="store_true", default=False,
                      help="Enable some debug messages")

    return parser.parse_args()
def main():
    """Start the master daemon and serve requests until told to quit.

    Parses the command line, checks that this node is the master (exiting
    otherwise), creates the server, optionally daemonizes, takes the 'cmd'
    lock and then serves requests. The server is cleaned up and the lock
    released on the way out.

    """
    # NOTE(review): the "def" line and the "try:" around utils.Lock were
    # absent from the reviewed text and have been reconstructed -- confirm
    # against the upstream file.
    options, args = ParseOptions()
    utils.debug = options.debug

    CheckMaster(options.debug)

    master = IOServer(constants.MASTER_SOCKET, ClientRqHandler,
                      GanetiContext())

    # become a daemon
    if options.fork:
        utils.Daemonize(logfile=constants.LOG_MASTERDAEMON,
                        noclose_fds=[master.fileno()])

    logger.SetupDaemon(constants.LOG_MASTERDAEMON, debug=options.debug)
    logger.Info("ganeti master daemon startup")

    try:
        utils.Lock('cmd', debug=options.debug)
    except errors.LockError as err:
        sys.stderr.write("%s\n" % str(err))
        # Without the lock we must not serve; release the socket and stop.
        master.server_cleanup()
        return

    try:
        try:
            # The queue is created only now, after the fork; without it
            # the request handlers have no job queue to talk to.
            master.setup_queue()
            master.serve_forever()
        finally:
            master.server_cleanup()
    finally:
        utils.Unlock('cmd')
        utils.LockCleanup()


if __name__ == "__main__":
    main()