#!/usr/bin/python -u
#

# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Master daemon program.

Some classes deviate from the standard style guide since the
inheritance from parent classes requires it.

"""


import sys
import SocketServer
import time
import collections
import Queue
import random
import signal
import logging

import simplejson

from cStringIO import StringIO
from optparse import OptionParser

from ganeti import config
from ganeti import constants
from ganeti import mcpu
from ganeti import opcodes
from ganeti import jqueue
from ganeti import locking
from ganeti import luxi
from ganeti import utils
from ganeti import errors
from ganeti import ssconf
from ganeti import logger
from ganeti import workerpool
from ganeti import rpc
# First argument given to workerpool.WorkerPool in IOServer.setup_queue;
# presumably the number of worker threads serving client requests
CLIENT_REQUEST_WORKERS = 16

# Local aliases for the exit codes defined in the constants module
EXIT_NOTMASTER = constants.EXIT_NOTMASTER
EXIT_NODESETUP_ERROR = constants.EXIT_NODESETUP_ERROR
class ClientRequestWorker(workerpool.BaseWorker):
  """Worker that processes one client request on behalf of a server.

  """
  def RunTask(self, server, request, client_address):
    """Process the request.

    This is copied from the code in ThreadingMixIn.

    Args:
      server: object providing finish_request, close_request and
              handle_error (IOServer.process_request passes itself)
      request: the request object as received from the server
      client_address: the client address as received from the server

    """
    try:
      server.finish_request(request, client_address)
      server.close_request(request)
    except:
      # Catch-all on purpose: whatever went wrong is reported through the
      # server's error hook and the request is closed anyway
      server.handle_error(request, client_address)
      server.close_request(request)


class IOServer(SocketServer.UnixStreamServer):
  """IO thread class.

  This class takes care of initializing the other threads, setting
  signal handlers (which are processed only in this thread), and doing
  cleanup at shutdown.

  """
  def __init__(self, address, rqhandler):
    """IOServer constructor

    Args:
      address: the address to bind this IOServer to
      rqhandler: RequestHandler type object

    """
    SocketServer.UnixStreamServer.__init__(self, address, rqhandler)

    # We'll only start threads once we've forked.
    self.context = None
    self.request_workers = None

  def setup_queue(self):
    """Create the shared context and the pool of client request workers.

    Kept separate from the constructor so that it can be called after
    the process has forked.

    """
    self.context = GanetiContext()
    self.request_workers = workerpool.WorkerPool(CLIENT_REQUEST_WORKERS,
                                                 ClientRequestWorker)

  def process_request(self, request, client_address):
    """Add task to workerpool to process request.

    Args:
      request: the request object to be handled
      client_address: the address of the client

    """
    self.request_workers.AddTask(self, request, client_address)

  def serve_forever(self):
    """Handle one request at a time until told to quit."""
    sighandler = utils.SignalHandler([signal.SIGINT, signal.SIGTERM])
    try:
      # The handler only records that a signal arrived; the flag is checked
      # between requests
      while not sighandler.called:
        self.handle_request()
    finally:
      sighandler.Reset()

  def server_cleanup(self):
    """Cleanup the server.

    This involves shutting down the processor threads and the master
    socket.

    """
    try:
      self.server_close()
    finally:
      # Both attributes stay None until setup_queue has been called
      if self.request_workers:
        self.request_workers.TerminateWorkers()
      if self.context:
        self.context.jobqueue.Shutdown()


class ClientRqHandler(SocketServer.BaseRequestHandler):
  """Client handler"""
  # Terminator appended to every outgoing message and used to split
  # incoming data into messages
  EOM = '\3'
  # Maximum number of bytes requested from the socket per recv call
  READ_SIZE = 4096

  def setup(self):
    """Initialize the per-connection receive state and operations object.

    """
    # Data received so far that does not yet form a complete message
    self._buffer = ""
    # Complete messages waiting to be handled
    self._msgs = collections.deque()
    self._ops = ClientOps(self.server)

  def handle(self):
    """Serve requests from the client until the connection is unusable.

    Each message is decoded with simplejson and must be a dictionary
    holding a method and its arguments. The loop ends when the client
    closes the connection or sends a request that cannot be dispatched.
    Exceptions raised while executing a request are reported back to
    the client as an unsuccessful response.

    """
    while True:
      msg = self.read_message()
      if msg is None:
        logging.info("client closed connection")
        break

      request = simplejson.loads(msg)
      logging.debug("request: %s", request)
      if not isinstance(request, dict):
        logging.error("wrong request received: %s", msg)
        break

      method = request.get(luxi.KEY_METHOD, None)
      args = request.get(luxi.KEY_ARGS, None)
      if method is None or args is None:
        logging.error("no method or args in request")
        break

      success = False
      try:
        result = self._ops.handle_request(method, args)
        success = True
      except:
        # Catch-all on purpose: the failure is sent back to the client
        # instead of tearing down the connection
        logging.error("Unexpected exception", exc_info=True)
        err = sys.exc_info()
        result = "Caught exception: %s" % str(err[1])

      response = {
        luxi.KEY_SUCCESS: success,
        luxi.KEY_RESULT: result,
        }
      logging.debug("response: %s", response)
      self.send_message(simplejson.dumps(response))

  def read_message(self):
    """Return the next complete message sent by the client.

    Returns:
      the message without its terminator, or None if the connection was
      closed before another complete message arrived

    """
    while not self._msgs:
      data = self.request.recv(self.READ_SIZE)
      if not data:
        return None
      new_msgs = (self._buffer + data).split(self.EOM)
      # The last element is whatever follows the final terminator, i.e. an
      # incomplete message (possibly empty); keep it for the next read
      self._buffer = new_msgs.pop()
      self._msgs.extend(new_msgs)
    return self._msgs.popleft()

  def send_message(self, msg):
    """Send a message to the client, followed by the terminator.

    Args:
      msg: the already serialized message

    """
    self.request.sendall(msg + self.EOM)


class ClientOps:
  """Class holding high-level client operations."""
  def __init__(self, server):
    """Remember the server whose context is used to serve requests.

    Args:
      server: object with a context attribute (see IOServer)

    """
    self.server = server

  def handle_request(self, method, args):
    """Dispatch a client request to the job queue or to a query opcode.

    Args:
      method: one of the luxi.REQ_* request names
      args: the arguments of the request; their expected shape depends
            on the method

    Returns:
      whatever the selected job queue method or query opcode returns

    Raises:
      ValueError: if the method is not known

    """
    queue = self.server.context.jobqueue

    # TODO: Parameter validation

    if method == luxi.REQ_SUBMIT_JOB:
      ops = [opcodes.OpCode.LoadOpCode(state) for state in args]
      return queue.SubmitJob(ops)

    elif method == luxi.REQ_CANCEL_JOB:
      job_id = args
      return queue.CancelJob(job_id)

    elif method == luxi.REQ_ARCHIVE_JOB:
      job_id = args
      return queue.ArchiveJob(job_id)

    elif method == luxi.REQ_WAIT_FOR_JOB_CHANGE:
      (job_id, fields, prev_job_info, prev_log_serial, timeout) = args
      return queue.WaitForJobChanges(job_id, fields, prev_job_info,
                                     prev_log_serial, timeout)

    elif method == luxi.REQ_QUERY_JOBS:
      (job_ids, fields) = args
      return queue.QueryJobs(job_ids, fields)

    elif method == luxi.REQ_QUERY_INSTANCES:
      (names, fields) = args
      op = opcodes.OpQueryInstances(names=names, output_fields=fields)
      return self._Query(op)

    elif method == luxi.REQ_QUERY_NODES:
      (names, fields) = args
      op = opcodes.OpQueryNodes(names=names, output_fields=fields)
      return self._Query(op)

    elif method == luxi.REQ_QUERY_EXPORTS:
      nodes = args
      op = opcodes.OpQueryExports(nodes=nodes)
      return self._Query(op)

    else:
      raise ValueError("Invalid operation")

  def _DummyLog(self, *args):
    """Log callback that discards everything it is given.

    """
    pass

  def _Query(self, op):
    """Runs the specified opcode and returns the result.

    """
    proc = mcpu.Processor(self.server.context)
    # TODO: Where should log messages go?
    return proc.ExecOpCode(op, self._DummyLog)

class GanetiContext(object):
  """Context common to all ganeti threads.

  This class creates and holds common objects shared by all threads.

  """
  # The one live instance, or None while no context has been fully built
  _instance = None

  def __init__(self):
    """Constructs a new GanetiContext object.

    There should be only a GanetiContext object at any time, so this
    function raises an error if this is not the case.

    """
    assert self.__class__._instance is None, "double GanetiContext instance"

    # Create global configuration object
    self.cfg = config.ConfigWriter()

    # Locking manager
    self.glm = locking.GanetiLockManager(
                self.cfg.GetNodeList(),
                self.cfg.GetInstanceList())

    # Job queue
    self.jobqueue = jqueue.JobQueue(self)

    # setting this also locks the class against attribute modifications
    self.__class__._instance = self

  def __setattr__(self, name, value):
    """Setting GanetiContext attributes is forbidden after initialization.

    """
    assert self.__class__._instance is None, "Attempt to modify Ganeti Context"
    object.__setattr__(self, name, value)

  def AddNode(self, node):
    """Adds a node to the configuration and lock manager.

    Args:
      node: the node object; its name attribute is used for the job
            queue and the lock manager

    """
    # Add it to the configuration
    self.cfg.AddNode(node)

    # If preseeding fails it'll not be added
    self.jobqueue.AddNode(node.name)

    # Add the new node to the Ganeti Lock Manager
    self.glm.add(locking.LEVEL_NODE, node.name)

  def ReaddNode(self, node):
    """Updates a node that's already in the configuration

    Args:
      node: the node object; only its name attribute is used

    """
    # Synchronize the queue again
    self.jobqueue.AddNode(node.name)

  def RemoveNode(self, name):
    """Removes a node from the configuration and lock manager.

    Args:
      name: the name of the node to remove

    """
    # Remove node from configuration
    self.cfg.RemoveNode(name)

    # Notify job queue
    self.jobqueue.RemoveNode(name)

    # Remove the node from the Ganeti Lock Manager
    self.glm.remove(locking.LEVEL_NODE, name)

def ParseOptions():
  """Build the option parser of the daemon and parse the command line.

  Two flags are understood: -f/--foreground stores False in the "fork"
  option (which defaults to True) and -d/--debug stores True in the
  "debug" option (which defaults to False).

  Returns:
    (options, args) as from OptionParser.parse_args()

  """
  version_text = "%%prog (ganeti) %s" % constants.RELEASE_VERSION
  parser = OptionParser(description="Ganeti master daemon",
                        usage="%prog [-f] [-d]",
                        version=version_text)

  flags = [
    (("-f", "--foreground"),
     dict(dest="fork", default=True, action="store_false",
          help="Don't detach from the current terminal")),
    (("-d", "--debug"),
     dict(dest="debug", default=False, action="store_true",
          help="Enable some debug messages")),
    ]
  for names, settings in flags:
    parser.add_option(*names, **settings)

  return parser.parse_args()


def CheckAgreement():
  """Check the agreement on who is the master.

  The function uses a very simple algorithm: we must get more positive
  than negative answers. Since in most of the cases we are the master,
  we'll use our own config file for getting the node list. In the
  future we could collect the current node list from our (possibly
  obsolete) known nodes.

  Returns:
    True if there are no other nodes to ask or if strictly more nodes
    name us as the master than name somebody else, False otherwise

  """
  myself = utils.HostInfo().name
  #temp instantiation of a config writer, used only to get the node list
  cfg = config.ConfigWriter()
  node_list = cfg.GetNodeList()
  del cfg
  try:
    node_list.remove(myself)
  except (KeyError, ValueError):
    # We are not in the node list: a list signals this with ValueError,
    # a set-like container with KeyError
    pass
  if not node_list:
    # either single node cluster, or a misconfiguration, but I won't
    # break any other node, so I can proceed
    return True
  results = rpc.call_master_info(node_list)
  if not isinstance(results, dict):
    # this should not happen (unless internal error in rpc)
    logging.critical("Can't complete rpc call, aborting master startup")
    return False
  positive = negative = 0
  # Maps each other node named as master to the number of votes it got
  other_masters = {}
  for node in results:
    answer = results[node]
    if not isinstance(answer, (tuple, list)) or len(answer) < 3:
      # Unusable answers count neither for nor against us
      logging.warning("Can't contact node %s", node)
      continue
    # The third element of the answer is the master as seen by that node
    master_node = answer[2]
    if master_node == myself:
      positive += 1
    else:
      negative += 1
      other_masters[master_node] = other_masters.get(master_node, 0) + 1
  if positive <= negative:
    # bad!
    logging.critical("It seems we are not the master (%d votes for,"
                     " %d votes against)", positive, negative)
    if len(other_masters) > 1:
      logging.critical("The other nodes do not agree on a single master")
    elif other_masters:
      # TODO: resync my files from the master
      logging.critical("It seems the real master is %s",
                       other_masters.keys()[0])
    else:
      logging.critical("Can't contact any node for data, aborting startup")
    return False
  return True


def main():
  """Main function

  Parses the options, verifies that this node is the master and that
  the other nodes agree, optionally daemonizes, and then serves client
  requests until the server loop ends. Server cleanup and the removal
  of the PID file are done even if the server loop raises an exception.

  """

  options, args = ParseOptions()
  utils.debug = options.debug
  utils.no_fork = True

  ssconf.CheckMaster(options.debug)

  # we believe we are the master, let's ask the other nodes...
  if not CheckAgreement():
    return

  master = IOServer(constants.MASTER_SOCKET, ClientRqHandler)

  # become a daemon
  if options.fork:
    # the already bound server socket must survive daemonization
    utils.Daemonize(logfile=constants.LOG_MASTERDAEMON,
                    noclose_fds=[master.fileno()])

  utils.WritePidFile(constants.MASTERD_PID)

  logger.SetupLogging(constants.LOG_MASTERDAEMON, debug=options.debug,
                      stderr_logging=not options.fork)

  logging.info("ganeti master daemon startup")

  # activate ip
  master_node = ssconf.SimpleStore().GetMasterNode()
  if not rpc.call_node_start_master(master_node, False):
    logging.error("Can't activate master IP address")

  master.setup_queue()
  try:
    master.serve_forever()
  finally:
    master.server_cleanup()
    utils.RemovePidFile(constants.MASTERD_PID)

# Start the daemon only when run as a script, not when imported
if __name__ == "__main__":
  main()