From 81198f6eb6f9f3070cf946ffbd35bf4964b49049 Mon Sep 17 00:00:00 2001 From: Iustin Pop <iustin@google.com> Date: Wed, 27 Jan 2010 13:01:52 +0100 Subject: [PATCH] node daemon: allow working with broken queue dir In case the queue dir cannot be created/initialized, currently ganeti-noded exits. This means that a read-only filesystem or a permission error breaks all node daemon functionality, including powercycle. This is not good for the usual failure case for nodes. To work around this, we don't require successful initialization at node daemon startup; if we can't init the queue dir/lock, we retry at every RPC call requiring a job queue lock, and if we still can't acquire the lock, we raise an exception (which is caught in HandleRequest and transformed into an RPC failure). This allows the node daemon to start in the face of queue issues, and the master node to power-cycle it. Signed-off-by: Iustin Pop <iustin@google.com> Reviewed-by: Michael Hanselmann <hansmi@google.com> --- daemons/ganeti-noded | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/daemons/ganeti-noded b/daemons/ganeti-noded index fce0d2904..7b030a6ee 100755 --- a/daemons/ganeti-noded +++ b/daemons/ganeti-noded @@ -52,6 +52,25 @@ import ganeti.http.server # pylint: disable-msg=W0611 queue_lock = None +def _PrepareQueueLock(): + """Try to prepare the queue lock. + + @return: None for success, otherwise an exception object + + """ + global queue_lock # pylint: disable-msg=W0603 + + if queue_lock is not None: + return None + + # Prepare job queue + try: + queue_lock = jstore.InitAndVerifyQueue(must_lock=False) + return None + except EnvironmentError, err: + return err + + def _RequireJobQueueLock(fn): """Decorator for job queue manipulating functions. @@ -61,6 +80,9 @@ def _RequireJobQueueLock(fn): def wrapper(*args, **kwargs): # Locking in exclusive, blocking mode because there could be several # children running at the same time. Waiting up to 10 seconds. 
+ if _PrepareQueueLock() is not None: + raise errors.JobQueueError("Job queue failed initialization," + " cannot update jobs") queue_lock.Exclusive(blocking=True, timeout=QUEUE_LOCK_TIMEOUT) try: return fn(*args, **kwargs) @@ -803,8 +825,6 @@ def ExecNoded(options, _): """Main node daemon function, executed with the PID file held. """ - global queue_lock # pylint: disable-msg=W0603 - # Read SSL certificate if options.ssl: ssl_params = http.HttpSslParams(ssl_key_path=options.ssl_key, @@ -812,8 +832,12 @@ def ExecNoded(options, _): else: ssl_params = None - # Prepare job queue - queue_lock = jstore.InitAndVerifyQueue(must_lock=False) + err = _PrepareQueueLock() + if err is not None: + # this might be some kind of file-system/permission error; while + # this breaks the job queue functionality, we shouldn't prevent + # startup of the whole node daemon because of this + logging.critical("Can't init/verify the queue, proceeding anyway: %s", err) mainloop = daemon.Mainloop() server = NodeHttpServer(mainloop, options.bind_address, options.port, -- GitLab