From 81198f6eb6f9f3070cf946ffbd35bf4964b49049 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Wed, 27 Jan 2010 13:01:52 +0100
Subject: [PATCH] node daemon: allow working with broken queue dir

In case the queue dir cannot be created/initialized, currently
ganeti-noded exits. This means that a read-only filesystem or a
permission error breaks all node daemon functionality, including
powercycle. This is not good for the usual failure case for nodes.

To work around this, we don't require successful initialization at node
daemon startup; if we can't init the queue dir/lock, we retry at every
RPC call requiring a job queue lock, and if we still can't acquire the
lock, we raise an exception (which is caught in HandleRequest and
transformed into an RPC failure).

This allows the node daemon to start in the face of queue issues, and
the master node to power-cycle it.

Signed-off-by: Iustin Pop <iustin@google.com>
Reviewed-by: Michael Hanselmann <hansmi@google.com>
---
 daemons/ganeti-noded | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/daemons/ganeti-noded b/daemons/ganeti-noded
index fce0d2904..7b030a6ee 100755
--- a/daemons/ganeti-noded
+++ b/daemons/ganeti-noded
@@ -52,6 +52,25 @@ import ganeti.http.server # pylint: disable-msg=W0611
 queue_lock = None
 
 
+def _PrepareQueueLock():
+  """Try to prepare the queue lock.
+
+  @return: None for success, otherwise an exception object
+
+  """
+  global queue_lock # pylint: disable-msg=W0603
+
+  if queue_lock is not None:
+    return None
+
+  # Prepare job queue
+  try:
+    queue_lock = jstore.InitAndVerifyQueue(must_lock=False)
+    return None
+  except EnvironmentError, err:
+    return err
+
+
 def _RequireJobQueueLock(fn):
   """Decorator for job queue manipulating functions.
 
@@ -61,6 +80,9 @@ def _RequireJobQueueLock(fn):
   def wrapper(*args, **kwargs):
     # Locking in exclusive, blocking mode because there could be several
     # children running at the same time. Waiting up to 10 seconds.
+    if _PrepareQueueLock() is not None:
+      raise errors.JobQueueError("Job queue failed initialization,"
+                                 " cannot update jobs")
     queue_lock.Exclusive(blocking=True, timeout=QUEUE_LOCK_TIMEOUT)
     try:
       return fn(*args, **kwargs)
@@ -803,8 +825,6 @@ def ExecNoded(options, _):
   """Main node daemon function, executed with the PID file held.
 
   """
-  global queue_lock # pylint: disable-msg=W0603
-
   # Read SSL certificate
   if options.ssl:
     ssl_params = http.HttpSslParams(ssl_key_path=options.ssl_key,
@@ -812,8 +832,12 @@ def ExecNoded(options, _):
   else:
     ssl_params = None
 
-  # Prepare job queue
-  queue_lock = jstore.InitAndVerifyQueue(must_lock=False)
+  err = _PrepareQueueLock()
+  if err is not None:
+    # this might be some kind of file-system/permission error; while
+    # this breaks the job queue functionality, we shouldn't prevent
+    # startup of the whole node daemon because of this
+    logging.critical("Can't init/verify the queue, proceeding anyway: %s", err)
 
   mainloop = daemon.Mainloop()
   server = NodeHttpServer(mainloop, options.bind_address, options.port,
-- 
GitLab