From 001b3825794f9306d7c973711768dd3abfd4e8d6 Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <hansmi@google.com>
Date: Fri, 26 Feb 2010 16:42:13 +0100
Subject: [PATCH] watcher: Acquire lock early and give more friendly message

The watcher now opens and locks its state file early. Other programs
can therefore lock the state file to prevent ganeti-watcher from
restarting daemons; using the pause feature for this is inherently
prone to race conditions.

Before, a traceback was logged when the lock on the state file
couldn't be acquired. Now a friendlier message is logged instead.

Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>
Reviewed-by: Iustin Pop <iustin@google.com>
---
 daemons/ganeti-watcher | 46 ++++++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/daemons/ganeti-watcher b/daemons/ganeti-watcher
index 7fc2dc537..c56186143 100755
--- a/daemons/ganeti-watcher
+++ b/daemons/ganeti-watcher
@@ -106,23 +106,19 @@ def RunWatcherHooks():
         logging.debug("Watcher hook %s: success (output: %s)", relname,
                       runresult.output)
 
+
 class WatcherState(object):
   """Interface to a state file recording restart attempts.
 
   """
-  def __init__(self):
+  def __init__(self, statefile):
     """Open, lock, read and parse the file.
 
-    Raises exception on lock contention.
+    @type statefile: file
+    @param statefile: State file object
 
     """
-    # The two-step dance below is necessary to allow both opening existing
-    # file read/write and creating if not existing.  Vanilla open will truncate
-    # an existing file -or- allow creating if not existing.
-    fd = os.open(constants.WATCHER_STATEFILE, os.O_RDWR | os.O_CREAT)
-    self.statefile = os.fdopen(fd, 'w+')
-
-    utils.LockFile(self.statefile.fileno())
+    self.statefile = statefile
 
     try:
       state_data = self.statefile.read()
@@ -458,6 +454,30 @@ class Watcher(object):
       logging.exception("Error while activating disks")
 
 
+def OpenStateFile(path):
+  """Opens the state file and acquires a lock on it.
+
+  @type path: string
+  @param path: Path to state file
+  @return: Locked state file object, or None if the lock can't be acquired
+  """
+  # The two-step dance below is necessary to allow both opening existing
+  # file read/write and creating if not existing. Vanilla open will truncate
+  # an existing file -or- allow creating if not existing.
+  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)
+
+  # If locking fails, another watcher instance might already be running or
+  # another program is temporarily blocking the watcher from running.
+  try:
+    utils.LockFile(statefile_fd)
+  except errors.LockError, err:
+    logging.error("Can't acquire lock on state file %s: %s", path, err)
+    os.close(statefile_fd)  # Don't leak the descriptor on failure
+    return None
+
+  return os.fdopen(statefile_fd, "w+")
+
+
 def ParseOptions():
   """Parse the command line options.
 
@@ -497,12 +517,16 @@ def main():
     logging.debug("Pause has been set, exiting")
     sys.exit(constants.EXIT_SUCCESS)
 
+  statefile = OpenStateFile(constants.WATCHER_STATEFILE)
+  if not statefile:
+    sys.exit(constants.EXIT_FAILURE)
+
   update_file = False
   try:
     StartNodeDaemons()
     RunWatcherHooks()
 
-    notepad = WatcherState()
+    notepad = WatcherState(statefile)
     try:
       try:
         client = cli.GetClient()
@@ -551,7 +575,7 @@ def main():
   except errors.JobQueueDrainError:
     logging.error("Job queue is drained, can't maintain cluster state")
   except Exception, err:
-    logging.error(str(err), exc_info=True)
+    logging.exception(str(err))
     sys.exit(constants.EXIT_FAILURE)
 
 
-- 
GitLab