Commit cc962d58 authored by Iustin Pop
Browse files

watcher: fix startup sequence locking the master

Currently, the watcher startup sequence does:
  - open a luxi client
  - get the instance list
  - get the node boot ids
  - open and lock the status file, and:
    - archive jobs
    - restart the down instances
    - check disks

This, of course, can lead to problems when a node is (genuinely or not)
locked for longer than (watcher interval * maximum query clients). At
that point, the master is completely unresponsive until the node is
unlocked and all the watchers exit with an error due to the state file
being locked by the first instance.

This patch reworks the startup sequence to first open/lock the status
file, and only then open a luxi client. This should prevent the above
problem.

Reviewed-by: ultrotter
parent c614e5fb
...@@ -274,7 +274,8 @@ class Watcher(object): ...@@ -274,7 +274,8 @@ class Watcher(object):
to restart machines that are down. to restart machines that are down.
""" """
def __init__(self, opts): def __init__(self, opts, notepad):
self.notepad = notepad
master = client.QueryConfigValues(["master_node"])[0] master = client.QueryConfigValues(["master_node"])[0]
if master != utils.HostInfo().name: if master != utils.HostInfo().name:
raise NotMasterError("This is not the master node") raise NotMasterError("This is not the master node")
...@@ -284,14 +285,14 @@ class Watcher(object): ...@@ -284,14 +285,14 @@ class Watcher(object):
self.opts = opts self.opts = opts
def Run(self): def Run(self):
notepad = WatcherState() """Watcher run sequence.
notepad = self.notepad
self.ArchiveJobs(self.opts.job_age) self.ArchiveJobs(self.opts.job_age)
self.CheckInstances(notepad) self.CheckInstances(notepad)
self.CheckDisks(notepad) self.CheckDisks(notepad)
self.VerifyDisks() self.VerifyDisks()
def ArchiveJobs(self, age): def ArchiveJobs(self, age):
"""Archive old jobs. """Archive old jobs.
...@@ -434,16 +435,20 @@ def main(): ...@@ -434,16 +435,20 @@ def main():
utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug, utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
stderr_logging=options.debug) stderr_logging=options.debug)
notepad = WatcherState()
try: try:
client = cli.GetClient() client = cli.GetClient()
try: try:
watcher = Watcher(options) watcher = Watcher(options, notepad)
except errors.ConfigurationError: except errors.ConfigurationError:
# Just exit if there's no configuration # Just exit if there's no configuration
sys.exit(constants.EXIT_SUCCESS) sys.exit(constants.EXIT_SUCCESS)
watcher.Run() watcher.Run()
except SystemExit: except SystemExit:
raise raise
except NotMasterError: except NotMasterError:
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment