Commit e4e35357 authored by Michael Hanselmann's avatar Michael Hanselmann
Browse files

locking: Fix race condition in lock monitor



In some rare cases it can happen that a lock is re-created very soon
after deletion, while the old instance hasn't been destructed yet. In
such a case the code would detect a duplicate name and raise an
exception.

We have seen at least one case where this happened during the creation
of many instances. It is not exactly clear how it came to be, but it
appears to have occurred while different jobs fought for locks with
short timeouts (in the case of instance creation locks are added at this
stage and removed shortly after if not all locks can be acquired).

The issue is fixed by removing the check for duplicate names. To still
guarantee a stable sort order for the lock information as shown by
“gnt-debug locks”, a registration number is recorded for each lock in
the monitor.

A unittest is included to check for the situation.
Signed-off-by: default avatarMichael Hanselmann <hansmi@google.com>
Reviewed-by: default avatarIustin Pop <iustin@google.com>
parent 7d4da09e
......@@ -33,6 +33,7 @@ import weakref
import logging
import heapq
import operator
import itertools
from ganeti import errors
from ganeti import utils
......@@ -1514,6 +1515,17 @@ class GanetiLockManager:
return self.__keyring[level].remove(names)
def _MonitorSortKey((num, item)):
"""Sorting key function.
Sort by name, then by incoming order.
"""
(name, _, _, _) = item
return (utils.NiceSortKey(name), num)
class LockMonitor(object):
_LOCK_ATTR = "_lock"
......@@ -1523,6 +1535,9 @@ class LockMonitor(object):
"""
self._lock = SharedLock("LockMonitor")
# Counter for stable sorting
self._counter = itertools.count(0)
# Tracked locks. Weak references are used to avoid issues with circular
# references and deletion.
self._locks = weakref.WeakKeyDictionary()
......@@ -1534,16 +1549,21 @@ class LockMonitor(object):
"""
logging.debug("Registering lock %s", lock.name)
assert lock not in self._locks, "Duplicate lock registration"
assert not compat.any(lock.name == i.name for i in self._locks.keys()), \
"Found duplicate lock name"
self._locks[lock] = None
# There used to be a check for duplicate names here. As it turned out, when
# a lock is re-created with the same name in a very short timeframe, the
# previous instance might not yet be removed from the weakref dictionary.
# By keeping track of the order of incoming registrations, a stable sort
# ordering can still be guaranteed.
self._locks[lock] = self._counter.next()
@ssynchronized(_LOCK_ATTR)
def _GetLockInfo(self, requested):
"""Get information from all locks while the monitor lock is held.
"""
return [lock.GetInfo(requested) for lock in self._locks.keys()]
return [(num, lock.GetInfo(requested)) for lock, num in self._locks.items()]
def _Query(self, fields):
"""Queries information from all locks.
......@@ -1554,11 +1574,13 @@ class LockMonitor(object):
"""
qobj = query.Query(query.LOCK_FIELDS, fields)
# Get all data and sort by name
lockinfo = utils.NiceSort(self._GetLockInfo(qobj.RequestedData()),
key=operator.itemgetter(0))
# Get all data with internal lock held and then sort by name and incoming
# order
lockinfo = sorted(self._GetLockInfo(qobj.RequestedData()),
key=_MonitorSortKey)
return (qobj, query.LockQueryData(lockinfo))
# Extract lock information and build query data
return (qobj, query.LockQueryData(map(operator.itemgetter(1), lockinfo)))
def QueryLocks(self, fields):
"""Queries information from all locks.
......
......@@ -28,6 +28,7 @@ import time
import Queue
import threading
import random
import gc
import itertools
from ganeti import constants
......@@ -1914,6 +1915,80 @@ class TestLockMonitor(_ThreadedTestCase):
self.assertEqual(len(self.lm._locks), 1)
def testDeleteAndRecreate(self):
lname = "TestLock101923193"
# Create some locks with the same name and keep all references
locks = [locking.SharedLock(lname, monitor=self.lm)
for _ in range(5)]
self.assertEqual(len(self.lm._locks), len(locks))
result = self.lm.QueryLocks(["name", "mode", "owner"])
self.assertEqual(objects.QueryResponse.FromDict(result).data,
[[(constants.RS_NORMAL, lname),
(constants.RS_NORMAL, None),
(constants.RS_NORMAL, None)]] * 5)
locks[2].delete()
# Check information order
result = self.lm.QueryLocks(["name", "mode", "owner"])
self.assertEqual(objects.QueryResponse.FromDict(result).data,
[[(constants.RS_NORMAL, lname),
(constants.RS_NORMAL, None),
(constants.RS_NORMAL, None)]] * 2 +
[[(constants.RS_NORMAL, lname),
(constants.RS_NORMAL, "deleted"),
(constants.RS_NORMAL, None)]] +
[[(constants.RS_NORMAL, lname),
(constants.RS_NORMAL, None),
(constants.RS_NORMAL, None)]] * 2)
locks[1].acquire(shared=0)
last_status = [
[(constants.RS_NORMAL, lname),
(constants.RS_NORMAL, None),
(constants.RS_NORMAL, None)],
[(constants.RS_NORMAL, lname),
(constants.RS_NORMAL, "exclusive"),
(constants.RS_NORMAL, [threading.currentThread().getName()])],
[(constants.RS_NORMAL, lname),
(constants.RS_NORMAL, "deleted"),
(constants.RS_NORMAL, None)],
[(constants.RS_NORMAL, lname),
(constants.RS_NORMAL, None),
(constants.RS_NORMAL, None)],
[(constants.RS_NORMAL, lname),
(constants.RS_NORMAL, None),
(constants.RS_NORMAL, None)],
]
# Check information order
result = self.lm.QueryLocks(["name", "mode", "owner"])
self.assertEqual(objects.QueryResponse.FromDict(result).data, last_status)
self.assertEqual(len(set(self.lm._locks.values())), len(locks))
self.assertEqual(len(self.lm._locks), len(locks))
# Check lock deletion
for idx in range(len(locks)):
del locks[0]
assert gc.isenabled()
gc.collect()
self.assertEqual(len(self.lm._locks), len(locks))
result = self.lm.QueryLocks(["name", "mode", "owner"])
self.assertEqual(objects.QueryResponse.FromDict(result).data,
last_status[idx + 1:])
# All locks should have been deleted
assert not locks
self.assertFalse(self.lm._locks)
result = self.lm.QueryLocks(["name", "mode", "owner"])
self.assertEqual(objects.QueryResponse.FromDict(result).data, [])
if __name__ == '__main__':
testutils.GanetiTestProgram()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment