Commit af7b6689 authored by Michael Hanselmann

Merge branch 'devel-2.4'



* devel-2.4:
  LUInstanceQueryData: Don't acquire locks unless requested
  Increase the lock timeouts before we block-acquire
  daemon.py: move startup log message before prep_fn
  Display the actual memory values in N+1 failures
  ssh.VerifyNodeHostname: remove the quiet flag
  Add error checking and merging for cluster params
  RAPI: Document need for Content-type header in requests
  Fix output for “gnt-job info”
  watcher: Fix misleading usage output
  Clarify --force-join parameter message
  locking: Fix race condition in lock monitor
  utils: Export NiceSortKey function
  Revert "Only merge nodes that are known to not be offline"
  cluster-merge: only operate on online nodes
  Only merge nodes that are known to not be offline
  Treat empty oob_program param as default
  Fix bug in instance listing with orphan instances
  Fix bug related to log opening failures
  Bump version for 2.4.1 release
  cfgupgrade: Fix critical bug overwriting RAPI users file

Conflicts:
	NEWS: Trivial
	lib/opcodes.py: Added parameter descriptions, used variable for
	  "use_locking"
Signed-off-by: Michael Hanselmann <hansmi@google.com>
Reviewed-by: René Nussbaumer <rn@google.com>
parents 9470b6ee 76ae1d65
@@ -18,6 +18,17 @@ Version 2.5.0 beta1
   since Ganeti 2.1.3 and :doc:`documented <rapi>`, instead.
 
 
+Version 2.4.1
+-------------
+
+*(Released Wed, 09 Mar 2011)*
+
+Emergency bug-fix release. ``tools/cfgupgrade`` was broken and overwrote
+the RAPI users file if run twice (even with ``--dry-run``).
+
+The release fixes that bug (nothing else changed).
+
+
 Version 2.4.0
 -------------
......
 # Configure script for Ganeti
 m4_define([gnt_version_major], [2])
 m4_define([gnt_version_minor], [4])
-m4_define([gnt_version_revision], [0])
+m4_define([gnt_version_revision], [1])
 m4_define([gnt_version_suffix], [])
 m4_define([gnt_version_full],
           m4_format([%d.%d.%d%s],
......
@@ -70,6 +70,10 @@ HTTP Basic authentication as per :rfc:`2617` is supported.
 .. _JSON: http://www.json.org/
 .. _REST: http://en.wikipedia.org/wiki/Representational_State_Transfer
 
+HTTP requests with a body (e.g. ``PUT`` or ``POST``) require the request
+header ``Content-type`` be set to ``application/json`` (see :rfc:`2616`
+(HTTP/1.1), section 7.2.1).
+
 A note on JSON as used by RAPI
 ++++++++++++++++++++++++++++++
......
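For illustration, a request satisfying this requirement could be made as in the
following sketch (host name, resource and payload are placeholders, not a real
RAPI call; the RAPI daemon listens on port 5080 by default):

  import httplib

  try:
    import json
  except ImportError:
    import simplejson as json  # fallback for older Python versions

  # Hypothetical master name and resource; adjust for a real cluster
  conn = httplib.HTTPSConnection("cluster.example.com", 5080)
  body = json.dumps({"example": "data"})
  conn.request("POST", "/2/example-resource", body,
               {"Content-type": "application/json"})
  resp = conn.getresponse()
  print resp.status, resp.reason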
@@ -905,8 +905,7 @@ NOSSH_KEYCHECK_OPT = cli_option("--no-ssh-key-check", dest="ssh_key_check",
 
 NODE_FORCE_JOIN_OPT = cli_option("--force-join", dest="force_join",
                                  default=False, action="store_true",
-                                 help="Force the joining of a node,"
-                                 " needed when merging clusters")
+                                 help="Force the joining of a node")
 
 MC_OPT = cli_option("-C", "--master-candidate", dest="master_candidate",
                     type="bool", default=None, metavar=_YORNO,
......
@@ -317,6 +317,7 @@ def ShowJobs(opts, args):
         if not result:
           format_msg(3, "Result: empty dictionary")
         else:
+          format_msg(3, "Result:")
           for key, val in result.iteritems():
             format_msg(4, "%s: %s" % (key, result_helper(val)))
       else:
......
@@ -1695,7 +1695,8 @@ class LUClusterVerify(LogicalUnit):
       test = n_img.mfree < needed_mem
       self._ErrorIf(test, self.ENODEN1, node,
                     "not enough memory to accomodate instance failovers"
-                    " should node %s fail", prinode)
+                    " should node %s fail (%dMiB needed, %dMiB available)",
+                    prinode, needed_mem, n_img.mfree)
 
   @classmethod
   def _VerifyFiles(cls, errorif, nodeinfo, master_node, all_nvinfo,
@@ -2912,6 +2913,12 @@ class LUClusterSetParams(LogicalUnit):
       utils.ForceDictType(self.op.ndparams, constants.NDS_PARAMETER_TYPES)
       self.new_ndparams = cluster.SimpleFillND(self.op.ndparams)
 
+      # TODO: we need a more general way to handle resetting
+      # cluster-level parameters to default values
+      if self.new_ndparams["oob_program"] == "":
+        self.new_ndparams["oob_program"] = \
+            constants.NDC_DEFAULTS[constants.ND_OOB_PROGRAM]
+
     if self.op.nicparams:
       utils.ForceDictType(self.op.nicparams, constants.NICS_PARAMETER_TYPES)
       self.new_nicparams = cluster.SimpleFillNIC(self.op.nicparams)
@@ -4124,10 +4131,16 @@ class _InstanceQuery(_QueryBase):
           bad_nodes.append(name)
         elif result.payload:
           for inst in result.payload:
-            if all_info[inst].primary_node == name:
-              live_data.update(result.payload)
+            if inst in all_info:
+              if all_info[inst].primary_node == name:
+                live_data.update(result.payload)
+              else:
+                wrongnode_inst.add(inst)
             else:
-              wrongnode_inst.add(inst)
+              # orphan instance; we don't list it here as we don't
+              # handle this case yet in the output of instance listing
+              logging.warning("Orphan instance '%s' found on node %s",
+                              inst, name)
         # else no instance is alive
       else:
         live_data = {}
@@ -9423,23 +9436,33 @@ class LUInstanceQueryData(NoHooksLU):
 
   def ExpandNames(self):
     self.needed_locks = {}
-    self.share_locks = dict.fromkeys(locking.LEVELS, 1)
 
-    if self.op.instances:
-      self.wanted_names = []
-      for name in self.op.instances:
-        full_name = _ExpandInstanceName(self.cfg, name)
-        self.wanted_names.append(full_name)
-      self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
+    # Use locking if requested or when non-static information is wanted
+    if not (self.op.static or self.op.use_locking):
+      self.LogWarning("Non-static data requested, locks need to be acquired")
+      self.op.use_locking = True
+
+    if self.op.instances or not self.op.use_locking:
+      # Expand instance names right here
+      self.wanted_names = _GetWantedInstances(self, self.op.instances)
     else:
+      # Will use acquired locks
      self.wanted_names = None
-      self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
 
-    self.needed_locks[locking.LEVEL_NODE] = []
-    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
+    if self.op.use_locking:
+      self.share_locks = dict.fromkeys(locking.LEVELS, 1)
+
+      if self.wanted_names is None:
+        self.needed_locks[locking.LEVEL_INSTANCE] = locking.ALL_SET
+      else:
+        self.needed_locks[locking.LEVEL_INSTANCE] = self.wanted_names
+
+      self.needed_locks[locking.LEVEL_NODE] = []
+      self.share_locks = dict.fromkeys(locking.LEVELS, 1)
+      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
 
   def DeclareLocks(self, level):
-    if level == locking.LEVEL_NODE:
+    if self.op.use_locking and level == locking.LEVEL_NODE:
       self._LockInstancesNodes()
 
   def CheckPrereq(self):
@@ -9449,10 +9472,11 @@ class LUInstanceQueryData(NoHooksLU):
 
     """
     if self.wanted_names is None:
+      assert self.op.use_locking, "Locking was not used"
       self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
 
-    self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
-                             in self.wanted_names]
+    self.wanted_instances = [self.cfg.GetInstanceInfo(name)
+                             for name in self.wanted_names]
 
   def _ComputeBlockdevStatus(self, node, instance_name, dev):
     """Returns the status of a block device
@@ -9498,7 +9522,7 @@ class LUInstanceQueryData(NoHooksLU):
     else:
       dev_children = []
 
-    data = {
+    return {
       "iv_name": dev.iv_name,
       "dev_type": dev.dev_type,
      "logical_id": dev.logical_id,
@@ -9510,8 +9534,6 @@ class LUInstanceQueryData(NoHooksLU):
       "size": dev.size,
       }
 
-    return data
-
   def Exec(self, feedback_fn):
     """Gather and return data"""
     result = {}
@@ -9539,7 +9561,7 @@ class LUInstanceQueryData(NoHooksLU):
       disks = [self._ComputeDiskStatus(instance, None, device)
                for device in instance.disks]
 
-      idict = {
+      result[instance.name] = {
         "name": instance.name,
         "config_state": config_state,
         "run_state": remote_state,
@@ -9564,8 +9586,6 @@ class LUInstanceQueryData(NoHooksLU):
         "uuid": instance.uuid,
         }
 
-      result[instance.name] = idict
-
     return result
......
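The locking behaviour introduced above can be summarized by this small
standalone sketch (function and variable names are made up for the example;
the real logic lives in LUInstanceQueryData.ExpandNames):

  def decide_locking(static, use_locking, instances):
    # Non-static data cannot be returned safely without locks
    if not (static or use_locking):
      use_locking = True
    # Expand names eagerly unless all instances are wanted and locks
    # will be acquired anyway (the acquired locks then name them)
    expand_now = bool(instances) or not use_locking
    return (use_locking, expand_now)

  assert decide_locking(True, False, []) == (False, True)
  assert decide_locking(False, False, []) == (True, False)
  assert decide_locking(True, True, ["inst1"]) == (True, True)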
@@ -432,10 +432,18 @@ EXPORT_MODES = frozenset([
   EXPORT_MODE_REMOTE,
   ])
 
-# lock recalculate mode
+# Lock recalculate mode
 LOCKS_REPLACE = 'replace'
 LOCKS_APPEND = 'append'
 
+# Lock timeout (sum) before we should go into blocking acquire (still
+# can be reset by priority change); computed as max time (10 hours)
+# before we should actually go into blocking acquire given that we
+# start from default priority level; in seconds
+LOCK_ATTEMPTS_TIMEOUT = 10 * 3600 / 20.0
+LOCK_ATTEMPTS_MAXWAIT = 15.0
+LOCK_ATTEMPTS_MINWAIT = 1.0
+
 # instance creation modes
 INSTANCE_CREATE = "create"
 INSTANCE_IMPORT = "import"
......
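For reference, these constants put the retry budget at 10 * 3600 / 20.0 =
1800 seconds: opportunistic acquires are retried, with individual waits
between 1 and 15 seconds, for roughly 30 minutes before falling back to a
blocking acquire.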
 #
 #
-# Copyright (C) 2006, 2007, 2008, 2010 Google Inc.
+# Copyright (C) 2006, 2007, 2008, 2010, 2011 Google Inc.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -682,11 +682,11 @@ def GenericMain(daemon_name, optionparser,
   utils.WritePidFile(utils.DaemonPidFileName(daemon_name))
   try:
     try:
+      logging.info("%s daemon startup", daemon_name)
       if callable(prepare_fn):
         prep_results = prepare_fn(options, args)
       else:
         prep_results = None
-      logging.info("%s daemon startup", daemon_name)
     except Exception, err:
       utils.WriteErrorToFD(wpipe, _BeautifyError(err))
       raise
......
@@ -33,6 +33,7 @@ import weakref
 import logging
 import heapq
 import operator
+import itertools
 
 from ganeti import errors
 from ganeti import utils
@@ -1514,6 +1515,17 @@ class GanetiLockManager:
     return self.__keyring[level].remove(names)
 
 
+def _MonitorSortKey((num, item)):
+  """Sorting key function.
+
+  Sort by name, then by incoming order.
+
+  """
+  (name, _, _, _) = item
+
+  return (utils.NiceSortKey(name), num)
+
+
 class LockMonitor(object):
   _LOCK_ATTR = "_lock"
@@ -1523,6 +1535,9 @@ class LockMonitor(object):
 
     """
     self._lock = SharedLock("LockMonitor")
 
+    # Counter for stable sorting
+    self._counter = itertools.count(0)
+
     # Tracked locks. Weak references are used to avoid issues with circular
     # references and deletion.
     self._locks = weakref.WeakKeyDictionary()
@@ -1534,16 +1549,21 @@ class LockMonitor(object):
 
     """
     logging.debug("Registering lock %s", lock.name)
     assert lock not in self._locks, "Duplicate lock registration"
-    assert not compat.any(lock.name == i.name for i in self._locks.keys()), \
-      "Found duplicate lock name"
-    self._locks[lock] = None
+
+    # There used to be a check for duplicate names here. As it turned out,
+    # when a lock is re-created with the same name in a very short timeframe,
+    # the previous instance might not yet be removed from the weakref
+    # dictionary. By keeping track of the order of incoming registrations, a
+    # stable sort ordering can still be guaranteed.
+    self._locks[lock] = self._counter.next()
 
   @ssynchronized(_LOCK_ATTR)
   def _GetLockInfo(self, requested):
     """Get information from all locks while the monitor lock is held.
 
     """
-    return [lock.GetInfo(requested) for lock in self._locks.keys()]
+    return [(num, lock.GetInfo(requested))
+            for lock, num in self._locks.items()]
 
   def _Query(self, fields):
     """Queries information from all locks.
@@ -1554,11 +1574,13 @@ class LockMonitor(object):
 
     """
     qobj = query.Query(query.LOCK_FIELDS, fields)
 
-    # Get all data and sort by name
-    lockinfo = utils.NiceSort(self._GetLockInfo(qobj.RequestedData()),
-                              key=operator.itemgetter(0))
+    # Get all data with internal lock held and then sort by name and incoming
+    # order
+    lockinfo = sorted(self._GetLockInfo(qobj.RequestedData()),
+                      key=_MonitorSortKey)
 
-    return (qobj, query.LockQueryData(lockinfo))
+    # Extract lock information and build query data
+    return (qobj, query.LockQueryData(map(operator.itemgetter(1), lockinfo)))
 
   def QueryLocks(self, fields):
     """Queries information from all locks.
......
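A standalone demonstration of the counter-based stable ordering used above,
assuming two locks registered under the same name in quick succession (plain
string ordering stands in for utils.NiceSortKey in this sketch):

  import itertools

  counter = itertools.count(0)

  # Simulated registrations: (registration number, lock name)
  registered = [(counter.next(), "config"), (counter.next(), "config")]

  # Sort by name first, then by incoming order, as _MonitorSortKey does
  print sorted(registered, key=lambda (num, name): (name, num))
  # -> [(0, 'config'), (1, 'config')]: registration order is preserved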
@@ -56,23 +56,23 @@ def _CalculateLockAttemptTimeouts():
   """Calculate timeouts for lock attempts.
 
   """
-  result = [1.0]
+  result = [constants.LOCK_ATTEMPTS_MINWAIT]
+  running_sum = result[0]
 
-  # Wait for a total of at least 150s before doing a blocking acquire
-  while sum(result) < 150.0:
+  # Wait for a total of at least LOCK_ATTEMPTS_TIMEOUT before doing a
+  # blocking acquire
+  while running_sum < constants.LOCK_ATTEMPTS_TIMEOUT:
     timeout = (result[-1] * 1.05) ** 1.25
 
-    # Cap timeout at 10 seconds. This gives other jobs a chance to run
-    # even if we're still trying to get our locks, before finally moving
-    # to a blocking acquire.
-    if timeout > 10.0:
-      timeout = 10.0
-    elif timeout < 0.1:
-      # Lower boundary for safety
-      timeout = 0.1
+    # Cap max timeout. This gives other jobs a chance to run even if
+    # we're still trying to get our locks, before finally moving to a
+    # blocking acquire.
+    timeout = min(timeout, constants.LOCK_ATTEMPTS_MAXWAIT)
+
+    # And also cap the lower boundary for safety
+    timeout = max(timeout, constants.LOCK_ATTEMPTS_MINWAIT)
 
     result.append(timeout)
+    running_sum += timeout
 
   return result
......
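As a rough illustration, the revised loop yields a sequence like the one
computed by this sketch (a re-implementation with the new constants inlined,
not the code above itself):

  def calculate_timeouts(minwait=1.0, maxwait=15.0, total=10 * 3600 / 20.0):
    # Growth rule as above: next = (prev * 1.05) ** 1.25, clamped to
    # [minwait, maxwait], until the sum of waits reaches "total"
    result = [minwait]
    running_sum = result[0]
    while running_sum < total:
      timeout = min(max((result[-1] * 1.05) ** 1.25, minwait), maxwait)
      result.append(timeout)
      running_sum += timeout
    return result

  timeouts = calculate_timeouts()
  print timeouts[:3], timeouts[-1], sum(timeouts)
  # Starts at 1.0s, grows slowly, then saturates at the 15s cap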
@@ -1119,8 +1119,12 @@ class OpInstanceQuery(OpCode):
 
 class OpInstanceQueryData(OpCode):
   """Compute the run-time status of instances."""
   OP_PARAMS = [
-    ("instances", ht.EmptyList, ht.TListOf(ht.TNonEmptyString), None),
-    ("static", False, ht.TBool, None),
+    _PUseLocking,
+    ("instances", ht.EmptyList, ht.TListOf(ht.TNonEmptyString),
+     "Instance names"),
+    ("static", False, ht.TBool,
+     "Whether to only return configuration data without querying"
+     " nodes"),
     ]
......
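A minimal sketch of how the extended opcode might be instantiated from client
code (the instance name is made up; like other opcodes, parameters are
assumed to be accepted as keyword arguments):

  from ganeti import opcodes

  # Configuration-only query for one instance, taking no locks
  op = opcodes.OpInstanceQueryData(instances=["instance1.example.com"],
                                   static=True, use_locking=False)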
 #
 #
-# Copyright (C) 2006, 2007, 2010 Google Inc.
+# Copyright (C) 2006, 2007, 2010, 2011 Google Inc.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -254,7 +254,7 @@ class SshRunner:
       - detail: string with details
 
     """
-    retval = self.Run(node, 'root', 'hostname --fqdn')
+    retval = self.Run(node, "root", "hostname --fqdn", quiet=False)
 
     if retval.failed:
       msg = "ssh problem"
......
@@ -246,9 +246,18 @@ def SetupLogging(logfile, program, debug=0, stderr_logging=False,
   # exception since otherwise we could run but without any logs at all
   try:
     if console_logging:
-      logfile_handler = _LogHandler(open(constants.DEV_CONSOLE, "a"), logfile)
+      logfile_handler = _LogHandler(open(constants.DEV_CONSOLE, "a"),
+                                    logfile)
     else:
       logfile_handler = _ReopenableLogHandler(logfile)
+
+    logfile_handler.setFormatter(formatter)
+    if debug:
+      logfile_handler.setLevel(logging.DEBUG)
+    else:
+      logfile_handler.setLevel(logging.INFO)
+    root_logger.addHandler(logfile_handler)
+    reopen_handlers.append(logfile_handler)
   except EnvironmentError:
     if stderr_logging or syslog == constants.SYSLOG_YES:
       logging.exception("Failed to enable logging to file '%s'", logfile)
@@ -256,13 +265,4 @@ def SetupLogging(logfile, program, debug=0, stderr_logging=False,
       # we need to re-raise the exception
       raise
 
-  logfile_handler.setFormatter(formatter)
-  if debug:
-    logfile_handler.setLevel(logging.DEBUG)
-  else:
-    logfile_handler.setLevel(logging.INFO)
-  root_logger.addHandler(logfile_handler)
-  reopen_handlers.append(logfile_handler)
-
   return compat.partial(_ReopenLogFiles, reopen_handlers)
@@ -694,14 +694,18 @@ def ParseOptions():
                     constants.RELEASE_VERSION)
 
   parser.add_option(cli.DEBUG_OPT)
-  parser.add_option("-A", "--job-age", dest="job_age",
+  parser.add_option("-A", "--job-age", dest="job_age", default=6 * 3600,
                     help="Autoarchive jobs older than this age (default"
-                    " 6 hours)", default=6*3600)
+                    " 6 hours)")
   parser.add_option("--ignore-pause", dest="ignore_pause", default=False,
                     action="store_true", help="Ignore cluster pause setting")
 
   options, args = parser.parse_args()
   options.job_age = cli.ParseTimespec(options.job_age)
-  return options, args
+
+  if args:
+    parser.error("No arguments expected")
+
+  return (options, args)
@rapi.client.UsesRapiClient
@@ -711,11 +715,7 @@ def Main():
 
   """
   global client # pylint: disable-msg=W0603
 
-  options, args = ParseOptions()
-
-  if args: # watcher doesn't take any arguments
-    print >> sys.stderr, ("Usage: %s [-f] " % sys.argv[0])
-    return constants.EXIT_FAILURE
+  (options, _) = ParseOptions()
 
   utils.SetupLogging(constants.LOG_WATCHER, sys.argv[0],
                      debug=options.debug, stderr_logging=options.debug)
......
@@ -161,12 +161,16 @@ class TestCfgupgrade(unittest.TestCase):
   def testRapiUsers(self):
     self.assertFalse(os.path.exists(self.rapi_users_path))
     self.assertFalse(os.path.exists(self.rapi_users_path_pre24))
+    self.assertFalse(os.path.exists(os.path.dirname(self.rapi_users_path)))
 
     utils.WriteFile(self.rapi_users_path_pre24, data="some user\n")
     self._TestSimpleUpgrade(constants.BuildVersion(2, 3, 0), False)
 
+    self.assertTrue(os.path.isdir(os.path.dirname(self.rapi_users_path)))
     self.assert_(os.path.islink(self.rapi_users_path_pre24))
     self.assert_(os.path.isfile(self.rapi_users_path))
+    self.assertEqual(os.readlink(self.rapi_users_path_pre24),
+                     self.rapi_users_path)
+
     for path in [self.rapi_users_path, self.rapi_users_path_pre24]:
       self.assertEqual(utils.ReadFile(path), "some user\n")
@@ -180,6 +184,8 @@ class TestCfgupgrade(unittest.TestCase):
     self.assert_(os.path.islink(self.rapi_users_path_pre24))
     self.assert_(os.path.isfile(self.rapi_users_path))
+    self.assertEqual(os.readlink(self.rapi_users_path_pre24),
+                     self.rapi_users_path)
+
     for path in [self.rapi_users_path, self.rapi_users_path_pre24]:
       self.assertEqual(utils.ReadFile(path), "other user\n")
@@ -187,13 +193,77 @@ class TestCfgupgrade(unittest.TestCase):
     self.assertFalse(os.path.exists(self.rapi_users_path))
     self.assertFalse(os.path.exists(self.rapi_users_path_pre24))
 
+    os.mkdir(os.path.dirname(self.rapi_users_path))
     os.symlink(self.rapi_users_path, self.rapi_users_path_pre24)
-    utils.WriteFile(self.rapi_users_path_pre24, data="hello world\n")
+    utils.WriteFile(self.rapi_users_path, data="hello world\n")
 
     self._TestSimpleUpgrade(constants.BuildVersion(2, 2, 0), False)
 
-    self.assert_(os.path.isfile(self.rapi_users_path))
+    self.assert_(os.path.isfile(self.rapi_users_path) and
+                 not os.path.islink(self.rapi_users_path))
     self.assert_(os.path.islink(self.rapi_users_path_pre24))
+    self.assertEqual(os.readlink(self.rapi_users_path_pre24),
+                     self.rapi_users_path)
+
     for path in [self.rapi_users_path, self.rapi_users_path_pre24]:
       self.assertEqual(utils.ReadFile(path), "hello world\n")
 
+  def testRapiUsersExistingTarget(self):
+    self.assertFalse(os.path.exists(self.rapi_users_path))
+    self.assertFalse(os.path.exists(self.rapi_users_path_pre24))
+
+    os.mkdir(os.path.dirname(self.rapi_users_path))
+    utils.WriteFile(self.rapi_users_path, data="other user\n")
+    utils.WriteFile(self.rapi_users_path_pre24, data="hello world\n")
+
+    self.assertRaises(Exception, self._TestSimpleUpgrade,
+                      constants.BuildVersion(2, 2, 0), False)
+
+    for path in [self.rapi_users_path, self.rapi_users_path_pre24]:
+      self.assert_(os.path.isfile(path) and not os.path.islink(path))
+    self.assertEqual(utils.ReadFile(self.rapi_users_path), "other user\n")
+    self.assertEqual(utils.ReadFile(self.rapi_users_path_pre24),
+                     "hello world\n")
+
+  def testRapiUsersDryRun(self):
+    self.assertFalse(os.path.exists(self.rapi_users_path))
+    self.assertFalse(os.path.exists(self.rapi_users_path_pre24))
+
+    utils.WriteFile(self.rapi_users_path_pre24, data="some user\n")
+    self._TestSimpleUpgrade(constants.BuildVersion(2, 3, 0), True)
+
+    self.assertFalse(os.path.isdir(os.path.dirname(self.rapi_users_path)))
+    self.assertTrue(os.path.isfile(self.rapi_users_path_pre24) and
+                    not os.path.islink(self.rapi_users_path_pre24))
+    self.assertFalse(os.path.exists(self.rapi_users_path))
+
+  def testRapiUsers24AndAboveDryRun(self):
+    self.assertFalse(os.path.exists(self.rapi_users_path))
+    self.assertFalse(os.path.exists(self.rapi_users_path_pre24))
+
+    os.mkdir(os.path.dirname(self.rapi_users_path))
+    utils.WriteFile(self.rapi_users_path, data="other user\n")
+    self._TestSimpleUpgrade(constants.BuildVersion(2, 3, 0), True)
+
+    self.assertTrue(os.path.isfile(self.rapi_users_path) and
+                    not os.path.islink(self.rapi_users_path))
+    self.assertFalse(os.path.exists(self.rapi_users_path_pre24))
+    self.assertEqual(utils.ReadFile(self.rapi_users_path), "other user\n")
+
+  def testRapiUsersExistingSymlinkDryRun(self):
+    self.assertFalse(os.path.exists(self.rapi_users_path))
+    self.assertFalse(os.path.exists(self.rapi_users_path_pre24))
+
+    os.mkdir(os.path.dirname(self.rapi_users_path))
+    os.symlink(self.rapi_users_path, self.rapi_users_path_pre24)
+    utils.WriteFile(self.rapi_users_path, data="hello world\n")
+
+    self._TestSimpleUpgrade(constants.BuildVersion(2, 2, 0), True)
+
+    self.assertTrue(os.path.islink(self.rapi_users_path_pre24))
+    self.assertTrue(os.path.isfile(self.rapi_users_path) and
+                    not os.path.islink(self.rapi_users_path))
+    self.assertEqual(os.readlink(self.rapi_users_path_pre24),
+                     self.rapi_users_path)
+    for path in [self.rapi_users_path, self.rapi_users_path_pre24]:
+      self.assertEqual(utils.ReadFile(path), "hello world\n")
......
@@ -28,6 +28,7 @@ import time
 import Queue
 import threading
 import random
+import gc
 import itertools
 
 from ganeti import constants
@@ -1914,6 +1915,80 @@ class TestLockMonitor(_ThreadedTestCase):
     self.assertEqual(len(self.lm._locks), 1)
 
+  def testDeleteAndRecreate(self):
+    lname = "TestLock101923193"
+
+    # Create some locks with the same name and keep all references
+    locks = [locking.SharedLock(lname, monitor=self.lm)
+             for _ in range(5)]
+
+    self.assertEqual(len(self.lm._locks), len(locks))
+
+    result = self.lm.QueryLocks(["name", "mode", "owner"])
+    self.assertEqual(objects.QueryResponse.FromDict(result).data,
+                     [[(constants.RS_NORMAL, lname),
+                       (constants.RS_NORMAL, None),
+                       (constants.RS_NORMAL, None)]] * 5)
+
+    locks[2].delete()
+
+    # Check information order
+    result = self.lm.QueryLocks(["name", "mode", "owner"])
+    self.assertEqual(objects.QueryResponse.FromDict(result).data,
+                     [[(constants.RS_NORMAL, lname),
+                       (constants.RS_NORMAL, None),
+                       (constants.RS_NORMAL, None)]] * 2 +
+                     [[(constants.RS_NORMAL, lname),
+                       (constants.RS_NORMAL, "deleted"),
+                       (constants.RS_NORMAL, None)]] +
+                     [[(constants.RS_NORMAL, lname),
+                       (constants.RS_NORMAL, None),
+                       (constants.RS_NORMAL, None)]] * 2)
+
+    locks[1].acquire(shared=0)