Commit fb87cbeb authored by Michael Hanselmann's avatar Michael Hanselmann
Browse files

Merge branch 'devel-2.3' into devel-2.4



* devel-2.3:
  Wait for master to become available on initialization
  Start all daemons on cluster initialization
  Clarify job processing order in admin guide
  Improve option descriptions
  Remove two unused variables
  Fix LUOSDiagnose and non-vm_capable nodes
  Rephrasing two error messages for auto promotion
  storage: Check that mapper is either used or None
  Fix bug in “gnt-node list-storage”
  Improve import/export timeout settings
  Increase remote import/export timeout

Conflicts:
	lib/constants.py: Trivial
	lib/objects.py: Trivial
	qa/qa_node.py: Trivial
Signed-off-by: default avatarMichael Hanselmann <hansmi@google.com>
Reviewed-by: default avatarIustin Pop <iustin@google.com>
parents 58a59652 3b6b6129
......@@ -181,7 +181,13 @@ basic cluster operation (e.g. starting an instance) is represented
internall by Ganeti as an *OpCode* (abbreviation from operation
code). These OpCodes are executed as part of a *Job*. The OpCodes in a
single Job are processed serially by Ganeti, but different Jobs will be
processed (depending on resource availability) in parallel.
processed (depending on resource availability) in parallel. They will
not be executed in the submission order, but depending on resource
availability, locks and (starting with Ganeti 2.3) priority. An earlier
job may have to wait for a lock while a newer job doesn't need any locks
and can be executed right away. Operations requiring a certain order
need to be submitted as a single job, or the client must submit one job
at a time and wait for it to finish before continuing.
For example, shutting down the entire cluster can be done by running the
command ``gnt-instance shutdown --all``, which will submit for each
......
......@@ -2982,6 +2982,15 @@ def StartImportExportDaemon(mode, opts, host, port, instance, ieio, ieioargs):
if cmd_suffix:
cmd.append("--cmd-suffix=%s" % cmd_suffix)
if mode == constants.IEM_EXPORT:
# Retry connection a few times when connecting to remote peer
cmd.append("--connect-retries=%s" % constants.RIE_CONNECT_RETRIES)
cmd.append("--connect-timeout=%s" % constants.RIE_CONNECT_ATTEMPT_TIMEOUT)
elif opts.connect_timeout is not None:
assert mode == constants.IEM_IMPORT
# Overall timeout for establishing connection while listening
cmd.append("--connect-timeout=%s" % opts.connect_timeout)
logfile = _InstanceLogName(prefix, instance.os, instance.name)
# TODO: Once _InstanceLogName uses tempfile.mkstemp, StartDaemon has
......
......@@ -42,10 +42,15 @@ from ganeti import hypervisor
from ganeti import bdev
from ganeti import netutils
from ganeti import backend
from ganeti import luxi
# ec_id for InitConfig's temporary reservation manager
_INITCONF_ECID = "initconfig-ecid"
#: After how many seconds daemon must be responsive
_DAEMON_READY_TIMEOUT = 10.0
def _InitSSHSetup():
"""Setup the SSH configuration for the cluster.
......@@ -181,10 +186,30 @@ def _WaitForNodeDaemon(node_name):
raise utils.RetryAgain()
try:
utils.Retry(_CheckNodeDaemon, 1.0, 10.0)
utils.Retry(_CheckNodeDaemon, 1.0, _DAEMON_READY_TIMEOUT)
except utils.RetryTimeout:
raise errors.OpExecError("Node daemon on %s didn't answer queries within"
" 10 seconds" % node_name)
" %s seconds" % (node_name, _DAEMON_READY_TIMEOUT))
def _WaitForMasterDaemon():
"""Wait for master daemon to become responsive.
"""
def _CheckMasterDaemon():
try:
cl = luxi.Client()
(cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
except Exception:
raise utils.RetryAgain()
logging.debug("Received cluster name %s from master", cluster_name)
try:
utils.Retry(_CheckMasterDaemon, 1.0, _DAEMON_READY_TIMEOUT)
except utils.RetryTimeout:
raise errors.OpExecError("Master daemon didn't answer queries within"
" %s seconds" % _DAEMON_READY_TIMEOUT)
def _InitFileStorage(file_storage_dir):
......@@ -219,8 +244,7 @@ def _InitFileStorage(file_storage_dir):
return file_storage_dir
#pylint: disable-msg=R0913
def InitCluster(cluster_name, mac_prefix,
def InitCluster(cluster_name, mac_prefix, # pylint: disable-msg=R0913
master_netdev, file_storage_dir, candidate_pool_size,
secondary_ip=None, vg_name=None, beparams=None,
nicparams=None, ndparams=None, hvparams=None,
......@@ -419,10 +443,14 @@ def InitCluster(cluster_name, mac_prefix,
# set up the inter-node password and certificate
_InitGanetiServerSetup(hostname.name)
# start the master ip
# TODO: Review rpc call from bootstrap
# TODO: Warn on failed start master
rpc.RpcRunner.call_node_start_master(hostname.name, True, False)
logging.debug("Starting daemons")
result = utils.RunCmd([constants.DAEMON_UTIL, "start-all"])
if result.failed:
raise errors.OpExecError("Could not start daemons, command %s"
" had exitcode %s and error %s" %
(result.cmd, result.exit_code, result.output))
_WaitForMasterDaemon()
def InitConfig(version, cluster_config, master_node_config,
......
......@@ -885,18 +885,20 @@ NOSSH_KEYCHECK_OPT = cli_option("--no-ssh-key-check", dest="ssh_key_check",
default=True, action="store_false",
help="Disable SSH key fingerprint checking")
MC_OPT = cli_option("-C", "--master-candidate", dest="master_candidate",
type="bool", default=None, metavar=_YORNO,
help="Set the master_candidate flag on the node")
OFFLINE_OPT = cli_option("-O", "--offline", dest="offline", metavar=_YORNO,
type="bool", default=None,
help="Set the offline flag on the node")
help=("Set the offline flag on the node"
" (cluster does not communicate with offline"
" nodes)"))
DRAINED_OPT = cli_option("-D", "--drained", dest="drained", metavar=_YORNO,
type="bool", default=None,
help="Set the drained flag on the node")
help=("Set the drained flag on the node"
" (excluded from allocation operations)"))
CAPAB_MASTER_OPT = cli_option("--master-capable", dest="master_capable",
type="bool", default=None, metavar=_YORNO,
......@@ -929,8 +931,9 @@ CP_SIZE_OPT = cli_option("-C", "--candidate-pool-size", default=None,
help="Set the candidate pool size")
VG_NAME_OPT = cli_option("--vg-name", dest="vg_name",
help="Enables LVM and specifies the volume group"
" name (cluster-wide) for disk allocation [xenvg]",
help=("Enables LVM and specifies the volume group"
" name (cluster-wide) for disk allocation"
" [%s]" % constants.DEFAULT_VG),
metavar="VG", default=None)
YES_DOIT_OPT = cli_option("--yes-do-it", dest="yes_do_it",
......
......@@ -32,9 +32,6 @@ from ganeti import constants
from ganeti import errors
_VALUE_TRUE = "true"
def PrintExportList(opts, args):
"""Prints a list of all the exported system images.
......
......@@ -57,8 +57,6 @@ _SHUTDOWN_NODES_TAGS_MODES = (
_SHUTDOWN_NODES_SEC_BY_TAGS)
_VALUE_TRUE = "true"
#: default list of options for L{ListInstances}
_LIST_DEF_FIELDS = [
"name", "hypervisor", "os", "pnode", "status", "oper_ram",
......
......@@ -3429,7 +3429,9 @@ class LUOsDiagnose(NoHooksLU):
"""Compute the list of OSes.
"""
valid_nodes = [node for node in self.cfg.GetOnlineNodeList()]
valid_nodes = [node.name
for node in self.cfg.GetAllNodesInfo().values()
if not node.offline and node.vm_capable]
node_data = self.rpc.call_os_diagnose(valid_nodes)
pol = self._DiagnoseByOS(node_data)
output = []
......@@ -4379,15 +4381,15 @@ class LUNodeSetParams(LogicalUnit):
errors.ECODE_STATE)
if node.master_candidate and self.might_demote and not self.lock_all:
assert not self.op.auto_promote, "auto-promote set but lock_all not"
assert not self.op.auto_promote, "auto_promote set but lock_all not"
# check if after removing the current node, we're missing master
# candidates
(mc_remaining, mc_should, _) = \
self.cfg.GetMasterCandidateStats(exceptions=[node.name])
if mc_remaining < mc_should:
raise errors.OpPrereqError("Not enough master candidates, please"
" pass auto_promote to allow promotion",
errors.ECODE_STATE)
" pass auto promote option to allow"
" promotion", errors.ECODE_STATE)
self.old_flags = old_flags = (node.master_candidate,
node.drained, node.offline)
......
......@@ -437,8 +437,15 @@ RIE_HANDSHAKE = "Hi, I'm Ganeti"
# Remote import/export certificate validity in seconds
RIE_CERT_VALIDITY = 24 * 60 * 60
# Remote import/export connect timeout for socat
RIE_CONNECT_TIMEOUT = 60
# Overall timeout for establishing connection
RIE_CONNECT_TIMEOUT = 180
# Export only: how long to wait per connection attempt (seconds)
RIE_CONNECT_ATTEMPT_TIMEOUT = 20
# Export only: number of attempts to connect
RIE_CONNECT_RETRIES = 10
#: Give child process up to 5 seconds to exit after sending a signal
CHILD_LINGER_TIMEOUT = 5.0
......
......@@ -161,12 +161,16 @@ class _DiskImportExportBase(object):
self._lu = lu
self.node_name = node_name
self._opts = opts
self._opts = opts.Copy()
self._instance = instance
self._timeouts = timeouts
self._cbs = cbs
self._private = private
# Set master daemon's timeout in options for import/export daemon
assert self._opts.connect_timeout is None
self._opts.connect_timeout = timeouts.connect
# Parent loop
self._loop = None
......
......@@ -1362,6 +1362,7 @@ class ImportExportOptions(ConfigObject):
@ivar compress: Compression method (one of L{constants.IEC_ALL})
@ivar magic: Used to ensure the connection goes to the right disk
@ivar ipv6: Whether to use IPv6
@ivar connect_timeout: Number of seconds for establishing connection
"""
__slots__ = [
......@@ -1370,6 +1371,7 @@ class ImportExportOptions(ConfigObject):
"compress",
"magic",
"ipv6",
"connect_timeout",
]
......
#
#
# Copyright (C) 2009 Google Inc.
# Copyright (C) 2009, 2011 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
......@@ -258,6 +258,8 @@ class _LvmBase(_Base): # pylint: disable-msg=W0223
# we got a function, call it with all the declared fields
val = mapper(*values) # pylint: disable-msg=W0142
elif len(values) == 1:
assert mapper is None, ("Invalid mapper value (neither callable"
" nor None) for one-element fields")
# we don't have a function, but we had a single field
# declared, pass it unchanged
val = values[0]
......@@ -332,18 +334,23 @@ class _LvmBase(_Base): # pylint: disable-msg=W0223
yield fields
def _LvmPvGetAllocatable(attr):
"""Determines whether LVM PV is allocatable.
@rtype: bool
"""
if attr:
return (attr[0] == "a")
else:
logging.warning("Invalid PV attribute: %r", attr)
return False
class LvmPvStorage(_LvmBase): # pylint: disable-msg=W0223
"""LVM Physical Volume storage unit.
"""
@staticmethod
def _GetAllocatable(attr):
if attr:
return (attr[0] == "a")
else:
logging.warning("Invalid PV attribute: %r", attr)
return False
LIST_COMMAND = "pvs"
# Make sure to update constants.VALID_STORAGE_FIELDS when changing field
......@@ -353,7 +360,7 @@ class LvmPvStorage(_LvmBase): # pylint: disable-msg=W0223
(constants.SF_SIZE, ["pv_size"], _ParseSize),
(constants.SF_USED, ["pv_used"], _ParseSize),
(constants.SF_FREE, ["pv_free"], _ParseSize),
(constants.SF_ALLOCATABLE, ["pv_attr"], _GetAllocatable),
(constants.SF_ALLOCATABLE, ["pv_attr"], _LvmPvGetAllocatable),
]
def _SetAllocatable(self, name, allocatable):
......
......@@ -32,7 +32,7 @@ import qa_config
import qa_error
import qa_utils
from qa_utils import AssertCommand
from qa_utils import AssertCommand, AssertEqual
def _NodeAdd(node, readd=False):
......@@ -145,6 +145,19 @@ def TestNodeStorage():
AssertCommand(["gnt-node", "modify-storage", "--allocatable", i,
node_name, storage_type, st_name], fail=fail)
# Verify list output
cmd = ["gnt-node", "list-storage", "--storage-type", storage_type,
"--output=name,allocatable", "--separator=|",
"--no-headers", node_name]
listout = qa_utils.GetCommandOutput(master["primary"],
utils.ShellQuoteArgs(cmd))
for line in listout.splitlines():
(vfy_name, vfy_allocatable) = line.split("|")
if vfy_name == st_name and not fail:
AssertEqual(vfy_allocatable, i[0].upper())
else:
AssertEqual(vfy_allocatable, st_allocatable)
# Test repair functionality
fail = (constants.SO_FIX_CONSISTENCY not in
constants.VALID_STORAGE_OPERATIONS.get(storage_type, []))
......@@ -229,7 +242,7 @@ def _AssertOobCall(verify_path, expected_args):
verify_output_cmd = utils.ShellQuoteArgs(["cat", verify_path])
output = qa_utils.GetCommandOutput(master["primary"], verify_output_cmd)
qa_utils.AssertEqual(expected_args, output.strip())
AssertEqual(expected_args, output.strip())
def TestOutOfBand():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment