Commit 3398bff1 authored by Andrea Spadaccini's avatar Andrea Spadaccini
Browse files

Merge branch 'devel-2.5'



* devel-2.5:
  Use --yes to deactivate master ip in cluster merge
  Use deactivate-master-ip in cluster-merge
  Add gnt-cluster commands to toggle the master IP
  Split starting and stopping master IP and daemons
  listrunner: Don't pass arguments if there are none
  ssh: Quote strings in error message
  utils.log: Write error messages to stderr
  Add signal handling doc to hbal man page
  Migration: warn the user about hv version mismatch
  Fix handling of cluster verify hooks
  Redistribute the RAPI certificate
  QA: Add tests for instance start/stop via RAPI
  RAPI: Fix wrong check on instance shutdown
  baserlib: Accept empty body in FillOpcode

Conflicts:
	lib/backend.py
   - no real conflicts
	lib/constants.py
   - preserve both changes
	lib/rapi/rlib2.py
   - keep master
	lib/rpc.py
   - no real conflicts
	tools/cluster-merge
   - keep devel-2.5
Signed-off-by: Andrea Spadaccini <spadaccio@google.com>
Reviewed-by: Guido Trotter <ultrotter@google.com>
parents e87e5afb cea3abbd
......@@ -440,6 +440,7 @@ def GetNodeInfo(vgname, hypervisor_type):
- memory_dom0 is the memory allocated for domain0 in MiB
- memory_free is the currently available (free) ram in MiB
- memory_total is the total number of ram in MiB
- hv_version: the hypervisor version, if available
"""
outputarray = {}
......
......@@ -2973,10 +2973,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
self._ErrorIf(test, self.ENODEHOOKS, node_name,
"Communication failure in hooks execution: %s", msg)
if res.offline or msg:
# No need to investigate payload if node is offline or gave an error.
# override manually lu_result here as _ErrorIf only
# overrides self.bad
lu_result = 1
# No need to investigate payload if node is offline or gave
# an error.
continue
for script, hkr, output in res.payload:
test = hkr == constants.HKR_FAIL
......@@ -2985,7 +2983,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
if test:
output = self._HOOKS_INDENT_RE.sub(" ", output)
feedback_fn("%s" % output)
lu_result = 0
lu_result = False
return lu_result
......@@ -3697,6 +3695,9 @@ def _ComputeAncillaryFiles(cluster, redist):
if not redist:
files_all.update(constants.ALL_CERT_FILES)
files_all.update(ssconf.SimpleStore().GetFileList())
else:
# we need to ship at least the RAPI certificate
files_all.add(constants.RAPI_CERT_FILE)
if cluster.modify_etc_hosts:
files_all.add(constants.ETC_HOSTS)
......@@ -7414,6 +7415,21 @@ class TLMigrateInstance(Tasklet):
target_node = self.target_node
source_node = self.source_node
# Check for hypervisor version mismatch and warn the user.
nodeinfo = self.rpc.call_node_info([source_node, target_node],
None, self.instance.hypervisor)
src_info = nodeinfo[source_node]
dst_info = nodeinfo[target_node]
if ((constants.HV_NODEINFO_KEY_VERSION in src_info.payload) and
(constants.HV_NODEINFO_KEY_VERSION in dst_info.payload)):
src_version = src_info.payload[constants.HV_NODEINFO_KEY_VERSION]
dst_version = dst_info.payload[constants.HV_NODEINFO_KEY_VERSION]
if src_version != dst_version:
self.feedback_fn("* warning: hypervisor version mismatch between"
" source (%s) and target (%s) node" %
(src_version, dst_version))
self.feedback_fn("* checking disk consistency between source and target")
for dev in instance.disks:
if not _CheckDiskConsistency(self.lu, dev, target_node, False):
......
......@@ -818,6 +818,9 @@ HV_MIGRATION_FAILED_STATUSES = frozenset([
# KVM-specific statuses
HV_KVM_MIGRATION_VALID_STATUSES = HV_MIGRATION_VALID_STATUSES
# Node info keys
HV_NODEINFO_KEY_VERSION = "hv_version"
# Backend parameter names
BE_MEMORY = "memory"
BE_VCPUS = "vcpus"
......
......@@ -1705,15 +1705,18 @@ class KVMHypervisor(hv_base.BaseHypervisor):
def GetNodeInfo(self):
"""Return information about the node.
This is just a wrapper over the base GetLinuxNodeInfo method.
@return: a dict with the following keys (values in MiB):
- memory_total: the total memory size on the node
- memory_free: the available memory on the node for instances
- memory_dom0: the memory used by the node itself, if available
- hv_version: the hypervisor version in the form (major, minor,
revision)
"""
return self.GetLinuxNodeInfo()
result = self.GetLinuxNodeInfo()
_, v_major, v_min, v_rev = self._GetKVMVersion()
result[constants.HV_NODEINFO_KEY_VERSION] = (v_major, v_min, v_rev)
return result
@classmethod
def GetInstanceConsole(cls, instance, hvparams, beparams):
......
......@@ -289,6 +289,7 @@ class XenHypervisor(hv_base.BaseHypervisor):
- nr_cpus: total number of CPUs
- nr_nodes: in a NUMA system, the number of domains
- nr_sockets: the number of physical CPU sockets in the node
- hv_version: the hypervisor version in the form (major, minor)
"""
# note: in xen 3, memory has changed to total_memory
......@@ -301,6 +302,7 @@ class XenHypervisor(hv_base.BaseHypervisor):
xmoutput = result.stdout.splitlines()
result = {}
cores_per_socket = threads_per_core = nr_cpus = None
xen_major, xen_minor = None, None
for line in xmoutput:
splitfields = line.split(":", 1)
......@@ -319,6 +321,10 @@ class XenHypervisor(hv_base.BaseHypervisor):
cores_per_socket = int(val)
elif key == "threads_per_core":
threads_per_core = int(val)
elif key == "xen_major":
xen_major = int(val)
elif key == "xen_minor":
xen_minor = int(val)
if (cores_per_socket is not None and
threads_per_core is not None and nr_cpus is not None):
......@@ -328,6 +334,9 @@ class XenHypervisor(hv_base.BaseHypervisor):
if dom0_info is not None:
result["memory_dom0"] = dom0_info[2]
if not (xen_major is None or xen_minor is None):
result[constants.HV_NODEINFO_KEY_VERSION] = (xen_major, xen_minor)
return result
@classmethod
......
......@@ -229,8 +229,8 @@ class SshRunner:
result = utils.RunCmd(command)
if result.failed:
logging.error("Copy to node %s failed (%s) error %s,"
" command was %s",
logging.error("Copy to node %s failed (%s) error '%s',"
" command was '%s'",
node, result.fail_reason, result.output, result.cmd)
return not result.failed
......
......@@ -230,7 +230,7 @@ def SetupLogging(logfile, program, debug=0, stderr_logging=False,
if debug:
stderr_handler.setLevel(logging.NOTSET)
else:
stderr_handler.setLevel(logging.CRITICAL)
stderr_handler.setLevel(logging.ERROR)
root_logger.addHandler(stderr_handler)
if syslog in (constants.SYSLOG_YES, constants.SYSLOG_ONLY):
......
......@@ -359,6 +359,9 @@ The options that can be passed to the program are as follows:
jobset will be executed in parallel. The jobsets themselves are
executed serially.
The execution of the job series can be interrupted, see below for
signal handling.
-l *N*, --max-length=*N*
Restrict the solution to this length. This can be used for example
to automate the execution of the balancing.
......@@ -396,25 +399,45 @@ The options that can be passed to the program are as follows:
-V, --version
Just show the program version and exit.
SIGNAL HANDLING
---------------
When executing jobs via LUXI (using the ``-X`` option), normally hbal
will execute all jobs until either one errors out or all the jobs finish
successfully.
Since balancing can take a long time, it is possible to stop hbal early
in two ways:
- by sending a ``SIGINT`` (``^C``), hbal will register the termination
request, and will wait until the currently submitted jobs finish, at
which point it will exit (with exit code 1)
- by sending a ``SIGTERM``, hbal will immediately exit (with exit code
2); it is the responsibility of the user to follow up with Ganeti on
the result of the currently-executing jobs
Note that in any situation, it's perfectly safe to kill hbal, either via
the above signals or via any other signal (e.g. ``SIGQUIT``,
``SIGKILL``), since the jobs themselves are processed by Ganeti whereas
hbal (after submission) only watches their progression. In this case,
the user will again have to query Ganeti for job results.
EXIT STATUS
-----------
The exit status of the command will be zero, unless for some reason
the algorithm fatally failed (e.g. wrong node or instance data), or
(in case of job execution) any job has failed.
The exit status of the command will be zero, unless for some reason the
algorithm fatally failed (e.g. wrong node or instance data), or (in case
of job execution) either one of the jobs has failed or the balancing was
interrupted early.
BUGS
----
The program does not check its input data for consistency, and aborts
with cryptic errors messages in this case.
The program does not check all its input data for consistency, and
sometimes aborts with cryptic error messages when given invalid data.
The algorithm is not perfect.
The output format is not easily scriptable, and the program should
feed moves directly into Ganeti (either via RAPI or via a gnt-debug
input file).
EXAMPLE
-------
......
......@@ -225,6 +225,12 @@ def RunCommonInstanceTests(instance):
qa_rapi.TestRapiStoppedInstanceConsole, instance)
RunTestIf("instance-shutdown", qa_instance.TestInstanceStartup, instance)
# Test shutdown/start via RAPI
RunTestIf(["instance-shutdown", "rapi"],
qa_rapi.TestRapiInstanceShutdown, instance)
RunTestIf(["instance-shutdown", "rapi"],
qa_rapi.TestRapiInstanceStartup, instance)
RunTestIf("instance-list", qa_instance.TestInstanceList)
RunTestIf("instance-info", qa_instance.TestInstanceInfo, instance)
......
......@@ -598,6 +598,16 @@ def TestRapiInstanceFailover(instance):
_WaitForRapiJob(_rapi_client.FailoverInstance(instance["name"]))
def TestRapiInstanceShutdown(instance):
  """Shut an instance down through RAPI and wait for the job to finish.

  @param instance: mapping describing the instance; only its C{"name"}
      entry is read here

  """
  # Submit the shutdown request first, then block on the returned job.
  shutdown_job = _rapi_client.ShutdownInstance(instance["name"])
  _WaitForRapiJob(shutdown_job)
def TestRapiInstanceStartup(instance):
  """Start an instance through RAPI and wait for the job to finish.

  @param instance: mapping describing the instance; only its C{"name"}
      entry is read here

  """
  # Submit the startup request first, then block on the returned job.
  startup_job = _rapi_client.StartupInstance(instance["name"])
  _WaitForRapiJob(startup_job)
def TestRapiInstanceRename(rename_source, rename_target):
"""Test renaming instance via RAPI"""
_WaitForRapiJob(_rapi_client.RenameInstance(rename_source, rename_target))
......
......@@ -308,7 +308,7 @@ class Merger(object):
"""
for data in self.merger_data:
result = self._RunCmd(data.master_node,
"gnt-cluster deactivate-master-ip")
"gnt-cluster deactivate-master-ip --yes")
if result.failed:
raise errors.RemoteError("Unable to remove master IP on %s."
......
......@@ -376,8 +376,10 @@ def HostWorker(logdir, username, password, use_agent, hostname,
print " %s: uploading files" % hostname
upload_dir = UploadFiles(connection, executable,
filelist, logfile)
command = ("cd %s && ./%s %s" %
(upload_dir, os.path.basename(executable), exec_args))
command = ("cd %s && ./%s" %
(upload_dir, os.path.basename(executable)))
if exec_args:
command += " %s" % exec_args
print " %s: executing remote command" % hostname
cmd_result = RunRemoteCommand(connection, command, logfile)
if cmd_result is True:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment