diff --git a/.gitignore b/.gitignore index aac33b9075ce9126ab36fbd3f34a4edcf29dab3c..e4bd31eb1f2f984c14e9bc62a8e25da77b1efcab 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ # global ignores *.py[co] +*.swp # / /Makefile diff --git a/daemons/ganeti-masterd b/daemons/ganeti-masterd index bf7146067e89dcc81e468e07b7d48de636aa2838..8ae1d4cf01c0c6c7d251743b218030ef9cb5f43f 100755 --- a/daemons/ganeti-masterd +++ b/daemons/ganeti-masterd @@ -473,7 +473,6 @@ def main(): """Main function""" options, args = ParseOptions() - utils.debug = options.debug utils.no_fork = True if options.fork: diff --git a/daemons/ganeti-noded b/daemons/ganeti-noded index 876321d4ba82ba9e09faf52a5693f3b69d9016da..89dba035c3c12277a7899d3d75dcdb55d0603281 100755 --- a/daemons/ganeti-noded +++ b/daemons/ganeti-noded @@ -764,7 +764,6 @@ def main(): global queue_lock options, args = ParseOptions() - utils.debug = options.debug if options.fork: utils.CloseFDs() @@ -772,13 +771,9 @@ def main(): for fname in (constants.SSL_CERT_FILE,): if not os.path.isfile(fname): print "config %s not there, will not run." 
% fname - sys.exit(5) + sys.exit(constants.EXIT_NOTCLUSTER) - try: - port = utils.GetNodeDaemonPort() - except errors.ConfigurationError, err: - print "Cluster configuration incomplete: '%s'" % str(err) - sys.exit(5) + port = utils.GetNodeDaemonPort() dirs = [(val, constants.RUN_DIRS_MODE) for val in constants.SUB_RUN_DIRS] dirs.append((constants.LOG_OS_DIR, 0750)) diff --git a/daemons/ganeti-rapi b/daemons/ganeti-rapi index 7b9710a9bf46930beeda8342d10f32a263c96939..2288f17f4d20e2b858d5eedf4f676dbc65507c00 100755 --- a/daemons/ganeti-rapi +++ b/daemons/ganeti-rapi @@ -206,17 +206,20 @@ def ParseOptions(): parser.add_option("-f", "--foreground", dest="fork", help="Don't detach from the current terminal", default=True, action="store_false") + parser.add_option("-b", "--bind", dest="bind_address", + help="Bind address", + default="", metavar="ADDRESS") options, args = parser.parse_args() if len(args) != 0: print >> sys.stderr, "Usage: %s [-d] [-p port]" % sys.argv[0] - sys.exit(1) + sys.exit(constants.EXIT_FAILURE) if options.ssl and not (options.ssl_cert and options.ssl_key): print >> sys.stderr, ("For secure mode please provide " - "--ssl-key and --ssl-cert arguments") - sys.exit(1) + "--ssl-key and --ssl-cert arguments") + sys.exit(constants.EXIT_FAILURE) return options, args @@ -237,7 +240,7 @@ def main(): ssl_cert_path=options.ssl_cert) except Exception, err: sys.stderr.write("Can't load the SSL certificate/key: %s\n" % (err,)) - sys.exit(1) + sys.exit(constants.EXIT_FAILURE) else: ssl_params = None @@ -252,7 +255,7 @@ def main(): utils.WritePidFile(constants.RAPI_PID) try: mainloop = daemon.Mainloop() - server = RemoteApiHttpServer(mainloop, "", options.port, + server = RemoteApiHttpServer(mainloop, options.bind_address, options.port, ssl_params=ssl_params, ssl_verify_peer=False, request_executor_class= JsonErrorRequestExecutor) diff --git a/doc/examples/hooks/ethers b/doc/examples/hooks/ethers new file mode 100755 index 
0000000000000000000000000000000000000000..35a66b0f8179c5ff10b5910a4e79423245f0e206 --- /dev/null +++ b/doc/examples/hooks/ethers @@ -0,0 +1,85 @@ +#!/bin/bash + +# Copyright (C) 2009 Google Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301, USA. + +# This is an example ganeti hook that writes the instance mac addresses in the +# node's /etc/ethers file. It will pick up the first nic connected to the +# TARGET_BRIDGE bridge, and write it down with the syntax "MAC INSTANCE_NAME". + +# The hook will also send a HUP signal to the daemon whose PID is in +# DAEMON_PID_FILE, so that it can load the new /etc/ethers file and use it. +# This has been tested in conjunction with dnsmasq's dhcp implementation. + +# It will also remove any other occurrences for the same instance in the +# aforementioned file. This hook supports the "instance-add", "instance-modify", +# "instance-remove", and "instance-mirror-replace" ganeti post hook paths. To +# install it add a symlink from those hooks' directories to where this file is +# installed (with a mode which permits execution). + +# TARGET_BRIDGE: We'll only add the first nic which gets connected to this +# bridge to /etc/ethers. 
+TARGET_BRIDGE="br0" +DAEMON_PID_FILE="/var/run/dnsmasq.pid" +LOCKFILE="/var/lock/ganeti_ethers.lock" + +hooks_path=$GANETI_HOOKS_PATH +[ -n "$hooks_path" ] || exit 1 +instance=$GANETI_INSTANCE_NAME +[ -n "$instance" ] || exit 1 +nic_count=$GANETI_INSTANCE_NIC_COUNT + +acquire_lockfile() { + if ! ( set -o noclobber; echo "$$" > $LOCKFILE) 2> /dev/null; then + logger -s "Cannot acquire lockfile for ethers update" + exit 1 + fi + trap "rm -f $LOCKFILE" EXIT +} + +update_ethers_from_new() { + chmod 644 /etc/ethers.new + mv /etc/ethers.new /etc/ethers + [ -f "$DAEMON_PID_FILE" ] && kill -HUP $(< $DAEMON_PID_FILE) +} + +if [ "$hooks_path" = "instance-add" -o \ + "$hooks_path" = "instance-modify" -o \ + "$hooks_path" = "instance-mirror-replace" ] +then + for i in $(seq 0 $((nic_count - 1)) ); do + bridge_var="GANETI_INSTANCE_NIC${i}_BRIDGE" + bridge=${!bridge_var} + if [ -n "$bridge" -a "$bridge" = "$TARGET_BRIDGE" ]; then + mac_var="GANETI_INSTANCE_NIC${i}_MAC" + mac=${!mac_var} + acquire_lockfile + cat /etc/ethers | awk -- "! /^([[:xdigit:]:]*)[[:blank:]]+$instance\>/; + END {print \"$mac\t$instance\"}" > /etc/ethers.new + update_ethers_from_new + break + fi + done +fi +if [ "$hooks_path" = "instance-remove" -o \ + \( "$hooks_path" = "instance-modify" -a "$nic_count" -eq 0 \) ]; then + acquire_lockfile + cat /etc/ethers | awk -- "! 
/^([[:xdigit:]:]*)[[:blank:]]+$instance\>/" \ + > /etc/ethers.new + update_ethers_from_new +fi + diff --git a/lib/cli.py b/lib/cli.py index b08506a42309a9ac11384b1233575c8854e981b2..fd329695f742895b008f9a309ed2e5a7af0d78e9 100644 --- a/lib/cli.py +++ b/lib/cli.py @@ -766,8 +766,6 @@ def GenericMain(commands, override=None, aliases=None): utils.SetupLogging(constants.LOG_COMMANDS, debug=options.debug, stderr_logging=True, program=binary) - utils.debug = options.debug - if old_cmdline: logging.info("run with arguments '%s'", old_cmdline) else: diff --git a/lib/constants.py b/lib/constants.py index ca1427538b2f108570438ef3de8a81ed19f586c1..787508bc2cd00ca39f2ad205b981eac1755b4877 100644 --- a/lib/constants.py +++ b/lib/constants.py @@ -228,6 +228,7 @@ DDM_REMOVE = 'remove' # common exit codes EXIT_SUCCESS = 0 EXIT_FAILURE = 1 +EXIT_NOTCLUSTER = 5 EXIT_NOTMASTER = 11 EXIT_NODESETUP_ERROR = 12 EXIT_CONFIRMATION = 13 # need user confirmation @@ -487,8 +488,6 @@ OPS_FINALIZED = frozenset([OP_STATUS_CANCELED, ELOG_MESSAGE = "message" ELOG_PROGRESS = "progress" -# Temporary RAPI constants until we have cluster parameters -RAPI_ENABLE = True RAPI_PORT = 5080 # max dynamic devices diff --git a/qa/qa_rapi.py b/qa/qa_rapi.py index 7c668c5184436215900922150357364a6d2b50ff..76b96f10b74f2894dff97599006a837ebb4f131f 100644 --- a/qa/qa_rapi.py +++ b/qa/qa_rapi.py @@ -58,19 +58,7 @@ def Enabled(): """Return whether remote API tests should be run. """ - return constants.RAPI_ENABLE and qa_config.TestEnabled('rapi') - - -def PrintRemoteAPIWarning(): - """Print warning if remote API is not enabled. - - """ - if constants.RAPI_ENABLE or not qa_config.TestEnabled('rapi'): - return - msg = ("Remote API is not enabled in this Ganeti build. Please run" - " `configure [...] 
--enable-rapi'.") - print - print qa_utils.FormatWarning(msg) + return qa_config.TestEnabled('rapi') def _DoTests(uris): diff --git a/tools/burnin b/tools/burnin index d0ef8775385b58581ceee1e5bd59ddcde93f9280..b38680fc4b53ddcc809a98a7ee3253afd9c023d2 100755 --- a/tools/burnin +++ b/tools/burnin @@ -41,11 +41,16 @@ from ganeti import utils USAGE = ("\tburnin -o OS_NAME [options...] instance_name ...") +MAX_RETRIES = 3 class InstanceDown(Exception): """The checked instance was not up""" +class BurninFailure(Exception): + """Failure detected during burning""" + + def Usage(): """Shows program usage information and exits the program.""" @@ -106,6 +111,9 @@ class Burner(object): self.to_rem = [] self.queued_ops = [] self.opts = None + self.queue_retry = False + self.disk_count = self.disk_growth = self.disk_size = None + self.hvp = self.bep = None self.ParseOptions() self.cl = cli.GetClient() self.GetState() @@ -125,7 +133,39 @@ class Burner(object): if self.opts.verbose: Log(msg, indent=3) - def ExecOp(self, *ops): + def MaybeRetry(self, retry_count, msg, fn, *args): + """Possibly retry a given function execution. 
+ + @type retry_count: int + @param retry_count: retry counter: + - 0: non-retryable action + - 1: last retry for a retryable action + - MAX_RETRIES: original try for a retryable action + @type msg: str + @param msg: the kind of the operation + @type fn: callable + @param fn: the function to be called + + """ + try: + val = fn(*args) + if retry_count > 0 and retry_count < MAX_RETRIES: + Log("Idempotent %s succeeded after %d retries" % + (msg, MAX_RETRIES - retry_count)) + return val + except Exception, err: + if retry_count == 0: + Log("Non-idempotent %s failed, aborting" % (msg, )) + raise + elif retry_count == 1: + Log("Idempotent %s repeated failure, aborting" % (msg, )) + raise + else: + Log("Idempotent %s failed, retry #%d/%d: %s" % + (msg, MAX_RETRIES - retry_count + 1, MAX_RETRIES, err)) + self.MaybeRetry(retry_count - 1, msg, fn, *args) + + def _ExecOp(self, *ops): """Execute one or more opcodes and manage the exec buffer. @result: if only opcode has been passed, we return its result; @@ -139,20 +179,48 @@ class Burner(object): else: return results + def ExecOp(self, retry, *ops): + """Execute one or more opcodes and manage the exec buffer. + + @result: if only opcode has been passed, we return its result; + otherwise we return the list of results + + """ + if retry: + rval = MAX_RETRIES + else: + rval = 0 + return self.MaybeRetry(rval, "opcode", self._ExecOp, *ops) + def ExecOrQueue(self, name, *ops): """Execute an opcode and manage the exec buffer.""" if self.opts.parallel: self.queued_ops.append((ops, name)) else: - return self.ExecOp(*ops) + return self.ExecOp(self.queue_retry, *ops) + + def StartBatch(self, retry): + """Start a new batch of jobs. 
+ + @param retry: whether this is a retryable batch + + """ + self.queued_ops = [] + self.queue_retry = retry def CommitQueue(self): """Execute all submitted opcodes in case of parallel burnin""" if not self.opts.parallel: return + if self.queue_retry: + rval = MAX_RETRIES + else: + rval = 0 + try: - results = self.ExecJobSet(self.queued_ops) + results = self.MaybeRetry(rval, "jobset", self.ExecJobSet, + self.queued_ops) finally: self.queued_ops = [] return results @@ -171,10 +239,45 @@ class Burner(object): results = [] for jid, (_, iname) in zip(job_ids, jobs): Log("waiting for job %s for %s" % (jid, iname), indent=2) - results.append(cli.PollJob(jid, cl=self.cl, feedback_fn=self.Feedback)) - + try: + results.append(cli.PollJob(jid, cl=self.cl, feedback_fn=self.Feedback)) + except Exception, err: + Log("Job for %s failed: %s" % (iname, err)) + if len(results) != len(jobs): + raise BurninFailure() return results + def _DoCheckInstances(fn): + """Decorator for checking instances. + + """ + def wrapper(self, *args, **kwargs): + val = fn(self, *args, **kwargs) + for instance in self.instances: + self._CheckInstanceAlive(instance) + return val + + return wrapper + + def _DoBatch(retry): + """Decorator for possible batch operations. + + Must come after the _DoCheckInstances decorator (if any). + + @param retry: whether this is a retryable batch, will be + passed to StartBatch + + """ + def wrap(fn): + def batched(self, *args, **kwargs): + self.StartBatch(retry) + val = fn(self, *args, **kwargs) + self.CommitQueue() + return val + return batched + + return wrap + def ParseOptions(self): """Parses the command line options. 
@@ -325,14 +428,14 @@ class Burner(object): try: op = opcodes.OpQueryNodes(output_fields=["name", "offline", "drained"], names=names, use_locking=True) - result = self.ExecOp(op) + result = self.ExecOp(True, op) except errors.GenericError, err: err_code, msg = cli.FormatError(err) Err(msg, exit_code=err_code) self.nodes = [data[0] for data in result if not (data[1] or data[2])] - result = self.ExecOp(opcodes.OpDiagnoseOS(output_fields=["name", "valid"], - names=[])) + op_diagos = opcodes.OpDiagnoseOS(output_fields=["name", "valid"], names=[]) + result = self.ExecOp(True, op_diagos) if not result: Err("Can't get the OS list") @@ -343,6 +446,8 @@ class Burner(object): if self.opts.os not in os_set: Err("OS '%s' not found" % self.opts.os) + @_DoCheckInstances + @_DoBatch(False) def BurnCreateInstances(self): """Create the given instances. @@ -388,11 +493,7 @@ class Burner(object): self.ExecOrQueue(instance, op) self.to_rem.append(instance) - self.CommitQueue() - - for instance in self.instances: - self._CheckInstanceAlive(instance) - + @_DoBatch(False) def BurnGrowDisks(self): """Grow both the os and the swap disks by the requested amount, if any.""" Log("Growing disks") @@ -404,8 +505,8 @@ class Burner(object): amount=growth, wait_for_sync=True) Log("increase disk/%s by %s MB" % (idx, growth), indent=2) self.ExecOrQueue(instance, op) - self.CommitQueue() + @_DoBatch(True) def BurnReplaceDisks1D8(self): """Replace disks on primary and secondary for drbd8.""" Log("Replacing disks on the same nodes") @@ -419,8 +520,8 @@ class Burner(object): Log("run %s" % mode, indent=2) ops.append(op) self.ExecOrQueue(instance, *ops) - self.CommitQueue() + @_DoBatch(True) def BurnReplaceDisks2(self): """Replace secondary node.""" Log("Changing the secondary node") @@ -442,8 +543,9 @@ class Burner(object): disks=[i for i in range(self.disk_count)]) Log("run %s %s" % (mode, msg), indent=2) self.ExecOrQueue(instance, op) - self.CommitQueue() + @_DoCheckInstances + @_DoBatch(False) def 
BurnFailover(self): """Failover the instances.""" Log("Failing over instances") @@ -453,10 +555,8 @@ class Burner(object): ignore_consistency=False) self.ExecOrQueue(instance, op) - self.CommitQueue() - for instance in self.instances: - self._CheckInstanceAlive(instance) + @_DoBatch(False) def BurnMigrate(self): """Migrate the instances.""" Log("Migrating instances") @@ -469,8 +569,9 @@ class Burner(object): cleanup=True) Log("migration and migration cleanup", indent=2) self.ExecOrQueue(instance, op1, op2) - self.CommitQueue() + @_DoCheckInstances + @_DoBatch(False) def BurnImportExport(self): """Export the instance, delete it, and import it back. @@ -486,7 +587,7 @@ class Burner(object): # read the full name of the instance nam_op = opcodes.OpQueryInstances(output_fields=["name"], names=[instance], use_locking=True) - full_name = self.ExecOp(nam_op)[0][0] + full_name = self.ExecOp(False, nam_op)[0][0] if self.opts.iallocator: pnode = snode = None @@ -535,10 +636,6 @@ class Burner(object): Log("remove export", indent=2) self.ExecOrQueue(instance, exp_op, rem_op, imp_op, erem_op) - self.CommitQueue() - for instance in self.instances: - self._CheckInstanceAlive(instance) - def StopInstanceOp(self, instance): """Stop given instance.""" return opcodes.OpShutdownInstance(instance_name=instance) @@ -552,6 +649,8 @@ class Burner(object): return opcodes.OpRenameInstance(instance_name=instance, new_name=instance_new) + @_DoCheckInstances + @_DoBatch(True) def BurnStopStart(self): """Stop/start the instances.""" Log("Stopping and starting instances") @@ -561,11 +660,7 @@ class Burner(object): op2 = self.StartInstanceOp(instance) self.ExecOrQueue(instance, op1, op2) - self.CommitQueue() - - for instance in self.instances: - self._CheckInstanceAlive(instance) - + @_DoBatch(False) def BurnRemove(self): """Remove the instances.""" Log("Removing instances") @@ -575,8 +670,6 @@ class Burner(object): ignore_failures=True) self.ExecOrQueue(instance, op) - self.CommitQueue() - def 
BurnRename(self): """Rename the instances. @@ -594,11 +687,13 @@ class Burner(object): op_rename2 = self.RenameInstanceOp(rename, instance) op_start1 = self.StartInstanceOp(rename) op_start2 = self.StartInstanceOp(instance) - self.ExecOp(op_stop1, op_rename1, op_start1) + self.ExecOp(False, op_stop1, op_rename1, op_start1) self._CheckInstanceAlive(rename) - self.ExecOp(op_stop2, op_rename2, op_start2) + self.ExecOp(False, op_stop2, op_rename2, op_start2) self._CheckInstanceAlive(instance) + @_DoCheckInstances + @_DoBatch(True) def BurnReinstall(self): """Reinstall the instances.""" Log("Reinstalling instances") @@ -613,11 +708,8 @@ class Burner(object): op4 = self.StartInstanceOp(instance) self.ExecOrQueue(instance, op1, op2, op3, op4) - self.CommitQueue() - - for instance in self.instances: - self._CheckInstanceAlive(instance) - + @_DoCheckInstances + @_DoBatch(True) def BurnReboot(self): """Reboot the instances.""" Log("Rebooting instances") @@ -632,11 +724,8 @@ class Burner(object): ops.append(op) self.ExecOrQueue(instance, *ops) - self.CommitQueue() - - for instance in self.instances: - self._CheckInstanceAlive(instance) - + @_DoCheckInstances + @_DoBatch(True) def BurnActivateDisks(self): """Activate and deactivate disks of the instances.""" Log("Activating/deactivating disks") @@ -650,10 +739,9 @@ class Burner(object): Log("activate disks when offline", indent=2) Log("deactivate disks (when offline)", indent=2) self.ExecOrQueue(instance, op_act, op_stop, op_act, op_deact, op_start) - self.CommitQueue() - for instance in self.instances: - self._CheckInstanceAlive(instance) + @_DoCheckInstances + @_DoBatch(False) def BurnAddRemoveDisks(self): """Add and remove an extra disk for the instances.""" Log("Adding and removing disks") @@ -669,10 +757,8 @@ class Burner(object): Log("adding a disk", indent=2) Log("removing last disk", indent=2) self.ExecOrQueue(instance, op_add, op_stop, op_rem, op_start) - self.CommitQueue() - for instance in self.instances: - 
self._CheckInstanceAlive(instance) + @_DoBatch(False) def BurnAddRemoveNICs(self): """Add and remove an extra NIC for the instances.""" Log("Adding and removing NICs") @@ -685,7 +771,6 @@ class Burner(object): Log("adding a NIC", indent=2) Log("removing last NIC", indent=2) self.ExecOrQueue(instance, op_add, op_rem) - self.CommitQueue() def _CheckInstanceAlive(self, instance): """Check if an instance is alive by doing http checks. @@ -784,7 +869,14 @@ class Burner(object): Log(self.GetFeedbackBuf()) Log("\n\n") if not self.opts.keep_instances: - self.BurnRemove() + try: + self.BurnRemove() + except Exception, err: + if has_err: # already detected errors, so errors in removal + # are quite expected + Log("Note: error detected during instance remove: %s" % str(err)) + else: # non-expected error + raise return 0 diff --git a/tools/lvmstrap b/tools/lvmstrap index ed92a12789f1feef1b7c79159d7d8e293a071c9d..73b4834e65e6ffe6c5b7570112fe66e7c0dd51bb 100755 --- a/tools/lvmstrap +++ b/tools/lvmstrap @@ -46,6 +46,7 @@ import time from ganeti.utils import RunCmd, ReadFile from ganeti import constants +from ganeti import cli USAGE = ("\tlvmstrap diskinfo\n" "\tlvmstrap [--vgname=NAME] [--allow-removable]" @@ -485,8 +486,15 @@ def ShowDiskInfo(opts): dlist = GetDiskList(opts) print "------- Disk information -------" - print ("%5s %7s %4s %5s %-10s %s" % - ("Name", "Size[M]", "Used", "Mount", "LVM?", "Info")) + headers = { + "name": "Name", + "size": "Size[M]", + "used": "Used", + "mount": "Mount", + "lvm": "LVM?", + "info": "Info" + } + fields = ["name", "size", "used", "mount", "lvm", "info"] flatlist = [] # Flatten the [(disk, [partition,...]), ...] 
list @@ -499,6 +507,7 @@ def ShowDiskInfo(opts): for partname, partsize, partdev in parts: flatlist.append((partname, partsize, partdev, "")) + strlist = [] for name, size, dev, in_use in flatlist: mp, vgname, fileinfo = DevInfo(name, dev, mounts) if mp is None: @@ -513,8 +522,15 @@ def ShowDiskInfo(opts): if len(name) > 3: # Indent partitions name = " %s" % name - print ("%-5s %7.2f %-4s %-5s %-10s %s" % - (name, float(size) / 1024 / 1024, in_use, mp, lvminfo, fileinfo)) + + strlist.append([name, "%.2f" % (float(size) / 1024 / 1024), + in_use, mp, lvminfo, fileinfo]) + + data = cli.GenerateTable(headers, fields, None, + strlist, numfields=["size"]) + + for line in data: + print line def CheckReread(name):