diff --git a/Makefile.am b/Makefile.am
index b803aaab070db4266984a641d4fca54bd131336b..b90074cac847cc643677f3253e7392af44bf14dc 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -330,6 +330,7 @@ $(REPLACE_VARS_SED): Makefile stamp-directories
 	  echo 's#@CUSTOM_XEN_KERNEL@#$(XEN_KERNEL)#g'; \
 	  echo 's#@CUSTOM_XEN_INITRD@#$(XEN_INITRD)#g'; \
 	  echo 's#@RPL_FILE_STORAGE_DIR@#$(FILE_STORAGE_DIR)#g'; \
+	  echo 's#@PKGLIBDIR@#$(pkglibdir)#g'; \
 	} > $@
 
 # We need to create symlinks because "make distcheck" will not install Python
diff --git a/NEWS b/NEWS
index cbbf76a78c842add206e4ed73dc8dd07847c047d..f1852058928bc3d3952286c098ae7309067ad30b 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,15 @@
+Version 2.0.3
+  - Added “--ignore-size” to the “gnt-instance activate-disks” command
+    to allow using the pre-2.0.2 behaviour in activation, if any
+    existing instances have mismatched disk sizes in the configuration
+  - Added “gnt-cluster repair-disk-sizes” command to check and update
+    any configuration mismatches for disk sizes
+  - Added “gnt-cluster masterfailover --no-voting” to allow master
+    failover to work on two-node clusters
+  - Fixed the “--net” option of “gnt-backup import”, which was unusable
+  - Fixed detection of OS script errors in “gnt-backup export”
+  - Fixed exit code of “gnt-backup export”
+
 Version 2.0.2
   - Added experimental support for striped logical volumes; this should
     enhance performance but comes with a higher complexity in the block
diff --git a/configure.ac b/configure.ac
index 12ba06df08c9a98d679eb62c99f6bb6c5bb2166b..6d3b906c1e6da552068b78fe9b05b01cd443ccb9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
 # Configure script for Ganeti
 m4_define([gnt_version_major], [2])
 m4_define([gnt_version_minor], [0])
-m4_define([gnt_version_revision], [2])
+m4_define([gnt_version_revision], [3])
 m4_define([gnt_version_suffix], [])
 m4_define([gnt_version_full],
           m4_format([%d.%d.%d%s],
diff --git a/daemons/ganeti-masterd b/daemons/ganeti-masterd
index d9b8e74021e1afcb6363ac9bd7c49230a9ba891c..51e01ba15b04a60abc8fed3826f3d6472b3a4ca9 100755
--- a/daemons/ganeti-masterd
+++ b/daemons/ganeti-masterd
@@ -524,7 +524,7 @@ def main():
   try:
     # activate ip
     master_node = ssconf.SimpleStore().GetMasterNode()
-    if not rpc.RpcRunner.call_node_start_master(master_node, False):
+    if not rpc.RpcRunner.call_node_start_master(master_node, False, False):
       logging.error("Can't activate master IP address")
 
     master.setup_queue()
diff --git a/daemons/ganeti-noded b/daemons/ganeti-noded
index fada1fe514cd3ebfd10efa75172816db730d2931..34904724fb9ff9df6144f205a0800dad44d5f2bc 100755
--- a/daemons/ganeti-noded
+++ b/daemons/ganeti-noded
@@ -232,6 +232,14 @@ class NodeHttpServer(http.server.HttpServer):
     disks = [objects.Disk.FromDict(cf) for cf in params[1]]
     return backend.BlockdevClose(params[0], disks)
 
+  @staticmethod
+  def perspective_blockdev_getsize(params):
+    """Compute the sizes of the given block devices.
+
+    """
+    disks = [objects.Disk.FromDict(cf) for cf in params[0]]
+    return backend.BlockdevGetsize(disks)
+
   # blockdev/drbd specific methods ----------
 
   @staticmethod
@@ -520,7 +528,7 @@ class NodeHttpServer(http.server.HttpServer):
     """Promote this node to master status.
""" - return backend.StartMaster(params[0]) + return backend.StartMaster(params[0], params[1]) @staticmethod def perspective_node_stop_master(params): diff --git a/devel/upload.in b/devel/upload.in index 94759e60f9c7b8276c4ccb583aae446b7cf334c4..733befa338d9e9d7c50a9340297fbeab602cb942 100644 --- a/devel/upload.in +++ b/devel/upload.in @@ -27,14 +27,22 @@ set -e +PREFIX='@PREFIX@' +SYSCONFDIR='@SYSCONFDIR@' +PKGLIBDIR='@PKGLIBDIR@' + NO_RESTART= +NO_CRON= hosts= while [ "$#" -gt 0 ]; do opt="$1" case "$opt" in --no-restart) NO_RESTART=1 - ;; + ;; + --no-cron) + NO_CRON=1 + ;; -h|--help) echo "Usage: $0 [--no-restart] hosts..." exit 0 @@ -42,10 +50,10 @@ while [ "$#" -gt 0 ]; do -*) echo "Unknown option: $opt" >&2 exit 1 - ;; + ;; *) hosts="$hosts $opt" - ;; + ;; esac shift done @@ -58,39 +66,36 @@ trap 'rm -rf $TXD' EXIT # install ganeti as a real tree make install DESTDIR="$TXD" +# copy additional needed files +install -D --mode=0755 doc/examples/ganeti.initd \ + "$TXD/$SYSCONFDIR/init.d/ganeti" + +install -D --mode=0644 doc/examples/bash_completion \ + "$TXD/$SYSCONFDIR/bash_completion.d/ganeti" + +if [ -z "$NO_CRON" ]; then + install -D --mode=0644 doc/examples/ganeti.cron \ + "$TXD/$SYSCONFDIR/cron.d/ganeti" +fi + +install -D --mode=0755 doc/examples/dumb-allocator \ + "$TXD/$PKGLIBDIR/iallocators/dumb" + echo --- ( cd "$TXD" && find; ) echo --- -PREFIX='@PREFIX@' - # and now put it under $prefix on the target node(s) for host; do echo Uploading code to ${host}... rsync -v -rlDc --exclude="*.py[oc]" --exclude="*.pdf" --exclude="*.html" \ - "$TXD/$PREFIX/" \ - root@${host}:$PREFIX/ & -done -wait - -INIT_SCRIPT="$TXD/ganeti.initd" -install --mode=0755 doc/examples/ganeti.initd $INIT_SCRIPT -for host; do - echo Uploading init script to ${host}... - scp $INIT_SCRIPT root@${host}:/etc/init.d/ganeti & + "$TXD/" \ + root@${host}:/ & done wait -if [ -f ganeti-master-cron ]; then - for host; do - echo Uploading cron files to ${host}... - scp ganeti-master-cron root@${host}:/etc/ganeti/master-cron & - done -fi -wait - if test -z "${NO_RESTART}"; then for host; do echo Restarting ganeti-noded on ${host}... diff --git a/doc/examples/bash_completion.in b/doc/examples/bash_completion.in index 09fe0eafb0ee9989854b2ddbdd562dabccf2dd5f..b52c10d6c280d7f07cefbedc462c5848add2450b 100644 --- a/doc/examples/bash_completion.in +++ b/doc/examples/bash_completion.in @@ -90,7 +90,7 @@ _gnt_cluster() if [[ -e "@LOCALSTATEDIR@/lib/ganeti/ssconf_cluster_name" ]]; then cmds="add-tags command copyfile destroy getmaster info list-tags \ masterfailover modify queue redist-conf remove-tags rename \ - search-tags verify verify-disks version" + repair-disk-sizes search-tags verify verify-disks version" else cmds="init" fi diff --git a/lib/backend.py b/lib/backend.py index dbc32f6f7b67cc38417f96fb5ac00161fd725db3..f4e308e651ad505818c89543ee98c202863ba5e1 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -169,7 +169,7 @@ def GetMasterInfo(): return (master_netdev, master_ip, master_node) -def StartMaster(start_daemons): +def StartMaster(start_daemons, no_voting): """Activate local node as master node. 
 
   The function will always try to activate the IP address of the master
@@ -179,6 +179,9 @@ def StartMaster(start_daemons):
   @type start_daemons: boolean
   @param start_daemons: whether to also start the master daemons
       (ganeti-masterd and ganeti-rapi)
+  @type no_voting: boolean
+  @param no_voting: whether to start ganeti-masterd without a node vote
+      (if start_daemons is True), but still non-interactively
   @rtype: None
 
   """
@@ -208,8 +211,17 @@ def StartMaster(start_daemons):
 
   # and now start the master and rapi daemons
   if start_daemons:
-    for daemon in 'ganeti-masterd', 'ganeti-rapi':
-      result = utils.RunCmd([daemon])
+    daemons_params = {
+        'ganeti-masterd': [],
+        'ganeti-rapi': [],
+        }
+    if no_voting:
+      daemons_params['ganeti-masterd'].append('--no-voting')
+      daemons_params['ganeti-masterd'].append('--yes-do-it')
+    for daemon in daemons_params:
+      cmd = [daemon]
+      cmd.extend(daemons_params[daemon])
+      result = utils.RunCmd(cmd)
       if result.failed:
         logging.error("Can't start daemon %s: %s", daemon, result.output)
         ok = False
@@ -1452,6 +1464,32 @@ def BlockdevFind(disk):
   return (True, (rbd.dev_path, rbd.major, rbd.minor) + rbd.GetSyncStatus())
 
 
+def BlockdevGetsize(disks):
+  """Computes the size of the given disks.
+
+  If a disk is not found, returns None instead.
+
+  @type disks: list of L{objects.Disk}
+  @param disks: the list of disks to compute the size for
+  @rtype: list
+  @return: list with elements None if the disk cannot be found,
+      otherwise the size
+
+  """
+  result = []
+  for cf in disks:
+    try:
+      rbd = _RecursiveFindBD(cf)
+    except errors.BlockDeviceError, err:
+      result.append(None)
+      continue
+    if rbd is None:
+      result.append(None)
+    else:
+      result.append(rbd.GetActualSize())
+  return result
+
+
 def UploadFile(file_name, data, mode, uid, gid, atime, mtime):
   """Write a file to the filesystem.
@@ -1815,8 +1853,8 @@ def ExportSnapshot(disk, dest_node, instance, cluster_name, idx):
   # the target command is built out of three individual commands,
   # which are joined by pipes; we check each individual command for
   # valid parameters
-  expcmd = utils.BuildShellCmd("cd %s; %s 2>%s", inst_os.path,
-                               export_script, logfile)
+  expcmd = utils.BuildShellCmd("set -e; set -o pipefail; cd %s; %s 2>%s",
+                               inst_os.path, export_script, logfile)
 
   comprcmd = "gzip"
@@ -1829,7 +1867,7 @@ def ExportSnapshot(disk, dest_node, instance, cluster_name, idx):
   # all commands have been checked, so we're safe to combine them
   command = '|'.join([expcmd, comprcmd, utils.ShellQuoteArgs(remotecmd)])
 
-  result = utils.RunCmd(command, env=export_env)
+  result = utils.RunCmd(["bash", "-c", command], env=export_env)
 
   if result.failed:
     logging.error("os snapshot export command '%s' returned error: %s"
diff --git a/lib/bdev.py b/lib/bdev.py
index 9ac41e50e6db08d3b7d99c5bf0d2ed185f5825cc..7055ccb01262f76afa34bad6f50f65d3d5527863 100644
--- a/lib/bdev.py
+++ b/lib/bdev.py
@@ -277,6 +277,23 @@ class BlockDev(object):
     """
     raise NotImplementedError
 
+  def GetActualSize(self):
+    """Return the actual disk size.
+
+    @note: the device needs to be active when this is called
+
+    """
+    assert self.attached, "BlockDevice not attached in GetActualSize()"
+    result = utils.RunCmd(["blockdev", "--getsize64", self.dev_path])
+    if result.failed:
+      _ThrowError("blockdev failed (%s): %s",
+                  result.fail_reason, result.output)
+    try:
+      sz = int(result.output.strip())
+    except (ValueError, TypeError), err:
+      _ThrowError("Failed to parse blockdev output: %s", str(err))
+    return sz
+
   def __repr__(self):
     return ("<%s: unique_id: %s, children: %s, %s:%s, %s>" %
             (self.__class__, self.unique_id, self._children,
@@ -1129,9 +1146,10 @@ class DRBD8(BaseDRBD):
 
     """
    args = ["drbdsetup", cls._DevPath(minor), "disk",
-            backend, meta, "0",
-            "-d", "%sm" % size,
-            "-e", "detach", "--create-device"]
+            backend, meta, "0",
+            "-e", "detach", "--create-device"]
+    if size:
+      args.extend(["-d", "%sm" % size])
     result = utils.RunCmd(args)
     if result.failed:
       _ThrowError("drbd%d: can't attach local disk: %s", minor, result.output)
@@ -1727,6 +1745,19 @@ class FileStorage(BlockDev):
     self.attached = os.path.exists(self.dev_path)
     return self.attached
 
+  def GetActualSize(self):
+    """Return the actual disk size.
+
+    @note: the device needs to be active when this is called
+
+    """
+    assert self.attached, "BlockDevice not attached in GetActualSize()"
+    try:
+      st = os.stat(self.dev_path)
+      return st.st_size
+    except OSError, err:
+      _ThrowError("Can't stat %s: %s", self.dev_path, err)
+
   @classmethod
   def Create(cls, unique_id, children, size):
     """Create a new file.
diff --git a/lib/bootstrap.py b/lib/bootstrap.py
index 7cd0d92ef9ddb97a37c774f47f56883fa7dee8d4..fda1c2ee64dc0f7b711c2c2c6668fcee637c5762 100644
--- a/lib/bootstrap.py
+++ b/lib/bootstrap.py
@@ -275,7 +275,7 @@ def InitCluster(cluster_name, mac_prefix, def_bridge,
 
   # start the master ip
   # TODO: Review rpc call from bootstrap
-  rpc.RpcRunner.call_node_start_master(hostname.name, True)
+  rpc.RpcRunner.call_node_start_master(hostname.name, True, False)
 
 
 def InitConfig(version, cluster_config, master_node_config,
@@ -453,8 +453,7 @@ def MasterFailover(no_voting=False):
   # cluster info
   cfg.Update(cluster_info)
 
-  # 2.0.X: Don't start the master if no_voting is true
-  result = rpc.RpcRunner.call_node_start_master(new_master, not no_voting)
+  result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
   if result.failed or not result.data:
     logging.error("Could not start the master role on the new master"
                   " %s, please check", new_master)
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index 67535363aad147bee70009965d4a9456559f8145..ae2267ad560c75fb0cd57b47bf1ea02eddbbdf32 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -1325,6 +1325,100 @@ class LUVerifyDisks(NoHooksLU):
     return result
 
 
+class LURepairDiskSizes(NoHooksLU):
+  """Verifies the cluster disk sizes.
+
+  """
+  _OP_REQP = ["instances"]
+  REQ_BGL = False
+
+  def ExpandNames(self):
+    if not isinstance(self.op.instances, list):
+      raise errors.OpPrereqError("Invalid argument type 'instances'")
+
+    if self.op.instances:
+      self.wanted_names = []
+      for name in self.op.instances:
+        full_name = self.cfg.ExpandInstanceName(name)
+        if full_name is None:
+          raise errors.OpPrereqError("Instance '%s' not known" % name)
+        self.wanted_names.append(full_name)
+      self.needed_locks = {
+        locking.LEVEL_NODE: [],
+        locking.LEVEL_INSTANCE: self.wanted_names,
+        }
+      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
+    else:
+      self.wanted_names = None
+      self.needed_locks = {
+        locking.LEVEL_NODE: locking.ALL_SET,
+        locking.LEVEL_INSTANCE: locking.ALL_SET,
+        }
+    self.share_locks = dict(((i, 1) for i in locking.LEVELS))
+
+  def DeclareLocks(self, level):
+    if level == locking.LEVEL_NODE and self.wanted_names is not None:
+      self._LockInstancesNodes(primary_only=True)
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This only checks the optional instance list against the existing names.
+
+    """
+    if self.wanted_names is None:
+      self.wanted_names = self.acquired_locks[locking.LEVEL_INSTANCE]
+
+    self.wanted_instances = [self.cfg.GetInstanceInfo(name) for name
+                             in self.wanted_names]
+
+  def Exec(self, feedback_fn):
+    """Verify the size of cluster disks.
+
+    """
+    # TODO: check child disks too
+    # TODO: check differences in size between primary/secondary nodes
+    per_node_disks = {}
+    for instance in self.wanted_instances:
+      pnode = instance.primary_node
+      if pnode not in per_node_disks:
+        per_node_disks[pnode] = []
+      for idx, disk in enumerate(instance.disks):
+        per_node_disks[pnode].append((instance, idx, disk))
+
+    changed = []
+    for node, dskl in per_node_disks.items():
+      result = self.rpc.call_blockdev_getsizes(node, [v[2] for v in dskl])
+      if result.failed:
+        self.LogWarning("Failure in blockdev_getsizes call to node"
+                        " %s, ignoring", node)
+        continue
+      if len(result.data) != len(dskl):
+        self.LogWarning("Invalid result from node %s, ignoring node results",
+                        node)
+        continue
+      for ((instance, idx, disk), size) in zip(dskl, result.data):
+        if size is None:
+          self.LogWarning("Disk %d of instance %s did not return size"
+                          " information, ignoring", idx, instance.name)
+          continue
+        if not isinstance(size, (int, long)):
+          self.LogWarning("Disk %d of instance %s did not return valid"
+                          " size information, ignoring", idx, instance.name)
+          continue
+        size = size >> 20
+        if size != disk.size:
+          self.LogInfo("Disk %d of instance %s has mismatched size,"
+                       " correcting: recorded %d, actual %d", idx,
+                       instance.name, disk.size, size)
+          disk.size = size
+          self.cfg.Update(instance)
+          changed.append((instance.name, idx, size))
+    return changed
+
+
 class LURenameCluster(LogicalUnit):
   """Rename the cluster.
@@ -1399,7 +1493,7 @@ class LURenameCluster(LogicalUnit):
                                                  constants.SSH_KNOWN_HOSTS_FILE,
                                                  to_node)
     finally:
-      result = self.rpc.call_node_start_master(master, False)
+      result = self.rpc.call_node_start_master(master, False, False)
       if result.failed or not result.data:
         self.LogWarning("Could not re-enable the master role on"
                         " the master, please restart manually.")
@@ -2595,19 +2689,24 @@ class LUActivateInstanceDisks(NoHooksLU):
     assert self.instance is not None, \
       "Cannot retrieve locked instance %s" % self.op.instance_name
     _CheckNodeOnline(self, self.instance.primary_node)
+    if not hasattr(self.op, "ignore_size"):
+      self.op.ignore_size = False
 
   def Exec(self, feedback_fn):
     """Activate the disks.
 
     """
-    disks_ok, disks_info = _AssembleInstanceDisks(self, self.instance)
+    disks_ok, disks_info = \
+              _AssembleInstanceDisks(self, self.instance,
+                                     ignore_size=self.op.ignore_size)
     if not disks_ok:
       raise errors.OpExecError("Cannot activate block devices")
 
     return disks_info
 
 
-def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False):
+def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False,
+                           ignore_size=False):
   """Prepare the block devices for an instance.
 
   This sets up the block devices on all nodes.
@@ -2619,6 +2718,10 @@ def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False):
   @type ignore_secondaries: boolean
   @param ignore_secondaries: if true, errors on secondary nodes
       won't result in an error return from the function
+  @type ignore_size: boolean
+  @param ignore_size: if true, the current known size of the disk
+      will not be used during the disk activation, useful for cases
+      when the size is wrong
   @return: False if the operation failed, otherwise a list of
       (host, instance_visible_name, node_visible_name)
       with the mapping from node devices to instance devices
@@ -2639,6 +2742,9 @@ def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False):
   # 1st pass, assemble on all nodes in secondary mode
   for inst_disk in instance.disks:
     for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
+      if ignore_size:
+        node_disk = node_disk.Copy()
+        node_disk.UnsetSize()
       lu.cfg.SetDiskID(node_disk, node)
       result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, False)
       msg = result.RemoteFailMsg()
@@ -2656,6 +2762,9 @@ def _AssembleInstanceDisks(lu, instance, ignore_secondaries=False):
     for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
       if node != instance.primary_node:
         continue
+      if ignore_size:
+        node_disk = node_disk.Copy()
+        node_disk.UnsetSize()
       lu.cfg.SetDiskID(node_disk, node)
       result = lu.rpc.call_blockdev_assemble(node, node_disk, iname, True)
       msg = result.RemoteFailMsg()
@@ -6365,6 +6474,8 @@ class LUExportInstance(LogicalUnit):
     for disk in instance.disks:
       self.cfg.SetDiskID(disk, src_node)
 
+    # per-disk results
+    dresults = []
     try:
       for idx, disk in enumerate(instance.disks):
         # new_dev_name will be a snapshot of an lvm leaf of the one we passed
@@ -6398,15 +6509,22 @@ class LUExportInstance(LogicalUnit):
         if result.failed or not result.data:
           self.LogWarning("Could not export disk/%d from node %s to"
                           " node %s", idx, src_node, dst_node.name)
+          dresults.append(False)
+        else:
+          dresults.append(True)
 
         msg = self.rpc.call_blockdev_remove(src_node, dev).RemoteFailMsg()
         if msg:
           self.LogWarning("Could not remove snapshot for disk/%d from node"
                           " %s: %s", idx, src_node, msg)
+      else:
+        dresults.append(False)
 
     result = self.rpc.call_finalize_export(dst_node.name, instance,
                                            snap_disks)
+    fin_resu = True
     if result.failed or not result.data:
self.LogWarning("Could not finalize export for instance %s on node %s", instance.name, dst_node.name) + fin_resu = False nodelist = self.cfg.GetNodeList() nodelist.remove(dst_node.name) @@ -6423,6 +6541,7 @@ class LUExportInstance(LogicalUnit): if not self.rpc.call_export_remove(node, instance.name): self.LogWarning("Could not remove older export for instance %s" " on node %s", instance.name, node) + return fin_resu, dresults class LURemoveExport(NoHooksLU): diff --git a/lib/mcpu.py b/lib/mcpu.py index 67c5e95770bd0f1741586fcac85fe869484c09d4..959a83785331c847569a3f2f0c083f838a76cdf9 100644 --- a/lib/mcpu.py +++ b/lib/mcpu.py @@ -50,6 +50,7 @@ class Processor(object): opcodes.OpVerifyDisks: cmdlib.LUVerifyDisks, opcodes.OpSetClusterParams: cmdlib.LUSetClusterParams, opcodes.OpRedistributeConfig: cmdlib.LURedistributeConfig, + opcodes.OpRepairDiskSizes: cmdlib.LURepairDiskSizes, # node lu opcodes.OpAddNode: cmdlib.LUAddNode, opcodes.OpQueryNodes: cmdlib.LUQueryNodes, diff --git a/lib/objects.py b/lib/objects.py index 5c2ff3d1aeefa64c7c9506da6e0ba524535e635a..acb24d84b6c6f0654dfc045668b35af5df012052 100644 --- a/lib/objects.py +++ b/lib/objects.py @@ -153,6 +153,14 @@ class ConfigObject(object): " _ContainerFromDicts" % c_type) return ret + def Copy(self): + """Makes a deep copy of the current object and its children. + + """ + dict_form = self.ToDict() + clone_obj = self.__class__.FromDict(dict_form) + return clone_obj + def __repr__(self): """Implement __repr__ for ConfigObjects.""" return repr(self.ToDict()) @@ -388,6 +396,15 @@ class Disk(ConfigObject): raise errors.ProgrammerError("Disk.RecordGrow called for unsupported" " disk type %s" % self.dev_type) + def UnsetSize(self): + """Sets recursively the size to zero for the disk and its children. + + """ + if self.children: + for child in self.children: + child.UnsetSize() + self.size = 0 + def SetPhysicalID(self, target_node, nodes_ip): """Convert the logical ID to the physical ID. diff --git a/lib/opcodes.py b/lib/opcodes.py index 6ba30f614e2adde716d4f0642e869c05c28f4f2e..6a86477adfdc7163672e6952c47bc3433be5102a 100644 --- a/lib/opcodes.py +++ b/lib/opcodes.py @@ -224,6 +224,26 @@ class OpVerifyDisks(OpCode): __slots__ = [] +class OpRepairDiskSizes(OpCode): + """Verify the disk sizes of the instances and fixes configuration + mimatches. + + Parameters: optional instances list, in case we want to restrict the + checks to only a subset of the instances. + + Result: a list of tuples, (instance, disk, new-size) for changed + configurations. + + In normal operation, the list should be empty. 
+
+  @type instances: list
+  @ivar instances: the list of instances to check, or empty for all instances
+
+  """
+  OP_ID = "OP_CLUSTER_REPAIR_DISK_SIZES"
+  __slots__ = ["instances"]
+
+
 class OpQueryConfigValues(OpCode):
   """Query cluster configuration values."""
   OP_ID = "OP_CLUSTER_CONFIG_QUERY"
@@ -433,7 +453,7 @@ class OpActivateInstanceDisks(OpCode):
   """Activate an instance's disks."""
   OP_ID = "OP_INSTANCE_ACTIVATE_DISKS"
   OP_DSC_FIELD = "instance_name"
-  __slots__ = ["instance_name"]
+  __slots__ = ["instance_name", "ignore_size"]
 
 
 class OpDeactivateInstanceDisks(OpCode):
diff --git a/lib/rpc.py b/lib/rpc.py
index b654f6f1d1c6c63e96be88b07affecf000d8a818..e13885903c597c7ffdab6b252ad8efdec48a6750 100644
--- a/lib/rpc.py
+++ b/lib/rpc.py
@@ -681,14 +681,14 @@ class RpcRunner(object):
                                      [checkdict, cluster_name])
 
   @classmethod
-  def call_node_start_master(cls, node, start_daemons):
+  def call_node_start_master(cls, node, start_daemons, no_voting):
     """Tells a node to activate itself as a master.
 
     This is a single-node call.
 
     """
     return cls._StaticSingleNodeCall(node, "node_start_master",
-                                     [start_daemons])
+                                     [start_daemons, no_voting])
 
   @classmethod
   def call_node_stop_master(cls, node, stop_daemons):
@@ -806,6 +806,15 @@ class RpcRunner(object):
     params = [instance_name, [cf.ToDict() for cf in disks]]
     return self._SingleNodeCall(node, "blockdev_close", params)
 
+  def call_blockdev_getsizes(self, node, disks):
+    """Returns the size of the given disks.
+
+    This is a single-node call.
+
+    """
+    params = [[cf.ToDict() for cf in disks]]
+    return self._SingleNodeCall(node, "blockdev_getsize", params)
+
   def call_drbd_disconnect_net(self, node_list, nodes_ip, disks):
     """Disconnects the network of the given drbd devices.
diff --git a/man/gnt-backup.sgml b/man/gnt-backup.sgml
index d432b6ef6eda9190005f9752ce3df65579ea7c5c..ae3122ac661d4892b0c2064e38177e55b94c06e2 100644
--- a/man/gnt-backup.sgml
+++ b/man/gnt-backup.sgml
@@ -82,6 +82,16 @@
       in the exported dump.
     </para>
 
+    <para>
+      The exit code of the command is 0 if all disks were backed up
+      successfully, 1 if no data was backed up or if the
+      configuration export failed, and 2 if just some of the disks
+      failed to back up. The exact details of the failures will be
+      shown during the command execution (and will be stored in the
+      job log). For any non-zero exit code, the backup should be
+      considered invalid and retried.
+    </para>
+
     <para>
       Example:
       <screen>
diff --git a/man/gnt-cluster.sgml b/man/gnt-cluster.sgml
index 98ec7db76eb603337294d680669b7b7991f990f4..b7fecf0bef6ff76fb133e98bdbf8d3a55e5b2945 100644
--- a/man/gnt-cluster.sgml
+++ b/man/gnt-cluster.sgml
@@ -460,14 +460,6 @@
       and gnt-cluster redist-conf to make sure the cluster is consistent again.
     </para>
 
-    <para>
-      In version 2.0.X ganeti-masterd will not be able to start if
-      masterfailover is called with the --no-voting option (which, again,
-      should only be used on 2 nodes clusters with the former master being
-      down). In that case just start it manually passing --no-voting to it
-      as well, until you have restored cluster redundancy.
-    </para>
-
   </refsect2>
 
   <refsect2>
@@ -618,6 +610,38 @@
     </para>
   </refsect2>
 
+  <refsect2>
+    <title>REPAIR-DISK-SIZES</title>
+
+    <cmdsynopsis>
+      <command>repair-disk-sizes</command>
+      <arg rep="repeat">instance</arg>
+    </cmdsynopsis>
+
+    <para>
+      This command checks that the recorded size of the given
+      instance's disks matches the actual size and updates any
+      mismatches found. This is needed if the Ganeti configuration
+      is no longer consistent with reality, as it will impact some
+      disk operations. If no arguments are given, all instances will
+      be checked.
+    </para>
+
+    <para>
+      Note that only active disks can be checked by this command; in
+      case a disk cannot be activated it's advised to use
+      <command>gnt-instance activate-disks --ignore-size
+      ...</command> to force activation without regard to the
+      current size.
+    </para>
+
+    <para>
+      When all disk sizes are consistent, the command will return no
+      output. Otherwise it will log details about the inconsistencies
+      in the configuration.
+    </para>
+  </refsect2>
+
   <refsect2>
     <title>SEARCH-TAGS</title>
diff --git a/man/gnt-instance.sgml b/man/gnt-instance.sgml
index c4dd54df32d8ae4ff199bdfbbb9f7195fb858ebf..a703d68b6a21136284a21b52f7043328a7ca3374 100644
--- a/man/gnt-instance.sgml
+++ b/man/gnt-instance.sgml
@@ -1673,6 +1673,7 @@ instance5: 11225
       <cmdsynopsis>
         <command>activate-disks</command>
         <arg>--submit</arg>
+        <arg>--ignore-size</arg>
         <arg choice="req"><replaceable>instance</replaceable></arg>
       </cmdsynopsis>
       <para>
@@ -1701,6 +1702,16 @@ node1.example.com:disk/1:/dev/drbd1
         <command>gnt-job info</command>.
       </para>
 
+      <para>
+        The <option>--ignore-size</option> option can be used to
+        activate disks ignoring the currently configured size in
+        Ganeti. This can be used in cases where the configuration
+        has gotten out of sync with the real world (e.g. after a
+        partially-failed grow-disk operation or due to rounding in
+        LVM devices). This should not be used in normal cases, but
+        only when activate-disks fails without it.
+      </para>
+
       <para>
         Note that it is safe to run this command while the instance
         is already running.
diff --git a/scripts/gnt-backup b/scripts/gnt-backup
index e78abbaee05178a57808167597523e47a324d0c0..cbbde51f4bfff6fb58b031f05ac40c6830444327 100755
--- a/scripts/gnt-backup
+++ b/scripts/gnt-backup
@@ -74,8 +74,27 @@ def ExportInstance(opts, args):
                                 target_node=opts.node,
                                 shutdown=opts.shutdown)
 
-  SubmitOpCode(op)
-
+  fin_resu, dlist = SubmitOpCode(op)
+  if not isinstance(dlist, list):
+    ToStderr("Cannot parse execution results")
+    return 1
+  tot_dsk = len(dlist)
+  # TODO: handle diskless instances
+  if dlist.count(False) == 0:
+    # all OK
+    rcode = 0
+  elif dlist.count(True) == 0:
+    ToStderr("Error: No disks were backed up successfully."
+             " The export doesn't have any valid data,"
+             " it is recommended to retry the operation.")
+    rcode = 1
+  else:
+    ToStderr("Partial export failure: %d disks backed up, %d disks failed.",
+             dlist.count(True), dlist.count(False))
+    rcode = 2
+  if not fin_resu:
+    rcode = 1
+  return rcode
 
 def ImportInstance(opts, args):
   """Add an instance to the cluster.
@@ -102,7 +121,7 @@ def ImportInstance(opts, args):
     except ValueError, err:
       raise errors.OpPrereqError("Invalid NIC index passed: %s" % str(err))
     nics = [{}] * nic_max
-    for nidx, ndict in opts.nics.items():
+    for nidx, ndict in opts.nics:
       nidx = int(nidx)
       nics[nidx] = ndict
   elif opts.no_nics:
diff --git a/scripts/gnt-cluster b/scripts/gnt-cluster
index f1541670509a24afaf6f285d696440dff9161c56..e6cb4f70cbf4cfb0745dd928f504d76cde04614e 100755
--- a/scripts/gnt-cluster
+++ b/scripts/gnt-cluster
@@ -404,6 +404,20 @@ def VerifyDisks(opts, args):
   return retcode
 
 
+def RepairDiskSizes(opts, args):
+  """Verify sizes of cluster disks.
+
+  @param opts: the command line options selected by the user
+  @type args: list
+  @param args: optional list of instances to restrict check to
+  @rtype: int
+  @return: the desired exit code
+
+  """
+  op = opcodes.OpRepairDiskSizes(instances=args)
+  SubmitOpCode(op)
+
+
 @UsesRPC
 def MasterFailover(opts, args):
   """Failover the master node.
@@ -616,6 +630,8 @@ commands = {
             "", "Does a check on the cluster configuration"),
   'verify-disks': (VerifyDisks, ARGS_NONE, [DEBUG_OPT],
                    "", "Does a check on the cluster disk status"),
+  'repair-disk-sizes': (RepairDiskSizes, ARGS_ANY, [DEBUG_OPT],
+                        "", "Updates mismatches in recorded disk sizes"),
   'masterfailover': (MasterFailover, ARGS_NONE, [DEBUG_OPT,
                      make_option("--no-voting", dest="no_voting",
                                  help="Skip node agreement check (dangerous)",
diff --git a/scripts/gnt-instance b/scripts/gnt-instance
index 3e8ce3f358ef52d88ca523959effa79eb2308cab..056337700f6d61bc503d623bfca802e2baf98373 100755
--- a/scripts/gnt-instance
+++ b/scripts/gnt-instance
@@ -632,7 +632,8 @@ def ActivateDisks(opts, args):
 
   """
   instance_name = args[0]
-  op = opcodes.OpActivateInstanceDisks(instance_name=instance_name)
+  op = opcodes.OpActivateInstanceDisks(instance_name=instance_name,
+                                       ignore_size=opts.ignore_size)
   disks_info = SubmitOrSend(op, opts)
   for host, iname, nname in disks_info:
     ToStdout("%s:%s:%s", host, iname, nname)
@@ -1508,7 +1509,14 @@ commands = {
                  SUBMIT_OPT,
                  ],
              "<instance>", "Reboots an instance"),
-  'activate-disks': (ActivateDisks, ARGS_ONE, [DEBUG_OPT, SUBMIT_OPT],
+  'activate-disks': (ActivateDisks, ARGS_ONE,
+                     [DEBUG_OPT, SUBMIT_OPT,
+                      make_option("--ignore-size", dest="ignore_size",
+                                  default=False, action="store_true",
+                                  help="Ignore current recorded size"
+                                  " (useful for forcing activation when"
+                                  " the recorded size is wrong)"),
+                      ],
                      "<instance>", "Activate an instance's disks"),
   'deactivate-disks': (DeactivateDisks, ARGS_ONE, [DEBUG_OPT, SUBMIT_OPT],
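
Usage sketch (not part of the patch itself): the commands below show how the
functionality changed above is driven from the command line. The instance and
node names are examples only; the commands, flags and exit-code semantics are
the ones added or documented by the hunks above.

  # check recorded vs. actual disk sizes and fix any configuration mismatches
  # (all instances are checked when no arguments are given)
  gnt-cluster repair-disk-sizes instance1.example.com

  # activate disks while ignoring the recorded (possibly wrong) size,
  # e.g. when plain activate-disks fails due to a size mismatch
  gnt-instance activate-disks --ignore-size instance1.example.com

  # master failover on a two-node cluster with the former master down
  gnt-cluster masterfailover --no-voting

  # export an instance; per the man page hunk, the exit code is 0 if all
  # disks were backed up, 1 if no usable data was exported, 2 on partial
  # failure; any non-zero code means the backup should be retried
  gnt-backup export -n node2.example.com instance1.example.com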