diff --git a/.gitignore b/.gitignore index 244b46c3eac20281f7c1655f740bfa7e0a67c67d..d27ccf01e111def18c6bf39e74542b0055010655 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ /doc/html /doc/install-quick.rst /doc/news.rst +/doc/upgrade.rst /doc/*.in /doc/*.png diff --git a/Makefile.am b/Makefile.am index fab0ee8dfd805fc43db9024713cff33b8b970655..68bf10f3b993ddac64522d34435b8637bc9458a5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -78,6 +78,7 @@ MAINTAINERCLEANFILES = \ $(maninput) \ doc/install-quick.rst \ doc/news.rst \ + doc/upgrade.rst \ vcs-version maintainer-clean-local: @@ -230,6 +231,7 @@ docrst = \ doc/news.rst \ doc/rapi.rst \ doc/security.rst \ + doc/upgrade.rst \ doc/walkthrough.rst $(RUN_IN_TEMPDIR): | $(all_dirfiles) @@ -249,15 +251,11 @@ doc/html/index.html: $(docrst) $(docpng) doc/conf.py configure.ac \ doc/html: doc/html/index.html +doc/install-quick.rst: INSTALL doc/news.rst: NEWS - set -e; \ - { echo '.. This file is automatically updated at build time from $<.'; \ - echo '.. Do not edit.'; \ - echo; \ - cat $<; \ - } > $@ +doc/upgrade.rst: UPGRADE -doc/install-quick.rst: INSTALL +doc/install-quick.rst doc/news.rst doc/upgrade.rst: set -e; \ { echo '.. This file is automatically updated at build time from $<.'; \ echo '.. Do not edit.'; \ @@ -353,6 +351,7 @@ pkglib_SCRIPTS = \ EXTRA_DIST = \ NEWS \ + UPGRADE \ pylintrc \ autotools/build-bash-completion \ autotools/check-python-code \ diff --git a/UPGRADE b/UPGRADE new file mode 100644 index 0000000000000000000000000000000000000000..21217a6938e1caf8a53213f1049ba41fc2e7eafb --- /dev/null +++ b/UPGRADE @@ -0,0 +1,259 @@ +Upgrade notes +============= + +.. highlight:: sh + +This document details the steps needed to upgrade a cluster to newer versions +of Ganeti. 
+ +As a general rule the node daemons need to be restarted after each software +upgrade; if using the provided example init.d script, this means running the +following command on all nodes:: + + /etc/init.d/ganeti restart + + +2.1 and above +------------- + +Starting with Ganeti 2.0, upgrades between revisions (e.g. 2.1.0 to 2.1.1) +should not need manual intervention. As a safety measure, minor releases (e.g. +2.1.3 to 2.2.0) require the ``cfgupgrade`` command for changing the +configuration version. Below you find the steps necessary to upgrade between +minor releases. + +To run commands on all nodes, the `distributed shell (dsh) +<http://www.netfort.gr.jp/~dancer/software/dsh.html.en>`_ can be used, e.g. +``dsh -M -F 8 -f /var/lib/ganeti/ssconf_online_nodes gnt-cluster --version``. + +#. Ensure no jobs are running (master node only):: + + gnt-job list + +#. Stop all daemons on all nodes:: + + /etc/init.d/ganeti stop + +#. Backup old configuration (master node only):: + + tar czf /var/lib/ganeti-$(date +%FT%T).tar.gz -C /var/lib ganeti + +#. Install new Ganeti version on all nodes +#. Run cfgupgrade on the master node:: + + /usr/lib/ganeti/tools/cfgupgrade --verbose --dry-run + /usr/lib/ganeti/tools/cfgupgrade --verbose + + (``cfgupgrade`` supports a number of parameters, run it with + ``--help`` for more information) + +#. Restart daemons on all nodes:: + + /etc/init.d/ganeti restart + +#. Re-distribute configuration (master node only):: + + gnt-cluster redist-conf + +#. Restart daemons again on all nodes:: + + /etc/init.d/ganeti restart + +#. Verify cluster (master node only):: + + gnt-cluster verify + + +2.0 releases +------------ + +2.0.3 to 2.0.4 +~~~~~~~~~~~~~~ + +No changes needed except restarting the daemon; but rollback to 2.0.3 might +require configuration editing. 
+ +If you're using Xen-HVM instances, please double-check the network +configuration (``nic_type`` parameter) as the defaults might have changed: +2.0.4 adds any missing configuration items and depending on the version of the +software the cluster has been installed with, some new keys might have been +added. + +2.0.1 to 2.0.2/2.0.3 +~~~~~~~~~~~~~~~~~~~~ + +Between 2.0.1 and 2.0.2 there have been some changes in the handling of block +devices, which can cause some issues. 2.0.3 was then released which adds two +new options/commands to fix this issue. + +If you use DRBD-type instances and see problems in instance start or +activate-disks with messages from DRBD about "lower device too small" or +similar, it is recommended to: + +#. Run ``gnt-instance activate-disks --ignore-size $instance`` for each +   of the affected instances +#. Then run ``gnt-cluster repair-disk-sizes`` which will check that +   instances have the correct disk sizes + +1.2 to 2.0 +---------- + +Prerequisites: + +- Ganeti 1.2.7 is currently installed +- All instances have been migrated from DRBD 0.7 to DRBD 8.x (i.e. no +  ``remote_raid1`` disk template) +- Upgrade to Ganeti 2.0.0~rc2 or later (~rc1 and earlier don't have the needed +  upgrade tool) + +In the below steps, replace :file:`/var/lib` with ``$libdir`` if Ganeti was not +installed with this prefix (e.g. :file:`/usr/local/var`). Same for +:file:`/usr/lib`. + +Execution (all steps are required in the order given): + +#. Make a backup of the current configuration, for safety:: + +    cp -a /var/lib/ganeti /var/lib/ganeti-1.2.backup + +#. Stop all instances:: + +    gnt-instance stop --all + +#. Make sure no DRBD devices are in use, the following command should show no +   active minors:: + +    gnt-cluster command grep cs: /proc/drbd \| grep -v cs:Unconf + +#. 
Stop the node daemons and rapi daemon on all nodes (note: should be logged + in not via the cluster name, but the master node name, as the command below + will remove the cluster ip from the master node):: + + gnt-cluster command /etc/init.d/ganeti stop + +#. Install the new software on all nodes, either from packaging (if available) + or from sources; the master daemon will not start but give error messages + about wrong configuration file, which is normal +#. Upgrade the configuration file:: + + /usr/lib/ganeti/tools/cfgupgrade12 -v --dry-run + /usr/lib/ganeti/tools/cfgupgrade12 -v + +#. Make sure ``ganeti-noded`` is running on all nodes (and start it if + not) +#. Start the master daemon:: + + ganeti-masterd + +#. Check that a simple node-list works:: + + gnt-node list + +#. Redistribute updated configuration to all nodes:: + + gnt-cluster redist-conf + gnt-cluster copyfile /var/lib/ganeti/known_hosts + +#. Optional: if needed, install RAPI-specific certificates under + :file:`/var/lib/ganeti/rapi.pem` and run:: + + gnt-cluster copyfile /var/lib/ganeti/rapi.pem + +#. Run a cluster verify, this should show no problems:: + + gnt-cluster verify + +#. Remove some obsolete files:: + + gnt-cluster command rm /var/lib/ganeti/ssconf_node_pass + gnt-cluster command rm /var/lib/ganeti/ssconf_hypervisor + +#. Update the xen pvm (if this was a pvm cluster) setting for 1.2 + compatibility:: + + gnt-cluster modify -H xen-pvm:root_path=/dev/sda + +#. Depending on your setup, you might also want to reset the initrd parameter:: + + gnt-cluster modify -H xen-pvm:initrd_path=/boot/initrd-2.6-xenU + +#. Reset the instance autobalance setting to default:: + + for i in $(gnt-instance list -o name --no-headers); do \ + gnt-instance modify -B auto_balance=default $i; \ + done + +#. Optional: start the RAPI daemon:: + + ganeti-rapi + +#. 
Restart instances:: + + gnt-instance start --force-multiple --all + +At this point, ``gnt-cluster verify`` should show no errors and the migration +is complete. + +1.2 releases +------------ + +1.2.4 to any other higher 1.2 version +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +No changes needed. Rollback will usually require manual edit of the +configuration file. + +1.2.3 to 1.2.4 +~~~~~~~~~~~~~~ + +No changes needed. Note that going back from 1.2.4 to 1.2.3 will require manual +edit of the configuration file (since we added some HVM-related new +attributes). + +1.2.2 to 1.2.3 +~~~~~~~~~~~~~~ + +No changes needed. Note that the drbd7-to-8 upgrade tool does a disk format +change for the DRBD metadata, so in theory this might be **risky**. It is +advised to have (good) backups before doing the upgrade. + +1.2.1 to 1.2.2 +~~~~~~~~~~~~~~ + +No changes needed. + +1.2.0 to 1.2.1 +~~~~~~~~~~~~~~ + +No changes needed. Only some bugfixes and new additions that don't affect +existing clusters. + +1.2.0 beta 3 to 1.2.0 +~~~~~~~~~~~~~~~~~~~~~ + +No changes needed. + +1.2.0 beta 2 to beta 3 +~~~~~~~~~~~~~~~~~~~~~~ + +No changes needed. A new version of the debian-etch-instance OS (0.3) has been +released, but upgrading it is not required. + +1.2.0 beta 1 to beta 2 +~~~~~~~~~~~~~~~~~~~~~~ + +Beta 2 switched the config file format to JSON. Steps to upgrade: + +#. Stop the daemons (``/etc/init.d/ganeti stop``) on all nodes +#. Disable the cron job (default is :file:`/etc/cron.d/ganeti`) +#. Install the new version +#. Make a backup copy of the config file +#. Upgrade the config file using the following command:: + + /usr/share/ganeti/cfgupgrade --verbose /var/lib/ganeti/config.data + +#. Start the daemons and run ``gnt-cluster info``, ``gnt-node list`` and + ``gnt-instance list`` to check if the upgrade process finished successfully + +The OS definition also needs to be upgraded. There is a new version of the +debian-etch-instance OS (0.2) that goes along with beta 2. 
diff --git a/doc/index.rst b/doc/index.rst index 0dcc5ee36a6cbbc661e40c6f535ccce5318808e4..86f24d7ffed1a20712ce2d8aec75c8be1950b938 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -11,6 +11,7 @@ Contents: install-quick.rst install.rst + upgrade.rst admin.rst walkthrough.rst security.rst diff --git a/test/cfgupgrade_unittest.py b/test/cfgupgrade_unittest.py index 06a6f2cd5de977ab7a0c8756ec3b79422870a77a..3bbddcf7e61a7cbd879caccae61d43fd4f91c943 100755 --- a/test/cfgupgrade_unittest.py +++ b/test/cfgupgrade_unittest.py @@ -32,13 +32,17 @@ from ganeti import constants from ganeti import utils from ganeti import errors from ganeti import serializer +from ganeti import netutils import testutils -def _RunUpgrade(path, dry_run, no_verify): +def _RunUpgrade(path, dry_run, no_verify, ignore_hostname=True): cmd = [sys.executable, "%s/tools/cfgupgrade" % testutils.GetSourceDir(), "--debug", "--force", "--path=%s" % path] + + if ignore_hostname: + cmd.append("--ignore-hostname") if dry_run: cmd.append("--dry-run") if no_verify: @@ -62,6 +66,7 @@ class TestCfgupgrade(unittest.TestCase): self.known_hosts_path = utils.PathJoin(self.tmpdir, "known_hosts") self.confd_hmac_path = utils.PathJoin(self.tmpdir, "hmac.key") self.cds_path = utils.PathJoin(self.tmpdir, "cluster-domain-secret") + self.ss_master_node_path = utils.PathJoin(self.tmpdir, "ssconf_master_node") def tearDown(self): shutil.rmtree(self.tmpdir) @@ -72,12 +77,41 @@ class TestCfgupgrade(unittest.TestCase): def _CreateValidConfigDir(self): utils.WriteFile(self.noded_cert_path, data="") utils.WriteFile(self.known_hosts_path, data="") + utils.WriteFile(self.ss_master_node_path, + data="node.has.another.name.example.net") def testNoConfigDir(self): self.assertFalse(utils.ListVisibleFiles(self.tmpdir)) self.assertRaises(Exception, _RunUpgrade, self.tmpdir, False, True) self.assertRaises(Exception, _RunUpgrade, self.tmpdir, True, True) + def testWrongHostname(self): + self._CreateValidConfigDir() + + 
utils.WriteFile(self.config_path, data=serializer.DumpJson({ + "version": constants.CONFIG_VERSION, + "cluster": {}, + })) + + hostname = netutils.GetHostname().name + assert hostname != utils.ReadOneLineFile(self.ss_master_node_path) + + self.assertRaises(Exception, _RunUpgrade, self.tmpdir, False, True, + ignore_hostname=False) + + def testCorrectHostname(self): + self._CreateValidConfigDir() + + utils.WriteFile(self.config_path, data=serializer.DumpJson({ + "version": constants.CONFIG_VERSION, + "cluster": {}, + })) + + utils.WriteFile(self.ss_master_node_path, + data="%s\n" % netutils.GetHostname().name) + + _RunUpgrade(self.tmpdir, False, True, ignore_hostname=False) + def testInconsistentConfig(self): self._CreateValidConfigDir() # There should be no "config_version" diff --git a/tools/cfgupgrade b/tools/cfgupgrade index 29f8ee8ea0da4f3522d12778ce1b5982d6877c01..9e4b02b6e9f3263559650c14ef2b97e5f18d08eb 100755 --- a/tools/cfgupgrade +++ b/tools/cfgupgrade @@ -39,6 +39,7 @@ from ganeti import utils from ganeti import cli from ganeti import bootstrap from ganeti import config +from ganeti import netutils options = None @@ -63,21 +64,37 @@ def SetupLogging(): elif options.verbose: stderr_handler.setLevel(logging.INFO) else: - stderr_handler.setLevel(logging.CRITICAL) + stderr_handler.setLevel(logging.WARNING) root_logger = logging.getLogger("") root_logger.setLevel(logging.NOTSET) root_logger.addHandler(stderr_handler) +def CheckHostname(path): + """Ensures hostname matches ssconf value. + + @param path: Path to ssconf file + + """ + ssconf_master_node = utils.ReadOneLineFile(path) + hostname = netutils.GetHostname().name + + if ssconf_master_node == hostname: + return True + + logging.warning("Warning: ssconf says master node is '%s', but this" + " machine's name is '%s'; this tool must be run on" + " the master node", ssconf_master_node, hostname) + return False + + def main(): """Main program. 
""" global options, args # pylint: disable-msg=W0603 - program = os.path.basename(sys.argv[0]) - # Option parsing parser = optparse.OptionParser(usage="%prog [--debug|--verbose] [--force]") parser.add_option('--dry-run', dest='dry_run', @@ -87,6 +104,9 @@ def main(): parser.add_option(cli.FORCE_OPT) parser.add_option(cli.DEBUG_OPT) parser.add_option(cli.VERBOSE_OPT) + parser.add_option("--ignore-hostname", dest="ignore_hostname", + action="store_true", default=False, + help="Don't abort if hostname doesn't match") parser.add_option('--path', help="Convert configuration in this" " directory instead of '%s'" % constants.DATA_DIR, default=constants.DATA_DIR, dest="data_dir") @@ -106,6 +126,7 @@ def main(): options.RAPI_USERS_FILE_PRE24 = options.data_dir + "/rapi_users" options.CONFD_HMAC_KEY = options.data_dir + "/hmac.key" options.CDS_FILE = options.data_dir + "/cluster-domain-secret" + options.SSCONF_MASTER_NODE = options.data_dir + "/ssconf_master_node" SetupLogging() @@ -113,9 +134,16 @@ def main(): if args: raise Error("No arguments expected") + # Check master name + if not (CheckHostname(options.SSCONF_MASTER_NODE) or options.ignore_hostname): + logging.error("Aborting due to hostname mismatch") + sys.exit(constants.EXIT_FAILURE) + if not options.force: - usertext = ("%s MUST be run on the master node. Is this the master" - " node and are ALL instances down?" % program) + usertext = ("Please make sure you have read the upgrade notes for" + " Ganeti %s (available in the UPGRADE file and included" + " in other documentation formats). Continue with upgrading" + " configuration?" % constants.RELEASE_VERSION) if not cli.AskUser(usertext): sys.exit(constants.EXIT_FAILURE)