Commit af0413bb authored by Guido Trotter
Browse files

Merge branch 'next' into branch-2.1

* next: (22 commits)
  Update NEWS and version for 2.0.1 release
  gnt-{instance,backup}(8) --nic is actually --net
  Fix a wrong function name in backend.DrbdAttachNet
  GNT-CLUSTER(8) fix search-tags example
  Enable stripped LVs
  Add a lvm stripecount configure parameter
  Add more constants for DRBD and change sync tests
  Wait for a while in failed resyncs
  Assemble DRBD using the known size
  Fix two issues with exports and snapshot errors
  Set the size on new DRBDs in replace secondary
  Change the bdev init signatures
  Release 2.0.0 final
  watcher: automatically restart noded/rapi
  watcher: handle full and drained queue cases
  rapi: rework error handling
  Fix backend.OSEnvironment be/hv parameters
  rapi: make tags query not use jobs
  Change failover instance when instance is stopped
  Export more instance information in hooks


Signed-off-by: Guido Trotter <>
parents d019f8bd c57f169e
......@@ -315,6 +315,7 @@ lib/ Makefile stamp-directories
echo "KVM_PATH = '$(KVM_PATH)'"; \
echo "SOCAT_PATH = '$(SOCAT_PATH)'"; \
} > $@
$(REPLACE_VARS_SED): Makefile stamp-directories
Version 2.0.1
- added -H/-B startup parameters to gnt-instance, which will allow
re-adding the start in single-user option (regression from 1.2)
- the watcher writes the instance status to a file, to allow
monitoring to report the instance status (from the master) based on
cached results of the watcher's queries; while this can get stale if
the watcher is being locked due to other work on the cluster, this
is still an improvement
- the watcher now also restarts the node daemon and the rapi daemon if
they died
- fixed the watcher to handle full and drained queue cases
- hooks export more instance data in the environment, which helps if
hook scripts need to take action based on the instance's properties
(no longer need to query back into ganeti)
- instance failovers when the instance is stopped do not check for
free RAM, so that failing over a stopped instance is possible in low
memory situations
- rapi uses queries for tags instead of jobs (for less job traffic),
and for cluster tags it won't talk to masterd at all but read them
from ssconf
- a couple of error handling fixes in RAPI
- drbd handling: improved the error handling of inconsistent disks
after resync to reduce the frequency of the "there are some degraded
disks for this instance" message
- fixed a bug in live migration when DRBD doesn't want to reconnect
(the error handling path called a wrong function name)
Version 2.0.0 final
- no changes from rc5
Version 2.0 release candidate 5
- fix a couple of bugs (validation, argument checks)
- fix gnt-cluster getmaster on non-master nodes (regression)
# Configure script for Ganeti
m4_define([gnt_version_major], [2])
m4_define([gnt_version_minor], [0])
m4_define([gnt_version_revision], [0])
m4_define([gnt_version_suffix], [~rc5])
m4_define([gnt_version_revision], [1])
m4_define([gnt_version_suffix], [])
gnt_version_major, gnt_version_minor,
......@@ -119,6 +119,16 @@ AC_ARG_WITH([socat-path],
AC_SUBST(SOCAT_PATH, $socat_path)
# --with-lvm-stripecount=...
[the number of stripes to use for LVM volumes]
[ (default is 3)]
AC_SUBST(LVM_STRIPECOUNT, $lvm_stripecount)
# Check common programs
......@@ -80,6 +80,20 @@ def StartMaster():
return not result.failed
def EnsureDaemon(daemon):
  """Check for and start daemon if not alive.

  The pid stored in the daemon's pidfile is looked up; if there is no
  pidfile or the recorded process is no longer alive, the daemon is
  executed again. A failed start is only logged, nothing is raised and
  nothing is returned.

  @type daemon: string
  @param daemon: the daemon name, used both to derive the pidfile name
      and as the command to run

  """
  pidfile = utils.DaemonPidFileName(daemon)
  pid = utils.ReadPidFile(pidfile)
  if pid == 0 or not utils.IsProcessAlive(pid): # no file or dead pid
    logging.debug("Daemon '%s' not alive, trying to restart", daemon)
    result = utils.RunCmd([daemon])
    # The command result is an object carrying failed/fail_reason/output;
    # check the failure flag explicitly (as the other RunCmd callers in
    # this file do) instead of relying on the object's truth value.
    if result.failed:
      logging.error("Can't start daemon '%s', failure %s, output: %s",
                    daemon, result.fail_reason, result.output)
class WatcherState(object):
"""Interface to a state file recording restart attempts.
......@@ -255,10 +269,17 @@ def GetClusterData():
all_results = cli.PollJob(job_id, cl=client, feedback_fn=logging.debug)
logging.debug("Got data from cluster, writing instance status file")
result = all_results[0]
smap = {}
instances = {}
# write the upfile
up_data = "".join(["%s %s\n" % (fields[0], fields[1]) for fields in result])
utils.WriteFile(file_name=constants.INSTANCE_UPFILE, data=up_data)
for fields in result:
(name, status, autostart, snodes) = fields
......@@ -291,6 +312,9 @@ class Watcher(object):
master = client.QueryConfigValues(["master_node"])[0]
if master != utils.HostInfo().name:
raise NotMasterError("This is not the master node")
# first archive old jobs
# and only then submit new ones
self.instances, self.bootids, self.smap = GetClusterData()
self.started_instances = set()
self.opts = opts
......@@ -300,12 +324,12 @@ class Watcher(object):
notepad = self.notepad
def ArchiveJobs(self, age):
def ArchiveJobs(age):
"""Archive old jobs.
......@@ -452,8 +476,12 @@ def main():
utils.SetupLogging(constants.LOG_WATCHER, debug=options.debug,
update_file = True
update_file = False
# on master or not, try to start the node daemon (use _PID but is
# the same as daemon name)
notepad = WatcherState()
......@@ -461,24 +489,30 @@ def main():
except errors.OpPrereqError:
# this is, from cli.GetClient, a not-master case
logging.debug("Not on master, exiting")
update_file = True
except luxi.NoMasterError, err:
logging.warning("Master seems to be down (%s), trying to restart",
if not StartMaster():
logging.critical("Can't start the master, exiting")
update_file = False
# else retry the connection
client = cli.GetClient()
# we are on master now (use _PID but is the same as daemon name)
watcher = Watcher(options, notepad)
except errors.ConfigurationError:
# Just exit if there's no configuration
update_file = True
update_file = True
if update_file:
......@@ -492,6 +526,10 @@ def main():
except errors.ResolverError, err:
logging.error("Cannot resolve hostname '%s', exiting.", err.args[0])
except errors.JobQueueFull:
logging.error("Job queue is full, can't query cluster state")
except errors.JobQueueDrainError:
logging.error("Job queue is drained, can't maintain cluster state")
except Exception, err:
logging.error(str(err), exc_info=True)
......@@ -1103,7 +1103,7 @@ def BlockdevCreate(disk, size, owner, on_primary, info):
device = bdev.Create(disk.dev_type, disk.physical_id, clist, size)
device = bdev.Create(disk.dev_type, disk.physical_id, clist, disk.size)
except errors.BlockDeviceError, err:
_Fail("Can't create block device: %s", err)
......@@ -1205,7 +1205,7 @@ def _RecursiveAssembleBD(disk, owner, as_primary):
if as_primary or disk.AssembleOnSecondary():
r_dev = bdev.Assemble(disk.dev_type, disk.physical_id, children)
r_dev = bdev.Assemble(disk.dev_type, disk.physical_id, children, disk.size)
result = r_dev
if as_primary or disk.OpenOnSecondary():
......@@ -1361,7 +1361,7 @@ def _RecursiveFindBD(disk):
for chdisk in disk.children:
return bdev.FindDevice(disk.dev_type, disk.physical_id, children)
return bdev.FindDevice(disk.dev_type, disk.physical_id, children, disk.size)
def BlockdevFind(disk):
......@@ -1683,6 +1683,10 @@ def OSEnvironment(instance, debug=0):
result['NIC_%d_FRONTEND_TYPE' % idx] = \
for source, kind in [(instance.beparams, "BE"), (instance.hvparams, "HV")]:
for key, value in source.items():
result["INSTANCE_%s_%s" % (kind, key)] = str(value)
return result
def BlockdevGrow(disk, amount):
......@@ -2330,7 +2334,7 @@ def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster):
# standalone, even though this should not happen with the
# new staged way of changing disk configs
except errors.BlockDeviceError, err:
_Fail("Can't change network configuration: %s", err)
if all_connected:
......@@ -108,13 +108,14 @@ class BlockDev(object):
after assembly we'll have our correct major/minor.
def __init__(self, unique_id, children):
def __init__(self, unique_id, children, size):
self._children = children
self.dev_path = None
self.unique_id = unique_id
self.major = None
self.minor = None
self.attached = False
self.size = size
def Assemble(self):
"""Assemble the device from its components.
......@@ -286,13 +287,13 @@ class LogicalVolume(BlockDev):
"""Logical Volume block device.
def __init__(self, unique_id, children):
def __init__(self, unique_id, children, size):
"""Attaches to a LV device.
The unique_id is a tuple (vg_name, lv_name)
super(LogicalVolume, self).__init__(unique_id, children)
super(LogicalVolume, self).__init__(unique_id, children, size)
if not isinstance(unique_id, (tuple, list)) or len(unique_id) != 2:
raise ValueError("Invalid configuration data %s" % str(unique_id))
self._vg_name, self._lv_name = unique_id
......@@ -318,18 +319,27 @@ class LogicalVolume(BlockDev):
pvlist = [ pv[1] for pv in pvs_info ]
free_size = sum([ pv[0] for pv in pvs_info ])
current_pvs = len(pvlist)
stripes = min(current_pvs, constants.LVM_STRIPECOUNT)
# The size constraint should have been checked from the master before
# calling the create function.
if free_size < size:
_ThrowError("Not enough free space: required %s,"
" available %s", size, free_size)
result = utils.RunCmd(["lvcreate", "-L%dm" % size, "-n%s" % lv_name,
vg_name] + pvlist)
cmd = ["lvcreate", "-L%dm" % size, "-n%s" % lv_name]
# If the free space is not well distributed, we won't be able to
# create an optimally-striped volume; in that case, we want to try
# with N, N-1, ..., 2, and finally 1 (non-striped) number of
# stripes
for stripes_arg in range(stripes, 0, -1):
result = utils.RunCmd(cmd + ["-i%d" % stripes_arg] + [vg_name] + pvlist)
if not result.failed:
if result.failed:
_ThrowError("LV create failed (%s): %s",
result.fail_reason, result.output)
return LogicalVolume(unique_id, children)
return LogicalVolume(unique_id, children, size)
def GetPVInfo(vg_name):
......@@ -500,7 +510,7 @@ class LogicalVolume(BlockDev):
snap_name = self._lv_name + ".snap"
# remove existing snapshot if found
snap = LogicalVolume((self._vg_name, snap_name), None)
snap = LogicalVolume((self._vg_name, snap_name), None, size)
pvs_info = self.GetPVInfo(self._vg_name)
......@@ -568,10 +578,51 @@ class DRBD8Status(object):
SYNC_RE = re.compile(r"^.*\ssync'ed:\s*([0-9.]+)%.*"
"\sfinish: ([0-9]+):([0-9]+):([0-9]+)\s.*$")
CS_UNCONFIGURED = "Unconfigured"
CS_STANDALONE = "StandAlone"
CS_CONNECTED = "Connected"
CS_SYNCSOURCE = "SyncSource"
CS_SYNCTARGET = "SyncTarget"
CSET_SYNC = frozenset([
DS_DISKLESS = "Diskless"
DS_ATTACHING = "Attaching" # transient state
DS_FAILED = "Failed" # transient state, next: diskless
DS_NEGOTIATING = "Negotiating" # transient state
DS_INCONSISTENT = "Inconsistent" # while syncing or after creation
DS_OUTDATED = "Outdated"
DS_DUNKNOWN = "DUnknown" # shown for peer disk when not connected
DS_CONSISTENT = "Consistent"
DS_UPTODATE = "UpToDate" # normal state
RO_PRIMARY = "Primary"
RO_SECONDARY = "Secondary"
RO_UNKNOWN = "Unknown"
def __init__(self, procline):
u = self.UNCONF_RE.match(procline)
if u:
self.cstatus = "Unconfigured"
self.cstatus = self.CS_UNCONFIGURED
self.lrole = self.rrole = self.ldisk = self.rdisk = None
m = self.LINE_RE.match(procline)
......@@ -585,21 +636,21 @@ class DRBD8Status(object):
# end reading of data from the LINE_RE or UNCONF_RE
self.is_standalone = self.cstatus == "StandAlone"
self.is_wfconn = self.cstatus == "WFConnection"
self.is_connected = self.cstatus == "Connected"
self.is_primary = self.lrole == "Primary"
self.is_secondary = self.lrole == "Secondary"
self.peer_primary = self.rrole == "Primary"
self.peer_secondary = self.rrole == "Secondary"
self.is_standalone = self.cstatus == self.CS_STANDALONE
self.is_wfconn = self.cstatus == self.CS_WFCONNECTION
self.is_connected = self.cstatus == self.CS_CONNECTED
self.is_primary = self.lrole == self.RO_PRIMARY
self.is_secondary = self.lrole == self.RO_SECONDARY
self.peer_primary = self.rrole == self.RO_PRIMARY
self.peer_secondary = self.rrole == self.RO_SECONDARY
self.both_primary = self.is_primary and self.peer_primary
self.both_secondary = self.is_secondary and self.peer_secondary
self.is_diskless = self.ldisk == "Diskless"
self.is_disk_uptodate = self.ldisk == "UpToDate"
self.is_diskless = self.ldisk == self.DS_DISKLESS
self.is_disk_uptodate = self.ldisk == self.DS_UPTODATE
self.is_in_resync = self.cstatus in ("SyncSource", "SyncTarget")
self.is_in_use = self.cstatus != "Unconfigured"
self.is_in_resync = self.cstatus in self.CSET_SYNC
self.is_in_use = self.cstatus != self.CS_UNCONFIGURED
m = self.SYNC_RE.match(procline)
if m:
......@@ -609,13 +660,16 @@ class DRBD8Status(object):
seconds = int(
self.est_time = hours * 3600 + minutes * 60 + seconds
self.sync_percent = None
# we have (in this if branch) no percent information, but if
# we're resyncing we need to 'fake' a sync percent information,
# as this is how cmdlib determines if it makes sense to wait for
# resyncing or not
if self.is_in_resync:
self.sync_percent = 0
self.sync_percent = None
self.est_time = None
self.is_sync_target = self.peer_sync_source = self.cstatus == "SyncTarget"
self.peer_sync_target = self.is_sync_source = self.cstatus == "SyncSource"
self.is_resync = self.is_sync_target or self.is_sync_source
class BaseDRBD(BlockDev):
"""Base DRBD class.
......@@ -805,10 +859,10 @@ class DRBD8(BaseDRBD):
# timeout constants
def __init__(self, unique_id, children):
def __init__(self, unique_id, children, size):
if children and children.count(None) > 0:
children = []
super(DRBD8, self).__init__(unique_id, children)
super(DRBD8, self).__init__(unique_id, children, size)
self.major = self._DRBD_MAJOR
version = self._GetVersion()
if version['k_major'] != 8 :
......@@ -1031,12 +1085,15 @@ class DRBD8(BaseDRBD):
return retval
def _AssembleLocal(cls, minor, backend, meta):
def _AssembleLocal(cls, minor, backend, meta, size):
"""Configure the local part of a DRBD device.
args = ["drbdsetup", cls._DevPath(minor), "disk",
backend, meta, "0", "-e", "detach", "--create-device"]
backend, meta, "0",
"-d", "%sm" % size,
"-e", "detach",
result = utils.RunCmd(args)
if result.failed:
_ThrowError("drbd%d: can't attach local disk: %s", minor, result.output)
......@@ -1113,7 +1170,7 @@ class DRBD8(BaseDRBD):
self._InitMeta(self._FindUnusedMinor(), meta.dev_path)
self._AssembleLocal(self.minor, backend.dev_path, meta.dev_path)
self._AssembleLocal(self.minor, backend.dev_path, meta.dev_path, self.size)
self._children = devices
def RemoveChildren(self, devices):
......@@ -1396,7 +1453,7 @@ class DRBD8(BaseDRBD):
if match_r and "local_dev" not in info:
# no local disk, but network attached and it matches
self._AssembleLocal(minor, self._children[0].dev_path,
self._children[1].dev_path, self.size)
if self._MatchesNet(self._GetDevInfo(self._GetShowData(minor))):
......@@ -1447,7 +1504,7 @@ class DRBD8(BaseDRBD):
minor = self._aminor
if self._children and self._children[0] and self._children[1]:
self._AssembleLocal(minor, self._children[0].dev_path,
self._children[1].dev_path, self.size)
if self._lhost and self._lport and self._rhost and self._rport:
(self._lhost, self._lport, self._rhost, self._rport),
......@@ -1535,7 +1592,7 @@ class DRBD8(BaseDRBD):
aminor, meta)
cls._InitMeta(aminor, meta.dev_path)
return cls(unique_id, children)
return cls(unique_id, children, size)
def Grow(self, amount):
"""Resize the DRBD device and its backing storage.
......@@ -1559,13 +1616,13 @@ class FileStorage(BlockDev):
The unique_id for the file device is a (file_driver, file_path) tuple.
def __init__(self, unique_id, children):
def __init__(self, unique_id, children, size):
"""Initializes a file device backend.
if children:
raise errors.BlockDeviceError("Invalid setup for file device")
super(FileStorage, self).__init__(unique_id, children)
super(FileStorage, self).__init__(unique_id, children, size)
if not isinstance(unique_id, (tuple, list)) or len(unique_id) != 2:
raise ValueError("Invalid configuration data %s" % str(unique_id))
self.driver = unique_id[0]
......@@ -1653,7 +1710,7 @@ class FileStorage(BlockDev):
except IOError, err:
_ThrowError("Error in file creation: %", str(err))
return FileStorage(unique_id, children)
return FileStorage(unique_id, children, size)
......@@ -1663,7 +1720,7 @@ DEV_MAP = {
def FindDevice(dev_type, unique_id, children):
def FindDevice(dev_type, unique_id, children, size):
"""Search for an existing, assembled device.
This will succeed only if the device exists and is assembled, but it
......@@ -1672,13 +1729,13 @@ def FindDevice(dev_type, unique_id, children):
if dev_type not in DEV_MAP:
raise errors.ProgrammerError("Invalid block device type '%s'" % dev_type)
device = DEV_MAP[dev_type](unique_id, children)
device = DEV_MAP[dev_type](unique_id, children, size)
if not device.attached:
return None
return device
def Assemble(dev_type, unique_id, children):
def Assemble(dev_type, unique_id, children, size):
"""Try to attach or assemble an existing device.
This will attach to or assemble the device, as needed, to bring it
......@@ -1687,7 +1744,7 @@ def Assemble(dev_type, unique_id, children):
if dev_type not in DEV_MAP:
raise errors.ProgrammerError("Invalid block device type '%s'" % dev_type)
device = DEV_MAP[dev_type](unique_id, children)
device = DEV_MAP[dev_type](unique_id, children, size)
return device
......@@ -453,7 +453,8 @@ def _CheckNodeNotDrained(lu, node):
def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
memory, vcpus, nics, disk_template, disks):
memory, vcpus, nics, disk_template, disks,
bep, hvp, hypervisor):
"""Builds instance related env variables for hooks
This builds the hook environment from individual variables.
......@@ -479,6 +480,12 @@ def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
@param disk_template: the disk template of the instance
@type disks: list
@param disks: the list of (size, mode) pairs
@type bep: dict
@param bep: the backend parameters for the instance
@type hvp: dict
@param hvp: the hypervisor parameters for the instance
@type hypervisor: string
@param hypervisor: the hypervisor for the instance
@rtype: dict
@return: the hook environment for this instance
......@@ -497,6 +504,7 @@ def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
"INSTANCE_DISK_TEMPLATE": disk_template,
if nics:
......@@ -525,6 +533,10 @@ def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
env["INSTANCE_DISK_COUNT"] = disk_count
for source, kind in [(bep, "BE"), (hvp, "HV")]:
for key, value in source.items():
env["INSTANCE_%s_%s" % (kind, key)] = value
return env
def _PreBuildNICHooksList(lu, nics):
......@@ -564,7 +576,9 @@ def _BuildInstanceHookEnvByObject(lu, instance, override=None):
@return: the hook environment dictionary
bep = lu.cfg.GetClusterInfo().FillBE(instance)
cluster = lu.cfg.GetClusterInfo()
bep = cluster.FillBE(instance)
hvp = cluster.FillHV(instance)
args = {
'primary_node': instance.primary_node,
......@@ -576,6 +590,9 @@ def _BuildInstanceHookEnvByObject(lu, instance, override=None):
'nics': _PreBuildNICHooksList(lu, instance.nics),
'disk_template': instance.disk_template,
'disks': [(disk.size, disk.mode) for disk in instance.disks],
'bep': bep,
'hvp': hvp,
'hypervisor': instance.hypervisor,
if override:
......@@ -1681,6 +1698,7 @@ def _WaitForSync(lu, instance, oneshot=False, unlock=False):
lu.cfg.SetDiskID(dev, node)
retries = 0
degr_retries = 10 # in seconds, as we sleep 1 second each time
while True:
max_time = 0
done = True
......@@ -1714,6 +1732,16 @@ def _WaitForSync(lu, instance, oneshot=False, unlock=False):
rem_time = "no time estimate"
lu.proc.LogInfo("- device %s: %5.2f%% done, %s" %
(instance.disks[i].iv_name, perc_done, rem_time))
# if we're done but degraded, let's do a few small retries, to
# make sure we see a stable and not transient situation; therefore
# we force restart of the loop
if (done or oneshot) and cumul_degraded and degr_retries > 0:"Degraded disks found, %d retries left", degr_retries)
degr_retries -= 1
if done or oneshot:
......@@ -3614,10 +3642,15 @@ class LUFailoverInstance(LogicalUnit):
target_node = secondary_nodes[0]
_CheckNodeOnline(self, target_node)
_CheckNodeNotDrained(self, target_node)
# check memory requirements on the secondary node
_CheckNodeFreeMemory(self, target_node, "failing over instance %s" %, bep[constants.BE_MEMORY],