Commit 2f7140ba authored by Guido Trotter

Merge branch 'next' into branch-2.1



* next:
  Create a new --no-voting option for masterfailover
  ganeti-masterd: allow non-interactive --no-voting
  Fix pylint warnings
  Add custom pylintrc
  bootstrap: Don't leak file descriptor when generating SSL certificate
  Fix problem with EAGAIN on socket connection in clients
  Fix some typos
  Increase maximum accepted size for a DRBD meta dev
  Cleanup config data when draining nodes
  Fix node readd issues
  backend.DemoteFromMC: don't fail for missing files
  Allow GetMasterCandidateStats to ignore some nodes
  Fix error message for extra files on non MC nodes

Conflicts:

	lib/backend.py
          Most of the conflicts were in the new rpcs vs. the pylint fixes,
          and usually the new rpcs fixed the pylint problems as well
	lib/bootstrap.py
          Small conflict between masterfailover --no-voting and new rpcs
	lib/cmdlib.py
          Net parameters conflicted here, kept that version
	lib/objects.py
          Same problem fixed in two different ways. 'next' version kept
Signed-off-by: Guido Trotter <ultrotter@google.com>
parents cc208ed0 fba15943
......@@ -160,6 +160,7 @@ EXTRA_DIST = \
$(MAINTAINERCLEANFILES) \
NEWS \
DEVNOTES \
pylintrc \
autotools/docbook-wrapper \
devel/upload.in \
$(docdot) \
......
......@@ -195,6 +195,7 @@ class ClientRqHandler(SocketServer.BaseRequestHandler):
def send_message(self, msg):
#print "sending", msg
# TODO: sendall is not guaranteed to send everything
self.request.sendall(msg + self.EOM)
......@@ -403,6 +404,10 @@ def ParseOptions():
help="Do not check that the nodes agree on this node"
" being the master and start the daemon unconditionally",
default=False, action="store_true")
parser.add_option("--yes-do-it", dest="yes_do_it",
help="Override interactive check for --no-voting",
default=False, action="store_true")
options, args = parser.parse_args()
return options, args
......@@ -479,7 +484,7 @@ def main():
ssconf.CheckMaster(options.debug)
# we believe we are the master, let's ask the other nodes...
if options.no_voting:
if options.no_voting and not options.yes_do_it:
sys.stdout.write("The 'no voting' option has been selected.\n")
sys.stdout.write("This is dangerous, please confirm by"
" typing uppercase 'yes': ")
......@@ -488,7 +493,7 @@ def main():
if confirmation != "YES":
print "Aborting."
return
else:
elif not options.no_voting:
if not CheckAgreement():
return
......
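
Taken together, the ganeti-masterd hunks above leave three start-up modes. A minimal sketch of the resulting decision flow (standalone Python 2; check_agreement is a hypothetical stand-in for CheckAgreement(), and this is a sketch, not the daemon's actual code):

    import sys

    def may_start(options, check_agreement):
        """Decide whether the master daemon may start."""
        if options.no_voting and not options.yes_do_it:
            # interactive path: dangerous, ask for explicit confirmation
            sys.stdout.write("The 'no voting' option has been selected.\n")
            sys.stdout.write("This is dangerous, please confirm by"
                             " typing uppercase 'yes': ")
            sys.stdout.flush()
            if sys.stdin.readline().strip() != "YES":
                return False  # operator aborted
        elif not options.no_voting:
            # default path: a majority of nodes must agree we are master
            if not check_agreement():
                return False
        # remaining path: --no-voting --yes-do-it skips both checks, which
        # is what a non-interactive caller such as masterfailover needs
        return True
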
......@@ -104,7 +104,7 @@ The scripts will be run as follows:
be left
All informations about the cluster is passed using environment
All information about the cluster is passed using environment
variables. Different operations will have sligthly different
environments, but most of the variables are common.
......
......@@ -233,7 +233,7 @@ The response message is much more simple than the input one. It is
also a dict having three keys:
success
a boolean value denoting if the allocation was successfull or not
a boolean value denoting if the allocation was successful or not
info
a string with information from the scripts; if the allocation fails,
......
......@@ -174,7 +174,7 @@ def GetMasterInfo():
master_node = cfg.GetMasterNode()
except errors.ConfigurationError, err:
_Fail("Cluster configuration incomplete: %s", err, exc=True)
return master_netdev, master_ip, master_node
return (master_netdev, master_ip, master_node)
def StartMaster(start_daemons):
......@@ -337,7 +337,7 @@ def LeaveCluster():
def GetNodeInfo(vgname, hypervisor_type):
"""Gives back a hash with different informations about the node.
"""Gives back a hash with different information about the node.
@type vgname: C{string}
@param vgname: the name of the volume group to ask for disk space information
......@@ -609,7 +609,7 @@ def GetInstanceList(hypervisor_list):
def GetInstanceInfo(instance, hname):
"""Gives back the informations about an instance as a dictionary.
"""Gives back the information about an instance as a dictionary.
@type instance: string
@param instance: the instance name
......@@ -764,7 +764,7 @@ def RunRenameInstance(instance, old_name):
def _GetVGInfo(vg_name):
"""Get informations about the volume group.
"""Get information about the volume group.
@type vg_name: str
@param vg_name: the volume group which we query
......@@ -931,7 +931,7 @@ def InstanceShutdown(instance):
# test every 10secs for 2min
time.sleep(1)
for dummy in range(11):
for _ in range(11):
if instance.name not in GetInstanceList([hv_name]):
break
time.sleep(10)
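
The renamed loop variable aside, the surrounding code is a fixed-interval poll: after the initial one-second sleep it checks up to 11 times, ten seconds apart, whether the instance has vanished from the hypervisor's list (the "every 10secs for 2min" of the comment). The same pattern as a generic sketch, with a hypothetical predicate rather than Ganeti's API:

    import time

    def poll_until(predicate, attempts=11, interval=10.0):
        """Poll predicate() up to `attempts` times, `interval` secs apart.

        Returns True as soon as predicate() holds, False on timeout.
        The loop counter is unused, hence `_` (the pylint fix above).
        """
        for _ in range(attempts):
            if predicate():
                return True
            time.sleep(interval)
        return False

    # usage sketch: poll_until(lambda: name not in list_instances())
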
......@@ -1242,7 +1242,7 @@ def BlockdevAssemble(disk, owner, as_primary):
def BlockdevShutdown(disk):
"""Shut down a block device.
First, if the device is assembled (Attach() is successfull), then
First, if the device is assembled (Attach() is successful), then
the device is shutdown. Then the children of the device are
shutdown.
......@@ -1348,7 +1348,7 @@ def BlockdevGetmirrorstatus(disks):
def _RecursiveFindBD(disk):
"""Check if a device is activated.
If so, return informations about the real device.
If so, return information about the real device.
@type disk: L{objects.Disk}
@param disk: the disk object we need to find
......@@ -1368,7 +1368,7 @@ def _RecursiveFindBD(disk):
def BlockdevFind(disk):
"""Check if a device is activated.
If it is, return informations about the real device.
If it is, return information about the real device.
@type disk: L{objects.Disk}
@param disk: the disk to find
......@@ -2068,7 +2068,7 @@ def RemoveFileStorageDir(file_storage_dir):
@param file_storage_dir: the directory we should cleanup
@rtype: tuple (success,)
@return: tuple of one element, C{success}, denoting
whether the operation was successfull
whether the operation was successful
"""
file_storage_dir = _TransformFileStorageDir(file_storage_dir)
......@@ -2254,7 +2254,8 @@ def DemoteFromMC():
if utils.IsProcessAlive(utils.ReadPidFile(pid_file)):
_Fail("The master daemon is running, will not demote")
try:
utils.CreateBackup(constants.CLUSTER_CONF_FILE)
if os.path.isfile(constants.CLUSTER_CONF_FILE):
utils.CreateBackup(constants.CLUSTER_CONF_FILE)
except EnvironmentError, err:
if err.errno != errno.ENOENT:
_Fail("Error while backing up cluster file: %s", err, exc=True)
......
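
The change above makes demotion safe on nodes that never held a cluster config: having nothing to back up is not an error. Isolated, and with create_backup as a hypothetical stand-in for utils.CreateBackup, the belt-and-braces pattern (existence check plus ENOENT tolerance, since the file can vanish between the two calls) looks like:

    import errno
    import os

    def backup_if_present(path, create_backup):
        """Back up path if it exists; tolerate it disappearing meanwhile."""
        try:
            if os.path.isfile(path):
                create_backup(path)
        except EnvironmentError, err:
            if err.errno != errno.ENOENT:
                raise
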
......@@ -161,7 +161,7 @@ class BlockDev(object):
"""Remove this device.
This makes sense only for some of the device types: LV and file
storeage. Also note that if the device can't attach, the removal
storage. Also note that if the device can't attach, the removal
can't be completed.
"""
......@@ -444,7 +444,7 @@ class LogicalVolume(BlockDev):
def Assemble(self):
"""Assemble the device.
We alway run `lvchange -ay` on the LV to ensure it's active before
We always run `lvchange -ay` on the LV to ensure it's active before
use, as there were cases when xenvg was not active after boot
(also possibly after disk issues).
......@@ -828,7 +828,13 @@ class BaseDRBD(BlockDev):
bytes = sectors * 512
if bytes < 128 * 1024 * 1024: # less than 128MiB
_ThrowError("Meta device too small (%.2fMib)", (bytes / 1024 / 1024))
if bytes > (128 + 32) * 1024 * 1024: # account for an extra (big) PE on LVM
# the maximum *valid* size of the meta device when living on top
# of LVM is hard to compute: it depends on the number of stripes
# and the PE size; e.g. a 2-stripe, 64MB PE will result in a 128MB
# (normal size), but an eight-stripe 128MB PE will result in a 1GB
# size meta device; as such, we restrict it to 1GB (a little bit
# too generous, but making assumptions about PE size is hard)
if bytes > 1024 * 1024 * 1024:
_ThrowError("Meta device too big (%.2fMiB)", (bytes / 1024 / 1024))
def Rename(self, new_id):
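
The new bound can be sanity-checked against the arithmetic in the comment: LVM allocates in whole physical extents per stripe, so a nominally 128MiB meta device can legitimately come out much larger. A small self-contained sketch (plain arithmetic mirroring, not reusing, the check above):

    def check_meta_size(sectors):
        """Validate a DRBD meta device size, given in 512-byte sectors."""
        num_bytes = sectors * 512
        if num_bytes < 128 * 1024 * 1024:     # minimum: 128 MiB
            raise ValueError("meta device too small")
        if num_bytes > 1024 * 1024 * 1024:    # new maximum: 1 GiB
            raise ValueError("meta device too big")

    # the two examples from the comment, converted to sectors:
    # 2 stripes x 64 MiB PEs  ->  128 MiB, the normal size
    # 8 stripes x 128 MiB PEs -> 1024 MiB, exactly the new ceiling
    for stripes, pe_mib in [(2, 64), (8, 128)]:
        check_meta_size(stripes * pe_mib * 1024 * 1024 // 512)
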
......@@ -1252,14 +1258,14 @@ class DRBD8(BaseDRBD):
If sync_percent is None, it means all is ok
If estimated_time is None, it means we can't esimate
If estimated_time is None, it means we can't estimate
the time needed, otherwise it's the time left in seconds.
We set the is_degraded parameter to True on two conditions:
network not connected or local disk missing.
We compute the ldisk parameter based on wheter we have a local
We compute the ldisk parameter based on whether we have a local
disk or not.
@rtype: tuple
......@@ -1329,14 +1335,14 @@ class DRBD8(BaseDRBD):
ever_disconnected = _IgnoreError(self._ShutdownNet, self.minor)
timeout_limit = time.time() + self._NET_RECONFIG_TIMEOUT
sleep_time = 0.100 # we start the retry time at 100 miliseconds
sleep_time = 0.100 # we start the retry time at 100 milliseconds
while time.time() < timeout_limit:
status = self.GetProcStatus()
if status.is_standalone:
break
# retry the disconnect, it seems possible that due to a
# well-time disconnect on the peer, my disconnect command might
# be ingored and forgotten
# be ignored and forgotten
ever_disconnected = _IgnoreError(self._ShutdownNet, self.minor) or \
ever_disconnected
time.sleep(sleep_time)
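
The loop this hunk touches is a retry-until-deadline shape: reissue the disconnect until /proc/drbd reports the device standalone or the timeout expires, remembering whether any attempt succeeded. Condensed into a generic helper (hypothetical action/done callables, not the DRBD8 methods, and with a fixed sleep where the real code may back off):

    import time

    def retry_until(action, done, timeout=60.0, sleep_time=0.100):
        """Re-run action() until done() holds or timeout seconds pass.

        Returns (ever_succeeded, reached_goal); action() is expected to
        swallow its own errors, as _IgnoreError does in the real code.
        """
        ever_ok = action()
        deadline = time.time() + timeout
        while time.time() < deadline:
            if done():
                return ever_ok, True
            # a well-timed disconnect on the peer may cause ours to be
            # ignored and forgotten, so the retry is deliberate
            ever_ok = action() or ever_ok
            time.sleep(sleep_time)
        return ever_ok, done()
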
......@@ -1641,7 +1647,7 @@ class FileStorage(BlockDev):
def Shutdown(self):
"""Shutdown the device.
This is a no-op for the file type, as we don't deacivate
This is a no-op for the file type, as we don't deactivate
the file on shutdown.
"""
......
......@@ -79,24 +79,27 @@ def _GenerateSelfSignedSslCert(file_name, validity=(365 * 5)):
"""
(fd, tmp_file_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
try:
# Set permissions before writing key
os.chmod(tmp_file_name, 0600)
result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
"-days", str(validity), "-nodes", "-x509",
"-keyout", tmp_file_name, "-out", tmp_file_name,
"-batch"])
if result.failed:
raise errors.OpExecError("Could not generate SSL certificate, command"
" %s had exitcode %s and error message %s" %
(result.cmd, result.exit_code, result.output))
# Make read-only
os.chmod(tmp_file_name, 0400)
os.rename(tmp_file_name, file_name)
try:
# Set permissions before writing key
os.chmod(tmp_file_name, 0600)
result = utils.RunCmd(["openssl", "req", "-new", "-newkey", "rsa:1024",
"-days", str(validity), "-nodes", "-x509",
"-keyout", tmp_file_name, "-out", tmp_file_name,
"-batch"])
if result.failed:
raise errors.OpExecError("Could not generate SSL certificate, command"
" %s had exitcode %s and error message %s" %
(result.cmd, result.exit_code, result.output))
# Make read-only
os.chmod(tmp_file_name, 0400)
os.rename(tmp_file_name, file_name)
finally:
utils.RemoveFile(tmp_file_name)
finally:
utils.RemoveFile(tmp_file_name)
os.close(fd)
def _InitGanetiServerSetup():
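
The reindented block fixes two distinct leaks: the new outer finally closes the file descriptor returned by mkstemp (previously never closed), and the inner finally removes the temporary file even when openssl or chmod fails. Reduced to its skeleton, with fill as a hypothetical content-writing callback and a local stand-in for utils.RemoveFile, the pattern is:

    import os
    import tempfile

    def _remove_quietly(path):
        """Best-effort unlink; a no-op once the rename has happened."""
        try:
            os.unlink(path)
        except OSError:
            pass

    def write_via_tempfile(file_name, fill):
        """Create file_name atomically via a same-directory temp file."""
        (fd, tmp_name) = tempfile.mkstemp(dir=os.path.dirname(file_name))
        try:
            try:
                os.chmod(tmp_name, 0600)        # restrict before writing
                fill(tmp_name)
                os.chmod(tmp_name, 0400)        # then make read-only
                os.rename(tmp_name, file_name)  # atomic on the same fs
            finally:
                _remove_quietly(tmp_name)
        finally:
            os.close(fd)                        # the leak being fixed
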
......@@ -384,13 +387,17 @@ def SetupNodeDaemon(cluster_name, node, ssh_key_check):
(node, result.fail_reason, result.output))
def MasterFailover():
def MasterFailover(no_voting=False):
"""Failover the master node.
This checks that we are not already the master, and will cause the
current master to cease being master, and the non-master to become
new master.
@type no_voting: boolean
@param no_voting: force the operation without remote nodes agreement
(dangerous)
"""
sstore = ssconf.SimpleStore()
......@@ -412,18 +419,20 @@ def MasterFailover():
" master candidates is:\n"
"%s" % ('\n'.join(mc_no_master)))
vote_list = GatherMasterVotes(node_list)
if vote_list:
voted_master = vote_list[0][0]
if voted_master is None:
raise errors.OpPrereqError("Cluster is inconsistent, most nodes did not"
" respond.")
elif voted_master != old_master:
raise errors.OpPrereqError("I have wrong configuration, I believe the"
" master is %s but the other nodes voted for"
" %s. Please resync the configuration of"
" this node." % (old_master, voted_master))
if not no_voting:
vote_list = GatherMasterVotes(node_list)
if vote_list:
voted_master = vote_list[0][0]
if voted_master is None:
raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
" not respond.")
elif voted_master != old_master:
raise errors.OpPrereqError("I have a wrong configuration, I believe"
" the master is %s but the other nodes"
" voted %s. Please resync the configuration"
" of this node." %
(old_master, voted_master))
# end checks
rcode = 0
......@@ -448,7 +457,8 @@ def MasterFailover():
# cluster info
cfg.Update(cluster_info)
result = rpc.RpcRunner.call_node_start_master(new_master, True)
# 2.0.X: Don't start the master if no_voting is true
result = rpc.RpcRunner.call_node_start_master(new_master, not no_voting)
msg = result.RemoteFailMsg()
if msg:
logging.error("Could not start the master role on the new master"
......@@ -490,7 +500,7 @@ def GatherMasterVotes(node_list):
@type node_list: list
@param node_list: the list of nodes to query for master info; the current
node wil be removed if it is in the list
node will be removed if it is in the list
@rtype: list
@return: list of (node, votes)
......
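
Per the GatherMasterVotes docstring, the function returns (node, votes) pairs sorted by vote count, and MasterFailover only inspects the winner. A simplified tally illustrating the two failure checks (a sketch over a plain answers dict, not the RPC-driven original):

    def tally_votes(answers):
        """answers maps node -> master it claims, or None if unreachable."""
        votes = {}
        for node, master in answers.items():
            votes[master] = votes.get(master, 0) + 1
        return sorted(votes.items(), key=lambda kv: kv[1], reverse=True)

    # vote_list[0][0] is None       -> most nodes did not respond
    # vote_list[0][0] != old_master -> this node's config is out of sync
    vote_list = tally_votes({"n1": "m1", "n2": "m1", "n3": None})
    assert vote_list[0] == ("m1", 2)
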
......@@ -341,7 +341,7 @@ keyval_option = KeyValOption
def _ParseArgs(argv, commands, aliases):
"""Parser for the command line arguments.
This function parses the arguements and returns the function which
This function parses the arguments and returns the function which
must be executed together with its (modified) arguments.
@param argv: the command line
......@@ -459,10 +459,10 @@ def AskUser(text, choices=None):
choices = [('y', True, 'Perform the operation'),
('n', False, 'Do not perform the operation')]
if not choices or not isinstance(choices, list):
raise errors.ProgrammerError("Invalid choiches argument to AskUser")
raise errors.ProgrammerError("Invalid choices argument to AskUser")
for entry in choices:
if not isinstance(entry, tuple) or len(entry) < 3 or entry[0] == '?':
raise errors.ProgrammerError("Invalid choiches element to AskUser")
raise errors.ProgrammerError("Invalid choices element to AskUser")
answer = choices[-1][1]
new_text = []
......@@ -778,7 +778,7 @@ def GenericMain(commands, override=None, aliases=None):
except (errors.GenericError, luxi.ProtocolError,
JobSubmittedException), err:
result, err_msg = FormatError(err)
logging.exception("Error durring command processing")
logging.exception("Error during command processing")
ToStderr(err_msg)
return result
......
......@@ -68,7 +68,7 @@ class LogicalUnit(object):
def __init__(self, processor, op, context, rpc):
"""Constructor for LogicalUnit.
This needs to be overriden in derived classes in order to check op
This needs to be overridden in derived classes in order to check op
validity.
"""
......@@ -118,7 +118,7 @@ class LogicalUnit(object):
CheckPrereq, doing these separate is better because:
- ExpandNames is left as as purely a lock-related function
- CheckPrereq is run after we have aquired locks (and possible
- CheckPrereq is run after we have acquired locks (and possible
waited for them)
The function is allowed to change the self.op attribute so that
......@@ -456,7 +456,7 @@ def _CheckNodeNotDrained(lu, node):
def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
memory, vcpus, nics, disk_template, disks,
bep, hvp, hypervisor):
bep, hvp, hypervisor_name):
"""Builds instance related env variables for hooks
This builds the hook environment from individual variables.
......@@ -479,15 +479,15 @@ def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
@param nics: list of tuples (ip, mac, mode, link) representing
the NICs the instance has
@type disk_template: string
@param disk_template: the distk template of the instance
@param disk_template: the disk template of the instance
@type disks: list
@param disks: the list of (size, mode) pairs
@type bep: dict
@param bep: the backend parameters for the instance
@type hvp: dict
@param hvp: the hypervisor parameters for the instance
@type hypervisor: string
@param hypervisor: the hypervisor for the instance
@type hypervisor_name: string
@param hypervisor_name: the hypervisor for the instance
@rtype: dict
@return: the hook environment for this instance
......@@ -506,7 +506,7 @@ def _BuildInstanceHookEnv(name, primary_node, secondary_nodes, os_type, status,
"INSTANCE_MEMORY": memory,
"INSTANCE_VCPUS": vcpus,
"INSTANCE_DISK_TEMPLATE": disk_template,
"INSTANCE_HYPERVISOR": hypervisor,
"INSTANCE_HYPERVISOR": hypervisor_name,
}
if nics:
......@@ -654,7 +654,7 @@ class LUDestroyCluster(NoHooksLU):
This checks whether the cluster is empty.
Any errors are signalled by raising errors.OpPrereqError.
Any errors are signaled by raising errors.OpPrereqError.
"""
master = self.cfg.GetMasterNode()
......@@ -705,7 +705,7 @@ class LUVerifyCluster(LogicalUnit):
Test list:
- compares ganeti version
- checks vg existance and size > 20G
- checks vg existence and size > 20G
- checks config file checksum
- checks ssh to other nodes
......@@ -787,8 +787,8 @@ class LUVerifyCluster(LogicalUnit):
else:
# not candidate and this is not a must-have file
bad = True
feedback_fn(" - ERROR: non master-candidate has old/wrong file"
" '%s'" % file_name)
feedback_fn(" - ERROR: file '%s' should not exist on non master"
" candidates (and the file is outdated)" % file_name)
else:
# all good, except non-master/non-must have combination
if not node_is_mc and not must_have_file:
......@@ -944,7 +944,7 @@ class LUVerifyCluster(LogicalUnit):
if bep[constants.BE_AUTO_BALANCE]:
needed_mem += bep[constants.BE_MEMORY]
if nodeinfo['mfree'] < needed_mem:
feedback_fn(" - ERROR: not enough memory on node %s to accomodate"
feedback_fn(" - ERROR: not enough memory on node %s to accommodate"
" failovers should node %s fail" % (node, prinode))
bad = True
return bad
......@@ -963,7 +963,7 @@ class LUVerifyCluster(LogicalUnit):
def BuildHooksEnv(self):
"""Build hooks env.
Cluster-Verify hooks just rone in the post phase and their failure makes
Cluster-Verify hooks just ran in the post phase and their failure makes
the output be logged in the verify output and the verification to fail.
"""
......@@ -1231,7 +1231,7 @@ class LUVerifyCluster(LogicalUnit):
return not bad
def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
"""Analize the post-hooks' result
"""Analyze the post-hooks' result
This method analyses the hook result, handles it, and sends some
nicely-formatted feedback back to the user.
......@@ -1337,7 +1337,6 @@ class LUVerifyDisks(NoHooksLU):
node_lvs = self.rpc.call_lv_list(nodes, vg_name)
to_act = set()
for node in nodes:
# node_volume
node_res = node_lvs[node]
......@@ -1453,7 +1452,7 @@ def _RecursiveCheckIfLVMBased(disk):
@type disk: L{objects.Disk}
@param disk: the disk to check
@rtype: booleean
@rtype: boolean
@return: boolean indicating whether a LD_LV dev_type was found or not
"""
......@@ -1909,7 +1908,7 @@ class LURemoveNode(LogicalUnit):
- it does not have primary or secondary instances
- it's not the master
Any errors are signalled by raising errors.OpPrereqError.
Any errors are signaled by raising errors.OpPrereqError.
"""
node = self.cfg.GetNodeInfo(self.cfg.ExpandNodeName(self.op.node_name))
......@@ -2239,7 +2238,7 @@ class LUAddNode(LogicalUnit):
- it is resolvable
- its parameters (single/dual homed) matches the cluster
Any errors are signalled by raising errors.OpPrereqError.
Any errors are signaled by raising errors.OpPrereqError.
"""
node_name = self.op.node_name
......@@ -2293,7 +2292,7 @@ class LUAddNode(LogicalUnit):
raise errors.OpPrereqError("The master has a private ip but the"
" new node doesn't have one")
# checks reachablity
# checks reachability
if not utils.TcpPing(primary_ip, constants.DEFAULT_NODED_PORT):
raise errors.OpPrereqError("Node not reachable by ping")
......@@ -2305,14 +2304,24 @@ class LUAddNode(LogicalUnit):
" based ping to noded port")
cp_size = self.cfg.GetClusterInfo().candidate_pool_size
mc_now, _ = self.cfg.GetMasterCandidateStats()
master_candidate = mc_now < cp_size
if self.op.readd:
exceptions = [node]
else:
exceptions = []
mc_now, mc_max = self.cfg.GetMasterCandidateStats(exceptions)
# the new node will increase mc_max with one, so:
mc_max = min(mc_max + 1, cp_size)
self.master_candidate = mc_now < mc_max
self.new_node = objects.Node(name=node,
primary_ip=primary_ip,
secondary_ip=secondary_ip,
master_candidate=master_candidate,
offline=False, drained=False)
if self.op.readd:
self.new_node = self.cfg.GetNodeInfo(node)
assert self.new_node is not None, "Can't retrieve locked node %s" % node
else:
self.new_node = objects.Node(name=node,
primary_ip=primary_ip,
secondary_ip=secondary_ip,
master_candidate=self.master_candidate,
offline=False, drained=False)
def Exec(self, feedback_fn):
"""Adds the new node to the cluster.
......@@ -2321,6 +2330,20 @@ class LUAddNode(LogicalUnit):
new_node = self.new_node
node = new_node.name
# for re-adds, reset the offline/drained/master-candidate flags;
# we need to reset here, otherwise offline would prevent RPC calls
# later in the procedure; this also means that if the re-add
# fails, we are left with a non-offlined, broken node
if self.op.readd:
new_node.drained = new_node.offline = False
self.LogInfo("Readding a node, the offline/drained flags were reset")
# if we demote the node, we do cleanup later in the procedure
new_node.master_candidate = self.master_candidate
# notify the user about any possible mc promotion
if new_node.master_candidate:
self.LogInfo("Node will be a master candidate")
# check connectivity
result = self.rpc.call_version([node])[node]
result.Raise("Can't get version information from node %s" % node)
......@@ -2386,6 +2409,15 @@ class LUAddNode(LogicalUnit):
if self.op.readd:
_RedistributeAncillaryFiles(self)
self.context.ReaddNode(new_node)
# make sure we redistribute the config
self.cfg.Update(new_node)
# and make sure the new node will not have old files around
if not new_node.master_candidate:
result = self.rpc.call_node_demote_from_mc(new_node.name)
msg = result.RemoteFailMsg()
if msg:
self.LogWarning("Node failed to demote itself from master"
" candidate status: %s" % msg)
else:
_RedistributeAncillaryFiles(self, additional_nodes=[node])
self.context.AddNode(new_node)
......@@ -2505,6 +2537,10 @@ class LUSetNodeParams(LogicalUnit):
node.master_candidate = False
changed_mc = True
result.append(("master_candidate", "auto-demotion due to drain"))
rrc = self.rpc.call_node_demote_from_mc(node.name)
msg = rrc.RemoteFailMsg()
if msg:
self.LogWarning("Node failed to demote itself: %s" % msg)
if node.offline:
node.offline = False
result.append(("offline", "clear offline status due to drain"))
......@@ -2593,8 +2629,8 @@ class LUQueryClusterInfo(NoHooksLU):
"master": cluster.master_node,
"default_hypervisor": cluster.default_hypervisor,
"enabled_hypervisors": cluster.enabled_hypervisors,
"hvparams": dict([(hvname, cluster.hvparams[hvname])
for hvname in cluster.enabled_hypervisors]),
"hvparams": dict([(hypervisor_name, cluster.hvparams[hypervisor])
for hypervisor_name in cluster.enabled_hypervisors]),
"beparams": cluster.beparams,
"nicparams": cluster.nicparams,
"candidate_pool_size": cluster.candidate_pool_size,
......@@ -2757,7 +2793,7 @@ def _StartInstanceDisks(lu, instance, force):
"""Start the disks of an instance.
"""
disks_ok, dummy = _AssembleInstanceDisks(lu, instance,
disks_ok, _ = _AssembleInstanceDisks(lu, instance,
ignore_secondaries=force)
if not disks_ok:
_ShutdownInstanceDisks(lu, instance)
......@@ -2943,7 +2979,7 @@ class LUStartupInstance(LogicalUnit):
_CheckNodeOnline(self, instance.primary_node)
bep = self.cfg.GetClusterInfo().FillBE(instance)
# check bridges existance
# check bridges existence
_CheckInstanceBridgesExist(self, instance)
remote_info = self.rpc.call_instance_info(instance.primary_node,
......@@ -3022,7 +3058,7 @@ class LURebootInstance(LogicalUnit):
_CheckNodeOnline(self, instance.primary_node)
# check bridges existance
# check bridges existence
_CheckInstanceBridgesExist(self, instance)
def Exec(self, feedback_fn):
......@@ -3762,7 +3798,7 @@ class LUFailoverInstance(LogicalUnit):
logging.info("Starting instance %s on node %s",
instance.name, target_node)