Commit 59b76636 authored by Iustin Pop's avatar Iustin Pop
Browse files

Merge commit 'origin/next'

* commit 'origin/next': (74 commits)
  Fix gnt-node modify online help
  Fix gnt-job info entry in gnt-job(8)
  locking: Don't swallow exceptions
  Add check for duplicate MACs in instance add
  scripts/gnt-node: fix a help string
  Optimise multi-job submit
  Extend gnt-debug with more debugging options
  Return cluster tags from LUQueryClusterInfo
  Add script to clean archived jobs after 21 days
  rapi: export more static node information
  Pass the correct signal to handlers
  cli: Use ToStdout/ToStderr instead of print
  Fix small typo in gnt-node
  Simplify handling of boolean args in rapi
  Fix checks in LUSetNodeParms for the master node
  Improve the example startup script
  Fix insserv dependencies
  Fix a typo in InitCluster
  Ignore results from drained nodes in iallocator
  Ship the ethers hook
  ...
parents a6ac80db b3fd544f
......@@ -6,6 +6,7 @@
# global ignores
*.py[co]
*.swp
# /
/Makefile
......@@ -26,6 +27,9 @@
/*.tar.bz2
/*.tar.gz
# daemons
/daemons/ganeti-cleaner
# devel
/devel/clean-cluster
/devel/upload
......
......@@ -26,6 +26,7 @@ DIRS = \
devel \
doc \
doc/examples \
doc/examples/hooks \
lib \
lib/http \
lib/hypervisor \
......@@ -44,6 +45,7 @@ MAINTAINERCLEANFILES = \
CLEANFILES = \
autotools/replace_vars.sed \
daemons/ganeti-cleaner \
devel/upload \
doc/rapi-resources.gen \
doc/examples/bash_completion \
......@@ -138,6 +140,9 @@ dist_sbin_SCRIPTS = \
scripts/gnt-node \
scripts/gnt-os
nodist_sbin_SCRIPTS = \
daemons/ganeti-cleaner
dist_tools_SCRIPTS = \
tools/burnin \
tools/cfgshell \
......@@ -148,7 +153,9 @@ EXTRA_DIST = \
$(MAINTAINERCLEANFILES) \
NEWS \
DEVNOTES \
pylintrc \
autotools/docbook-wrapper \
daemons/ganeti-cleaner.in \
devel/upload.in \
$(docrst) \
$(docdot) \
......@@ -157,6 +164,7 @@ EXTRA_DIST = \
doc/examples/ganeti.initd.in \
doc/examples/ganeti.cron.in \
doc/examples/dumb-allocator \
doc/examples/hooks/ethers \
doc/locking.txt \
test/testutils.py \
test/mocks.py \
......@@ -241,6 +249,11 @@ doc/examples/%: doc/examples/%.in stamp-directories \
$(REPLACE_VARS_SED)
sed -f $(REPLACE_VARS_SED) < $< > $@
daemons/ganeti-cleaner: daemons/ganeti-cleaner.in stamp-directories \
$(REPLACE_VARS_SED)
sed -f $(REPLACE_VARS_SED) < $< > $@
chmod +x $@
doc/%.html: doc/%.rst
@test -n "$(RST2HTML)" || { echo 'rst2html' not found during configure; exit 1; }
$(RST2HTML) $< $@
......
#!/bin/bash
#
# Copyright (C) 2009 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
# Purge archived job files that are older than REMOVE_AFTER days.
# Abort on the first failing command.
set -e

# @LOCALSTATEDIR@ is substituted when the script is generated from its
# ".in" template.
DATA_DIR="@LOCALSTATEDIR@/lib/ganeti"
QUEUE_ARCHIVE_DIR="$DATA_DIR/queue/archive"

# Define how many days archived jobs should be left alone
REMOVE_AFTER=21

# Exit (successfully) if machine is not part of a cluster; this must be
# "exit", not "echo", otherwise the script keeps running and prints a
# stray "0" (which cron would then mail out on every run).
[[ -e "$DATA_DIR/ssconf_master_node" ]] || exit 0

# Exit if queue archive directory doesn't exist
[[ -d "$QUEUE_ARCHIVE_DIR" ]] || exit 0

# Remove old jobs; -mindepth 2 only considers files located inside the
# sub-directories of the archive, and the NUL-separated hand-over to
# xargs keeps unusual file names intact.
find "$QUEUE_ARCHIVE_DIR" -mindepth 2 -type f -mtime +$REMOVE_AFTER -print0 | \
xargs -r0 rm -f

exit 0
......@@ -28,13 +28,10 @@ inheritance from parent classes requires it.
import os
import errno
import sys
import SocketServer
import time
import collections
import Queue
import random
import signal
import logging
......@@ -195,6 +192,7 @@ class ClientRqHandler(SocketServer.BaseRequestHandler):
def send_message(self, msg):
#print "sending", msg
# TODO: sendall is not guaranteed to send everything
self.request.sendall(msg + self.EOM)
......@@ -213,6 +211,13 @@ class ClientOps:
ops = [opcodes.OpCode.LoadOpCode(state) for state in args]
return queue.SubmitJob(ops)
if method == luxi.REQ_SUBMIT_MANY_JOBS:
logging.info("Received multiple jobs")
jobs = []
for ops in args:
jobs.append([opcodes.OpCode.LoadOpCode(state) for state in ops])
return queue.SubmitManyJobs(jobs)
elif method == luxi.REQ_CANCEL_JOB:
job_id = args
logging.info("Received job cancel request for %s", job_id)
......@@ -465,7 +470,6 @@ def main():
"""Main function"""
options, args = ParseOptions()
utils.debug = options.debug
utils.no_fork = True
if options.fork:
......@@ -516,7 +520,7 @@ def main():
rpc.Init()
try:
# activate ip
master_node = ssconf.SimpleConfigReader().GetMasterNode()
master_node = ssconf.SimpleStore().GetMasterNode()
if not rpc.RpcRunner.call_node_start_master(master_node, False, False):
logging.error("Can't activate master IP address")
......
......@@ -26,9 +26,7 @@
import os
import sys
import traceback
import SocketServer
import errno
import logging
import signal
......@@ -754,7 +752,6 @@ def main():
global queue_lock
options, args = ParseOptions()
utils.debug = options.debug
if options.fork:
utils.CloseFDs()
......@@ -762,13 +759,9 @@ def main():
for fname in (constants.SSL_CERT_FILE,):
if not os.path.isfile(fname):
print "config %s not there, will not run." % fname
sys.exit(5)
sys.exit(constants.EXIT_NOTCLUSTER)
try:
port = utils.GetNodeDaemonPort()
except errors.ConfigurationError, err:
print "Cluster configuration incomplete: '%s'" % str(err)
sys.exit(5)
port = utils.GetNodeDaemonPort()
dirs = [(val, constants.RUN_DIRS_MODE) for val in constants.SUB_RUN_DIRS]
dirs.append((constants.LOG_OS_DIR, 0750))
......
......@@ -206,17 +206,20 @@ def ParseOptions():
parser.add_option("-f", "--foreground", dest="fork",
help="Don't detach from the current terminal",
default=True, action="store_false")
parser.add_option("-b", "--bind", dest="bind_address",
help="Bind address",
default="", metavar="ADDRESS")
options, args = parser.parse_args()
if len(args) != 0:
print >> sys.stderr, "Usage: %s [-d] [-p port]" % sys.argv[0]
sys.exit(1)
sys.exit(constants.EXIT_FAILURE)
if options.ssl and not (options.ssl_cert and options.ssl_key):
print >> sys.stderr, ("For secure mode please provide "
"--ssl-key and --ssl-cert arguments")
sys.exit(1)
"--ssl-key and --ssl-cert arguments")
sys.exit(constants.EXIT_FAILURE)
return options, args
......@@ -237,7 +240,7 @@ def main():
ssl_cert_path=options.ssl_cert)
except Exception, err:
sys.stderr.write("Can't load the SSL certificate/key: %s\n" % (err,))
sys.exit(1)
sys.exit(constants.EXIT_FAILURE)
else:
ssl_params = None
......@@ -252,7 +255,7 @@ def main():
utils.WritePidFile(constants.RAPI_PID)
try:
mainloop = daemon.Mainloop()
server = RemoteApiHttpServer(mainloop, "", options.port,
server = RemoteApiHttpServer(mainloop, options.bind_address, options.port,
ssl_params=ssl_params, ssl_verify_peer=False,
request_executor_class=
JsonErrorRequestExecutor)
......
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin
# restart failed instances
# Restart failed instances (every 5 minutes)
*/5 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher
# Clean job archive (at 01:45 AM)
45 1 * * * root [ -x @SBINDIR@/ganeti-cleaner ] && @SBINDIR@/ganeti-cleaner
......@@ -3,12 +3,12 @@
# based on skeleton from Debian GNU/Linux
### BEGIN INIT INFO
# Provides: ganeti
# Required-Start: $syslog $remote_fs xend
# Required-Stop: $syslog $remote_fs xend
# Required-Start: $syslog $remote_fs
# Required-Stop: $syslog $remote_fs
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Ganeti Xen Cluster Manager
# Description: Ganeti Xen Cluster Manager
# Short-Description: Ganeti Cluster Manager
# Description: Ganeti Cluster Manager
### END INIT INFO
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin
......@@ -18,24 +18,18 @@ GANETIRUNDIR="@LOCALSTATEDIR@/run/ganeti"
GANETI_DEFAULTS_FILE="@SYSCONFDIR@/default/ganeti"
NODED_NAME="ganeti-noded"
NODED="@PREFIX@/sbin/${NODED_NAME}"
NODED_PID="${GANETIRUNDIR}/${NODED_NAME}.pid"
NODED="ganeti-noded"
NODED_ARGS=""
MASTERD_NAME="ganeti-masterd"
MASTERD="@PREFIX@/sbin/${MASTERD_NAME}"
MASTERD_PID="${GANETIRUNDIR}/${MASTERD_NAME}.pid"
MASTERD="ganeti-masterd"
MASTERD_ARGS=""
RAPI_NAME="ganeti-rapi"
RAPI="@PREFIX@/sbin/${RAPI_NAME}"
RAPI_PID="${GANETIRUNDIR}/${RAPI_NAME}.pid"
RAPI="ganeti-rapi"
RAPI_ARGS=""
SCRIPTNAME="@SYSCONFDIR@/init.d/ganeti"
test -f $NODED || exit 0
test -f "@PREFIX@/sbin/$NODED" || exit 0
. /lib/lsb/init-functions
......@@ -71,47 +65,66 @@ check_exitcode() {
}
start_action() {
# called as start_action daemon pidfile
# called as start_action daemon-name
local daemon="$1"; shift
local pidfile="$1"; shift
log_action_begin_msg "$daemon"
start-stop-daemon --start --quiet --exec "$daemon" --pidfile "$pidfile" \
start-stop-daemon --start --quiet \
--pidfile "${GANETIRUNDIR}/${daemon}.pid" \
--startas "@PREFIX@/sbin/$daemon" \
--oknodo \
-- "$@"
check_exitcode $?
}
stop_action() {
# called as stop_action daemon pidfile
log_action_begin_msg "$1"
# called as stop_action daemon-name
local daemon="$1"
log_action_begin_msg "$daemon"
start-stop-daemon --stop --quiet --oknodo \
--retry 30 --pidfile "$2"
--retry 30 --pidfile "${GANETIRUNDIR}/${daemon}.pid"
check_exitcode $?
}
maybe_do() {
requested="$1"; shift
action="$1"; shift
target="$1"
if [ -z "$requested" -o "$requested" = "$target" ]; then
$action "$@"
fi
}
if [ -n "$2" -a \
"$2" != "$NODED" -a \
"$2" != "$MASTERD" -a \
"$2" != "$RAPI" ]; then
log_failure_msg "Unknown daemon '$2' requested"
exit 1
fi
case "$1" in
start)
log_daemon_msg "Starting $DESC" "$NAME"
log_daemon_msg "Starting $DESC" "$2"
check_config
start_action $NODED $NODED_PID $NODED_ARGS
start_action $MASTERD $MASTERD_PID $MASTERD_ARGS
start_action $RAPI $RAPI_PID $RAPI_ARGS
maybe_do "$2" start_action $NODED $NODED_ARGS
maybe_do "$2" start_action $MASTERD $MASTERD_ARGS
maybe_do "$2" start_action $RAPI $RAPI_ARGS
;;
stop)
log_daemon_msg "Stopping $DESC" "$NAME"
stop_action $RAPI $RAPI_PID
stop_action $MASTERD $MASTERD_PID
stop_action $NODED $NODED_PID
log_daemon_msg "Stopping $DESC" "$2"
maybe_do "$2" stop_action $RAPI
maybe_do "$2" stop_action $MASTERD
maybe_do "$2" stop_action $NODED
;;
restart|force-reload)
log_daemon_msg "Reloading $DESC"
stop_action $RAPI $RAPI_PID
stop_action $MASTERD $MASTERD_PID
stop_action $NODED $NODED_PID
log_daemon_msg "Reloading $DESC" "$2"
maybe_do "$2" stop_action $RAPI
maybe_do "$2" stop_action $MASTERD
maybe_do "$2" stop_action $NODED
check_config
start_action $NODED $NODED_PID
start_action $MASTERD $MASTERD_PID
start_action $RAPI $RAPI_PID
maybe_do "$2" start_action $NODED $NODED_ARGS
maybe_do "$2" start_action $MASTERD $MASTERD_ARGS
maybe_do "$2" start_action $RAPI $RAPI_ARGS
;;
*)
log_success_msg "Usage: $SCRIPTNAME {start|stop|force-reload|restart}"
......
#!/bin/bash
# Copyright (C) 2009 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
# This is an example ganeti hook that writes the instance mac addresses in the
# node's /etc/ethers file. It will pick up the first nic connected to the
# TARGET_BRIDGE bridge, and write it down with the syntax "MAC INSTANCE_NAME".
# The hook will also send a HUP signal to the daemon whose PID is in
# DAEMON_PID_FILE, so that it can load the new /etc/ethers file and use it.
# This has been tested in conjunction with dnsmasq's dhcp implementation.
# It will also remove any other occurrences for the same instance in the
# aforementioned file. This hook supports the "instance-add", "instance-modify",
# "instance-remove", and "instance-mirror-replace" ganeti post hook paths. To
# install it add a symlink from those hooks' directories to where this file is
# installed (with a mode which permits execution).
# TARGET_BRIDGE: We'll only add the first nic which gets connected to this
# bridge to /etc/ethers.
TARGET_BRIDGE="br0"
DAEMON_PID_FILE="/var/run/dnsmasq.pid"
# In order to handle concurrent execution of this hook, we use the $LOCKFILE.
# LOCKFILE_CREATE and LOCKFILE_REMOVE are the path names for the lockfile-progs
# programs which we use as helpers.
LOCKFILE="/var/lock/ganeti_ethers"
LOCKFILE_CREATE="/usr/bin/lockfile-create"
LOCKFILE_REMOVE="/usr/bin/lockfile-remove"
hooks_path=$GANETI_HOOKS_PATH
[ -n "$hooks_path" ] || exit 1
instance=$GANETI_INSTANCE_NAME
[ -n "$instance" ] || exit 1
nic_count=$GANETI_INSTANCE_NIC_COUNT
# Take the hook's lock, terminating the script with status 1 when the lock
# cannot be created; once held, the lock is released automatically when the
# script exits.
acquire_lockfile() {
  if ! $LOCKFILE_CREATE $LOCKFILE; then
    exit 1
  fi
  trap "$LOCKFILE_REMOVE $LOCKFILE" EXIT
}
# Move the freshly generated /etc/ethers.new over /etc/ethers and, when the
# pid file named by DAEMON_PID_FILE exists, send HUP to the process recorded
# in it.
update_ethers_from_new() {
  chmod 644 /etc/ethers.new
  mv /etc/ethers.new /etc/ethers
  # Use a full "if" rather than "[ ... ] && kill": with the short-circuit
  # form a missing pid file makes this function (and, on the
  # instance-remove path, the whole hook) return a non-zero status even
  # though the ethers file was updated successfully.
  if [ -f "$DAEMON_PID_FILE" ]; then
    kill -HUP $(< "$DAEMON_PID_FILE")
  fi
}
# Add/refresh path: for the hook paths listed below, (re)write the instance's
# entry in /etc/ethers using the first NIC attached to TARGET_BRIDGE.
if [ "$hooks_path" = "instance-add" -o \
"$hooks_path" = "instance-modify" -o \
"$hooks_path" = "instance-mirror-replace" ]
then
# Walk the NIC indexes 0 .. nic_count-1 (seq prints nothing for a count of 0)
for i in $(seq 0 $((nic_count - 1)) ); do
# Indirect expansion: read the variable whose name was just built
bridge_var="GANETI_INSTANCE_NIC${i}_BRIDGE"
bridge=${!bridge_var}
if [ -n "$bridge" -a "$bridge" = "$TARGET_BRIDGE" ]; then
mac_var="GANETI_INSTANCE_NIC${i}_MAC"
mac=${!mac_var}
acquire_lockfile
# Copy every line that is not an existing entry for this instance, then
# append the new "MAC<TAB>INSTANCE" line at the end of the new file.
# NOTE(review): "\>" is presumably meant as a word-boundary anchor, which is
# not available in every awk implementation - confirm the awk in use.
cat /etc/ethers | awk -- "! /^([[:xdigit:]:]*)[[:blank:]]+$instance\>/;
END {print \"$mac\t$instance\"}" > /etc/ethers.new
update_ethers_from_new
# Only the first matching NIC is recorded
break
fi
done
fi
# Removal path: on instance removal, or on a modify that leaves the instance
# with zero NICs, drop the instance's entries from /etc/ethers.
if [ "$hooks_path" = "instance-remove" -o \
\( "$hooks_path" = "instance-modify" -a "$nic_count" -eq 0 \) ]; then
acquire_lockfile
# Keep only the lines which are not entries for this instance
cat /etc/ethers | awk -- "! /^([[:xdigit:]:]*)[[:blank:]]+$instance\>/" \
> /etc/ethers.new
update_ethers_from_new
fi
......@@ -104,7 +104,7 @@ The scripts will be run as follows:
be left
All informations about the cluster is passed using environment
All information about the cluster is passed using environment
variables. Different operations will have slightly different
environments, but most of the variables are common.
......
......@@ -176,7 +176,7 @@ instances
nodes
dictionary with the data for the nodes in the cluster, indexed by
the node name; the dict contains:
the node name; the dict contains [*]_ :
total_disk
the total disk size of this node (mebibytes)
......@@ -225,15 +225,19 @@ nodes
or ``offline`` flags set. More details about these node status
flags are available in the manpage *ganeti(7)*.
.. [*] Note that no run-time data is present for offline or drained nodes;
this means the tags total_memory, reserved_memory, free_memory, total_disk,
free_disk, total_cpus, i_pri_memory and i_pri_up_memory will be absent.
Respone message
~~~~~~~~~~~~~~~
Response message
~~~~~~~~~~~~~~~~
The response message is much simpler than the input one. It is
also a dict having three keys:
success
a boolean value denoting if the allocation was successfull or not
a boolean value denoting if the allocation was successful or not
info
a string with information from the scripts; if the allocation fails,
......
......@@ -19,7 +19,12 @@
# 02110-1301, USA.
"""Functions used by the node daemon"""
"""Functions used by the node daemon
@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
the L{UploadFile} function
"""
import os
......@@ -115,6 +120,23 @@ def _CleanDirectory(path, exclude=None):
utils.RemoveFile(full_name)
def _BuildUploadFileList():
  """Compute the set of files which are accepted for upload.

  The computation lives in its own function so that the set is assembled
  a single time, when the module is imported.

  @rtype: frozenset
  @return: the paths of the files allowed to be uploaded

  """
  allowed = (
    constants.CLUSTER_CONF_FILE,
    constants.ETC_HOSTS,
    constants.SSH_KNOWN_HOSTS_FILE,
    constants.VNC_PASSWORD_FILE,
    )
  return frozenset(allowed)
_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()
def JobQueuePurge():
"""Removes job queue files and archived jobs.
......@@ -141,7 +163,7 @@ def GetMasterInfo():
master_netdev = cfg.GetMasterNetdev()
master_ip = cfg.GetMasterIP()
master_node = cfg.GetMasterNode()
except errors.ConfigurationError, err:
except errors.ConfigurationError:
logging.exception("Cluster configuration incomplete")
return (None, None, None)
return (master_netdev, master_ip, master_node)
......@@ -320,7 +342,7 @@ def LeaveCluster():
def GetNodeInfo(vgname, hypervisor_type):
"""Gives back a hash with different informations about the node.
"""Gives back a hash with different information about the node.
@type vgname: C{string}
@param vgname: the name of the volume group to ask for disk space information
......@@ -585,7 +607,7 @@ def GetInstanceList(hypervisor_list):
try:
names = hypervisor.GetHypervisor(hname).ListInstances()
results.extend(names)
except errors.HypervisorError, err:
except errors.HypervisorError:
logging.exception("Error enumerating instances for hypevisor %s", hname)
raise
......@@ -593,7 +615,7 @@ def GetInstanceList(hypervisor_list):
def GetInstanceInfo(instance, hname):
"""Gives back the informations about an instance as a dictionary.
"""Gives back the information about an instance as a dictionary.
@type instance: string
@param instance: the instance name
......@@ -758,7 +780,7 @@ def RunRenameInstance(instance, old_name):
def _GetVGInfo(vg_name):
"""Get informations about the volume group.
"""Get information about the volume group.
@type vg_name: str
@param vg_name: the volume group which we query
......@@ -930,7 +952,7 @@ def InstanceShutdown(instance):
# test every 10secs for 2min
time.sleep(1)
for dummy in range(11):
for _ in range(11):
if instance.name not in GetInstanceList([hv_name]):
break
time.sleep(10)
......@@ -1044,7 +1066,7 @@ def AcceptInstance(instance, info, target):
msg = "Failed to accept instance"
logging.exception(msg)
return (False, '%s: %s' % (msg, err))
return (True, "Accept successfull")
return (True, "Accept successful")
def FinalizeMigration(instance, info, success):
......@@ -1092,7 +1114,7 @@ def MigrateInstance(instance, target, live):
msg = "Failed to migrate instance"
logging.exception(msg)
return (False, "%s: %s" % (msg, err))
return (True, "Migration successfull")
return (True, "Migration successful")
def BlockdevCreate(disk, size, owner, on_primary, info):
......@@ -1285,7 +1307,7 @@ def BlockdevAssemble(disk, owner, as_primary):
def BlockdevShutdown(disk):
"""Shut down a block device.
First, if the device is assembled (Attach() is successfull), then
First, if the device is assembled (Attach() is successful), then
the device is shutdown. Then the children of the device are
shutdown.
......@@ -1403,7 +1425,7 @@ def BlockdevGetmirrorstatus(disks):
def _RecursiveFindBD(disk):
"""Check if a device is activated.
If so, return informations about the real device.
If so, return information about the real device.