Commit 4e3b9527 authored by Christos Stavrakakis
Browse files

cyclades: Retry failed instance creation

Handle the special case where an OP_INSTANCE_CREATE job that uses the
opportunistic locking feature fails. Such a failure is most likely due
to all nodes being already locked. In this case, make snf-dispatcher
retry the failed job with opportunistic locking turned off. Also, make
snf-ganeti-eventd include the input of the failed job in the message.

Finally, add the setting 'GANETI_USE_OPPORTUNISTIC_LOCKING' to turn the
use of the opportunistic locking feature on or off.
parent de4d23a7
......@@ -33,6 +33,10 @@
## a network. This requires qemu-kvm=1.0.
#GANETI_USE_HOTPLUG = False
#
## If True, Ganeti will try to allocate new instances only on nodes that are
## not already locked. This might result in slightly unbalanced clusters.
#GANETI_USE_OPPORTUNISTIC_LOCKING = True
#
## This module implements the strategy for allocating a vm to a backend
#BACKEND_ALLOCATOR_MODULE = "synnefo.logic.allocators.default_allocator"
## Refresh backend statistics timeout, in minutes, used in backend allocation
......
......@@ -33,6 +33,10 @@ GANETI_CREATEINSTANCE_KWARGS = {
# a network. This requires qemu-kvm=1.0.
GANETI_USE_HOTPLUG = False
# If True, Ganeti will try to allocate new instances only on nodes that are
# not already locked. This might result in slightly unbalanced clusters.
GANETI_USE_OPPORTUNISTIC_LOCKING = True
# This module implements the strategy for allocating a vm to a backend
BACKEND_ALLOCATOR_MODULE = "synnefo.logic.allocators.default_allocator"
# Refresh backend statistics timeout, in minutes, used in backend allocation
......
......@@ -434,7 +434,7 @@ def create_instance(vm, public_nic, flavor, image):
'img_format': image['format']}
# Use opportunistic locking
kw['opportunistic_locking'] = True
kw['opportunistic_locking'] = settings.GANETI_USE_OPPORTUNISTIC_LOCKING
# Defined in settings.GANETI_CREATEINSTANCE_KWARGS
# kw['hvparams'] = dict(serial_console=False)
......
......@@ -34,7 +34,8 @@ import logging
import json
from functools import wraps
from synnefo.db.models import Backend, VirtualMachine, Network, BackendNetwork
from synnefo.db.models import (Backend, VirtualMachine, Network,
BackendNetwork, pooled_rapi_client)
from synnefo.logic import utils, backend
from synnefo.lib.utils import merge_time
......@@ -170,9 +171,37 @@ def update_db(vm, msg, event_time):
log.error("Message is of unknown type %s.", msg['type'])
return
operation = msg["operation"]
status = msg["status"]
jobID = msg["jobId"]
logmsg = msg["logmsg"]
nics = msg.get("nics", None)
backend.process_op_status(vm, event_time, msg['jobId'], msg['operation'],
msg['status'], msg['logmsg'], nics)
job_fields = msg.get("job_fields", {})
# Special case: OP_INSTANCE_CREATE with opportunistic locking may fail
# if all Ganeti nodes are already locked. Retry the job without
# opportunistic locking.
if (operation == "OP_INSTANCE_CREATE" and status == "error" and
job_fields.get("opportunistic_locking", False)):
if vm.backendjobid != jobID: # The job has already been retried!
return
# Remove extra fields
[job_fields.pop(f) for f in ("OP_ID", "reason")]
name = job_fields.pop("name", job_fields.pop("instance_name"))
# Turn off opportunistic locking before retrying the job
job_fields["opportunistic_locking"] = False
with pooled_rapi_client(vm) as c:
jobID = c.CreateInstance(name=name, **job_fields)
# Update the VM fields
vm.backendjobid = jobID
vm.backendjobstatus = None
vm.save()
log.info("Retrying failed creation of instance '%s' without"
" opportunistic locking. New job ID: '%s'", name, jobID)
return
backend.process_op_status(vm, event_time, jobID, operation,
status, logmsg, nics)
log.debug("Done processing ganeti-op-status msg for vm %s.",
msg['instance'])
......
......@@ -247,7 +247,13 @@ class JobFileHandler(pyinotify.ProcessEvent):
nics = get_instance_nics(msg["instance"], self.logger)
msg["nics"] = nics
if op_id == "OP_INSTANCE_CREATE" and op.status == "error":
# If an instance creation fails, send the job input
# so that the job can be retried if needed.
msg["job_fields"] = op.Serialize()["input"]
msg = json.dumps(msg)
self.logger.debug("Delivering msg: %s (key=%s)", msg, routekey)
# Send the message to RabbitMQ
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment