Commit b6cd68a4 authored by Christos Stavrakakis's avatar Christos Stavrakakis
Browse files

cyclades: Fix removal of stale NICs in BUILD state

Until now, NICs that did not exist in the Ganeti backend and were in
BUILD state were removed after a timeout (BUILDING_NIC_TIMEOUT). This
was dangerous, because if an OP_INSTANCE_CONNECT job had been "stuck"
in Ganeti, the NIC and its IP address would be released. This commit
fixes this by removing such NICs only if the OP_INSTANCE_CONNECT job has
finished and the NIC does not exist in the Ganeti backend. Because
querying Ganeti has a time overhead, this is only done for NICs that
have not been created too recently.
parent 4d00e690
......@@ -55,10 +55,6 @@ _firewall_tags = {
_reverse_tags = dict((v.split(':')[3], k) for k, v in _firewall_tags.items())
# Timeout in seconds for building NICs. After this period the NICs are
# considered stale and are removed from the DB.
BUILDING_NIC_TIMEOUT = timedelta(seconds=180)
SIMPLE_NIC_FIELDS = ["state", "mac", "network", "firewall_profile", "index"]
COMPLEX_NIC_FIELDS = ["ipv4_address", "ipv6_address"]
NIC_FIELDS = SIMPLE_NIC_FIELDS + COMPLEX_NIC_FIELDS
......@@ -255,17 +251,12 @@ def _process_net_status(vm, etime, nics):
db_nic = db_nics.get(nic_name)
ganeti_nic = ganeti_nics.get(nic_name)
if ganeti_nic is None:
# NIC exists in DB but not in Ganeti. If the NIC is in 'building'
# state for more than 5 minutes, then we remove the NIC.
# TODO: This is dangerous as the job may be stack in the queue, and
# releasing the IP may lead to duplicate IP use.
if db_nic.state != "BUILD" or\
(db_nic.state == "BUILD" and
etime > db_nic.created + BUILDING_NIC_TIMEOUT):
if nic_is_stale(vm, nic):
log.debug("Removing stale NIC '%s'" % db_nic)
remove_nic_ips(db_nic)
db_nic.delete()
else:
log.warning("Ignoring recent building NIC: %s", db_nic)
log.info("NIC '%s' is still being created" % db_nic)
elif db_nic is None:
msg = ("NIC/%s of VM %s does not exist in DB! Cannot automatically"
" fix this issue!" % (nic_name, vm))
......@@ -782,15 +773,35 @@ def network_exists_in_backend(backend_network):
return False
def job_is_still_running(vm):
def job_is_still_running(vm, job_id=None):
with pooled_rapi_client(vm) as c:
try:
job_info = c.GetJobStatus(vm.backendjobid)
if job_id is None:
job_id = vm.backendjobid
job_info = c.GetJobStatus(job_id)
return not (job_info["status"] in rapi.JOB_STATUS_FINALIZED)
except rapi.GanetiApiError:
return False
def nic_is_stale(vm, nic, timeout=60):
    """Tell whether a NIC that is missing from Ganeti may be removed.

    Returns True when the NIC is considered stale and False when it must be
    kept. Only a NIC in "BUILD" state whose VM has a pending "CONNECT" task
    can be kept; it is kept when any of the following holds:

    * it was created less than `timeout` seconds ago,
    * the job identified by `vm.task_job_id` is still running,
    * `nic.backend_uuid` is listed under "nic.names" in the instance info
      returned by `get_instance_info(vm)`.

    """
    waiting_for_connect = nic.state == "BUILD" and vm.task == "CONNECT"
    if not waiting_for_connect:
        # Nothing is pending for this NIC, so it is stale.
        return True

    # Very recent NICs are kept without asking Ganeti, which saves the
    # time overhead of the queries below.
    created_too_recently = (
        datetime.now() < nic.created + timedelta(seconds=timeout)
    )
    if created_too_recently:
        return False

    if job_is_still_running(vm, job_id=vm.task_job_id):
        return False

    # The job has finished. The NIC may still exist in the backend, because
    # the notification message may have been lost or be stuck in the queue.
    instance_info = get_instance_info(vm)
    return nic.backend_uuid not in instance_info["nic.names"]
def ensure_network_is_active(backend, network_id):
"""Ensure that a network is active in the specified backend
......
......@@ -61,7 +61,7 @@ from django.conf import settings
import logging
import itertools
import bitarray
from datetime import datetime
from datetime import datetime, timedelta
from django.db import transaction
from synnefo.db.models import (Backend, VirtualMachine, Flavor,
......@@ -74,10 +74,7 @@ from synnefo.logic import utils, backend as backend_mod
logger = logging.getLogger()
logging.basicConfig()
try:
CHECK_INTERVAL = settings.RECONCILIATION_CHECK_INTERVAL
except AttributeError:
CHECK_INTERVAL = 60
BUILDING_NIC_TIMEOUT = timedelta(seconds=120)
GANETI_JOB_ERROR = "error"
GANETI_JOBS_PENDING = ["queued", "waiting", "running", "canceling"]
......@@ -285,8 +282,7 @@ class BackendReconciler(object):
server_id)
def reconcile_unsynced_nics(self, server_id, db_server, gnt_server):
building_time = (self.event_time -
backend_mod.BUILDING_NIC_TIMEOUT)
building_time = self.event_time - BUILDING_NIC_TIMEOUT
db_nics = db_server.nics.exclude(state="BUILD",
created__lte=building_time) \
.order_by("id")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment