From 3f78eef21e5c4f401db51376664f68ed16a67e90 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Fri, 2 Nov 2007 13:44:29 +0000
Subject: [PATCH] Implement device to instance mapping cache

Currently, troubleshooting DRBD problems involves a manual process of going
backwards from the DRBD device to the instance that owns it.

This patch adds a weak (i.e. not guaranteed to be correct or up-to-date)
cache of device to instance mappings. In normal operation the cache should
hold correct information, as the only times devices change paths are when
they are started or stopped, and the code in backend.py adds cache updates
to exactly these operations.

The only drawback of this implementation is that we don't fully update
the cache on renames of devices (we clean the old entries but we don't
add new ones). Since the rename changes the path only for LVs (and not
drbd and md), this is less of a problem as the target of this code is
debugging DRBD and MD issues.

The patch writes files named bdev_drbd<N> (or bdev_md<N>,
bdev_xenvg_...) in /var/run/ganeti (more exactly, LOCALSTATEDIR/run/ganeti).
The file names start with 'bdev_' and continue with the path of the device
under /dev/ (this prefix stripped, slashes replaced by underscores), and
the files contain the following values, space separated:
  - instance name
  - primary or secondary (depending on whether the device is on the primary
    or secondary node)
  - instance visible name: sda or sdb or not_visible, the latter case
    when the device is not the top-level device (i.e. remote_raid1
    templates will have sd[ab] for the md, but not_visible for drbd and
    logical volumes)

The cache is designed to not raise any errors; if there is an I/O error,
it will only be logged in the node daemon log file. This is in order to
reduce the possible impact of the cache on the block device activation
and shutdown code.

Reviewed-by: imsnah
---
 daemons/ganeti-noded |  8 ++---
 lib/backend.py       | 85 ++++++++++++++++++++++++++++++++++++++++----
 lib/cmdlib.py        | 44 ++++++++++++++---------
 lib/constants.py     |  3 +-
 lib/rpc.py           |  8 ++---
 5 files changed, 116 insertions(+), 32 deletions(-)

diff --git a/daemons/ganeti-noded b/daemons/ganeti-noded
index 954010645..92825bb42 100755
--- a/daemons/ganeti-noded
+++ b/daemons/ganeti-noded
@@ -97,11 +97,11 @@ class ServerObject(pb.Avatar):
     """Create a block device.
 
     """
-    bdev_s, size, on_primary, info = params
+    bdev_s, size, owner, on_primary, info = params
     bdev = objects.Disk.FromDict(bdev_s)
     if bdev is None:
       raise ValueError("can't unserialize data!")
-    return backend.CreateBlockDevice(bdev, size, on_primary, info)
+    return backend.CreateBlockDevice(bdev, size, owner, on_primary, info)
 
   @staticmethod
   def perspective_blockdev_remove(params):
@@ -125,11 +125,11 @@ class ServerObject(pb.Avatar):
     """Assemble a block device.
 
     """
-    bdev_s, on_primary = params
+    bdev_s, owner, on_primary = params
     bdev = objects.Disk.FromDict(bdev_s)
     if bdev is None:
       raise ValueError("can't unserialize data!")
-    return backend.AssembleBlockDevice(bdev, on_primary)
+    return backend.AssembleBlockDevice(bdev, owner, on_primary)
 
   @staticmethod
   def perspective_blockdev_shutdown(params):
diff --git a/lib/backend.py b/lib/backend.py
index 2719ab3c7..69d75d80a 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -620,7 +620,7 @@ def RebootInstance(instance, reboot_type, extra_args):
   return True
 
 
-def CreateBlockDevice(disk, size, on_primary, info):
+def CreateBlockDevice(disk, size, owner, on_primary, info):
   """Creates a block device for an instance.
 
   Args:
@@ -638,7 +638,7 @@ def CreateBlockDevice(disk, size, on_primary, info):
   clist = []
   if disk.children:
     for child in disk.children:
-      crdev = _RecursiveAssembleBD(child, on_primary)
+      crdev = _RecursiveAssembleBD(child, owner, on_primary)
       if on_primary or disk.AssembleOnSecondary():
         # we need the children open in case the device itself has to
         # be assembled
@@ -664,6 +664,8 @@ def CreateBlockDevice(disk, size, on_primary, info):
     device.SetSyncSpeed(constants.SYNC_SPEED)
     if on_primary or disk.OpenOnSecondary():
       device.Open(force=True)
+    DevCacheManager.UpdateCache(device.dev_path, owner,
+                                on_primary, disk.iv_name)
 
   device.SetInfo(info)
 
@@ -686,7 +688,10 @@ def RemoveBlockDevice(disk):
     logger.Info("Can't attach to device %s in remove" % disk)
     rdev = None
   if rdev is not None:
+    r_path = rdev.dev_path
     result = rdev.Remove()
+    if result:
+      DevCacheManager.RemoveCache(r_path)
   else:
     result = True
   if disk.children:
@@ -695,7 +700,7 @@ def RemoveBlockDevice(disk):
   return result
 
 
-def _RecursiveAssembleBD(disk, as_primary):
+def _RecursiveAssembleBD(disk, owner, as_primary):
   """Activate a block device for an instance.
 
   This is run on the primary and secondary nodes for an instance.
@@ -715,7 +720,7 @@ def _RecursiveAssembleBD(disk, as_primary):
   children = []
   if disk.children:
     for chld_disk in disk.children:
-      children.append(_RecursiveAssembleBD(chld_disk, as_primary))
+      children.append(_RecursiveAssembleBD(chld_disk, owner, as_primary))
 
   if as_primary or disk.AssembleOnSecondary():
     r_dev = bdev.AttachOrAssemble(disk.dev_type, disk.physical_id, children)
@@ -725,12 +730,15 @@ def _RecursiveAssembleBD(disk, as_primary):
       r_dev.Open()
     else:
       r_dev.Close()
+    DevCacheManager.UpdateCache(r_dev.dev_path, owner,
+                                as_primary, disk.iv_name)
+
   else:
     result = True
   return result
 
 
-def AssembleBlockDevice(disk, as_primary):
+def AssembleBlockDevice(disk, owner, as_primary):
   """Activate a block device for an instance.
 
   This is a wrapper over _RecursiveAssembleBD.
@@ -740,7 +748,7 @@ def AssembleBlockDevice(disk, as_primary):
     True for secondary nodes
 
   """
-  result = _RecursiveAssembleBD(disk, as_primary)
+  result = _RecursiveAssembleBD(disk, owner, as_primary)
   if isinstance(result, bdev.BlockDev):
     result = result.dev_path
   return result
@@ -759,7 +767,10 @@ def ShutdownBlockDevice(disk):
   """
   r_dev = _RecursiveFindBD(disk)
   if r_dev is not None:
+    r_path = r_dev.dev_path
     result = r_dev.Shutdown()
+    if result:
+      DevCacheManager.RemoveCache(r_path)
   else:
     result = True
   if disk.children:
@@ -1356,7 +1367,16 @@ def RenameBlockDevices(devlist):
       result = False
       continue
     try:
+      old_rpath = dev.dev_path
       dev.Rename(unique_id)
+      new_rpath = dev.dev_path
+      if old_rpath != new_rpath:
+        DevCacheManager.RemoveCache(old_rpath)
+        # FIXME: we should add the new cache information here, like:
+        # DevCacheManager.UpdateCache(new_rpath, owner, ...)
+        # but we don't have the owner here - maybe parse from existing
+        # cache? for now, we only lose lvm data when we rename, which
+        # is less critical than DRBD or MD
     except errors.BlockDeviceError, err:
       logger.Error("Can't rename device '%s' to '%s': %s" %
                    (dev, unique_id, err))
@@ -1473,3 +1493,56 @@ class HooksRunner(object):
       rr.append(("%s/%s" % (subdir, relname), rrval, output))
 
     return rr
+
+
+class DevCacheManager(object):
+  """Best-effort cache of block device to instance information."""
+  _DEV_PREFIX = "/dev/"
+  _ROOT_DIR = constants.BDEV_CACHE_DIR
+
+  @classmethod
+  def _ConvertPath(cls, dev_path):
+    """Converts a /dev/name path to the cache file name.
+
+    This strips the /dev/ prefix and replaces slashes with
+    underscores. It then returns the full path to the cache file.
+
+    """
+    if dev_path.startswith(cls._DEV_PREFIX):
+      dev_path = dev_path[len(cls._DEV_PREFIX):]
+    dev_path = dev_path.replace("/", "_")
+    return "%s/bdev_%s" % (cls._ROOT_DIR, dev_path)
+
+  @classmethod
+  def UpdateCache(cls, dev_path, owner, on_primary, iv_name):
+    """Write the cache entry for dev_path; I/O errors are only logged."""
+    if dev_path is None:
+      # _ConvertPath would raise AttributeError; the cache must not raise
+      logger.Error("DevCacheManager.UpdateCache got a None dev_path")
+      return
+    fpath = cls._ConvertPath(dev_path)
+    if on_primary:
+      state = "primary"
+    else:
+      state = "secondary"
+    if iv_name is None:
+      iv_name = "not_visible"
+    fdata = "%s %s %s\n" % (str(owner), state, iv_name)
+    try:
+      utils.WriteFile(fpath, data=fdata)
+    except EnvironmentError, err:
+      logger.Error("Can't update bdev cache for %s, error %s" %
+                   (dev_path, str(err)))
+
+  @classmethod
+  def RemoveCache(cls, dev_path):
+    """Delete the cache file for dev_path; I/O errors are only logged."""
+    if dev_path is None:
+      logger.Error("DevCacheManager.RemoveCache got a None dev_path")
+      return
+    fpath = cls._ConvertPath(dev_path)
+    try:
+      utils.RemoveFile(fpath)
+    except EnvironmentError, err:
+      logger.Error("Can't remove bdev cache for %s, error %s" %
+                   (dev_path, str(err)))
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index bcb29a39d..829c01913 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -1821,7 +1821,8 @@ def _AssembleInstanceDisks(instance, cfg, ignore_secondaries=False):
     for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
       cfg.SetDiskID(node_disk, node)
       is_primary = node == instance.primary_node
-      result = rpc.call_blockdev_assemble(node, node_disk, is_primary)
+      result = rpc.call_blockdev_assemble(node, node_disk,
+                                          instance.name, is_primary)
       if not result:
         logger.Error("could not prepare block device %s on node %s (is_pri"
                      "mary=%s)" % (inst_disk.iv_name, node, is_primary))
@@ -2560,7 +2561,7 @@ class LUFailoverInstance(LogicalUnit):
                                (instance.name, target_node))
 
 
-def _CreateBlockDevOnPrimary(cfg, node, device, info):
+def _CreateBlockDevOnPrimary(cfg, node, instance, device, info):
   """Create a tree of block devices on the primary node.
 
   This always creates all devices.
@@ -2568,11 +2569,12 @@ def _CreateBlockDevOnPrimary(cfg, node, device, info):
   """
   if device.children:
     for child in device.children:
-      if not _CreateBlockDevOnPrimary(cfg, node, child, info):
+      if not _CreateBlockDevOnPrimary(cfg, node, instance, child, info):
         return False
 
   cfg.SetDiskID(device, node)
-  new_id = rpc.call_blockdev_create(node, device, device.size, True, info)
+  new_id = rpc.call_blockdev_create(node, device, device.size,
+                                    instance.name, True, info)
   if not new_id:
     return False
   if device.physical_id is None:
@@ -2580,7 +2582,7 @@ def _CreateBlockDevOnPrimary(cfg, node, device, info):
   return True
 
 
-def _CreateBlockDevOnSecondary(cfg, node, device, force, info):
+def _CreateBlockDevOnSecondary(cfg, node, instance, device, force, info):
   """Create a tree of block devices on a secondary node.
 
   If this device type has to be created on secondaries, create it and
@@ -2593,13 +2595,15 @@ def _CreateBlockDevOnSecondary(cfg, node, device, force, info):
     force = True
   if device.children:
     for child in device.children:
-      if not _CreateBlockDevOnSecondary(cfg, node, child, force, info):
+      if not _CreateBlockDevOnSecondary(cfg, node, instance,
+                                        child, force, info):
         return False
 
   if not force:
     return True
   cfg.SetDiskID(device, node)
-  new_id = rpc.call_blockdev_create(node, device, device.size, False, info)
+  new_id = rpc.call_blockdev_create(node, device, device.size,
+                                    instance.name, False, info)
   if not new_id:
     return False
   if device.physical_id is None:
@@ -2754,13 +2758,14 @@ def _CreateDisks(cfg, instance):
               (device.iv_name, instance.name))
     #HARDCODE
     for secondary_node in instance.secondary_nodes:
-      if not _CreateBlockDevOnSecondary(cfg, secondary_node, device, False,
-                                        info):
+      if not _CreateBlockDevOnSecondary(cfg, secondary_node, instance,
+                                        device, False, info):
         logger.Error("failed to create volume %s (%s) on secondary node %s!" %
                      (device.iv_name, device, secondary_node))
         return False
     #HARDCODE
-    if not _CreateBlockDevOnPrimary(cfg, instance.primary_node, device, info):
+    if not _CreateBlockDevOnPrimary(cfg, instance.primary_node,
+                                    instance, device, info):
       logger.Error("failed to create volume %s on primary!" %
                    device.iv_name)
       return False
@@ -3206,14 +3211,16 @@ class LUAddMDDRBDComponent(LogicalUnit):
 
     logger.Info("adding new mirror component on secondary")
     #HARDCODE
-    if not _CreateBlockDevOnSecondary(self.cfg, remote_node, new_drbd, False,
+    if not _CreateBlockDevOnSecondary(self.cfg, remote_node, instance,
+                                      new_drbd, False,
                                       _GetInstanceInfoText(instance)):
       raise errors.OpExecError("Failed to create new component on secondary"
                                " node %s" % remote_node)
 
     logger.Info("adding new mirror component on primary")
     #HARDCODE
-    if not _CreateBlockDevOnPrimary(self.cfg, instance.primary_node, new_drbd,
+    if not _CreateBlockDevOnPrimary(self.cfg, instance.primary_node,
+                                    instance, new_drbd,
                                     _GetInstanceInfoText(instance)):
       # remove secondary dev
       self.cfg.SetDiskID(new_drbd, remote_node)
@@ -3444,7 +3451,8 @@ class LUReplaceDisks(LogicalUnit):
       logger.Info("adding new mirror component on secondary for %s" %
                   dev.iv_name)
       #HARDCODE
-      if not _CreateBlockDevOnSecondary(cfg, remote_node, new_drbd, False,
+      if not _CreateBlockDevOnSecondary(cfg, remote_node, instance,
+                                        new_drbd, False,
                                         _GetInstanceInfoText(instance)):
         raise errors.OpExecError("Failed to create new component on"
                                  " secondary node %s\n"
@@ -3453,7 +3461,8 @@ class LUReplaceDisks(LogicalUnit):
 
       logger.Info("adding new mirror component on primary")
       #HARDCODE
-      if not _CreateBlockDevOnPrimary(cfg, instance.primary_node, new_drbd,
+      if not _CreateBlockDevOnPrimary(cfg, instance.primary_node,
+                                      instance, new_drbd,
                                       _GetInstanceInfoText(instance)):
         # remove secondary dev
         cfg.SetDiskID(new_drbd, remote_node)
@@ -3558,7 +3567,7 @@ class LUReplaceDisks(LogicalUnit):
       # _Create...OnPrimary (which forces the creation), even if we
       # are talking about the secondary node
       for new_lv in new_lvs:
-        if not _CreateBlockDevOnPrimary(cfg, tgt_node, new_lv,
+        if not _CreateBlockDevOnPrimary(cfg, tgt_node, instance, new_lv,
                                         _GetInstanceInfoText(instance)):
           raise errors.OpExecError("Failed to create new LV named '%s' on"
                                    " node '%s'" %
@@ -3669,7 +3678,7 @@ class LUReplaceDisks(LogicalUnit):
       # _Create...OnPrimary (which forces the creation), even if we
       # are talking about the secondary node
       for new_lv in dev.children:
-        if not _CreateBlockDevOnPrimary(cfg, new_node, new_lv,
+        if not _CreateBlockDevOnPrimary(cfg, new_node, instance, new_lv,
                                         _GetInstanceInfoText(instance)):
           raise errors.OpExecError("Failed to create new LV named '%s' on"
                                    " node '%s'" %
@@ -3680,7 +3689,8 @@ class LUReplaceDisks(LogicalUnit):
                               logical_id=(pri_node, new_node,
                                           dev.logical_id[2]),
                               children=dev.children)
-      if not _CreateBlockDevOnSecondary(cfg, new_node, new_drbd, False,
+      if not _CreateBlockDevOnSecondary(cfg, new_node, instance,
+                                        new_drbd, False,
                                       _GetInstanceInfoText(instance)):
         raise errors.OpExecError("Failed to create new DRBD on"
                                  " node '%s'" % new_node)
diff --git a/lib/constants.py b/lib/constants.py
index 2a810b109..8ade4b680 100644
--- a/lib/constants.py
+++ b/lib/constants.py
@@ -25,7 +25,7 @@ from ganeti import _autoconf
 
 # various versions
 CONFIG_VERSION = 3
-PROTOCOL_VERSION = 4
+PROTOCOL_VERSION = 5
 RELEASE_VERSION = _autoconf.PACKAGE_VERSION
 OS_API_VERSION = 5
 EXPORT_VERSION = 0
@@ -33,6 +33,7 @@ EXPORT_VERSION = 0
 
 # file paths
 DATA_DIR = _autoconf.LOCALSTATEDIR + "/lib/ganeti"
+BDEV_CACHE_DIR = _autoconf.LOCALSTATEDIR + "/run/ganeti"
 CLUSTER_CONF_FILE = DATA_DIR + "/config.data"
 SSL_CERT_FILE = DATA_DIR + "/server.pem"
 WATCHER_STATEFILE = DATA_DIR + "/watcher.data"
diff --git a/lib/rpc.py b/lib/rpc.py
index 790ad5e40..d84a85e04 100644
--- a/lib/rpc.py
+++ b/lib/rpc.py
@@ -489,13 +489,13 @@ def call_version(node_list):
   return c.getresult()
 
 
-def call_blockdev_create(node, bdev, size, on_primary, info):
+def call_blockdev_create(node, bdev, size, owner, on_primary, info):
   """Request creation of a given block device.
 
   This is a single-node call.
 
   """
-  params = [bdev.ToDict(), size, on_primary, info]
+  params = [bdev.ToDict(), size, owner, on_primary, info]
   c = Client("blockdev_create", params)
   c.connect(node)
   c.run()
@@ -527,13 +527,13 @@ def call_blockdev_rename(node, devlist):
   return c.getresult().get(node, False)
 
 
-def call_blockdev_assemble(node, disk, on_primary):
+def call_blockdev_assemble(node, disk, owner, on_primary):
   """Request assembling of a given block device.
 
   This is a single-node call.
 
   """
-  params = [disk.ToDict(), on_primary]
+  params = [disk.ToDict(), owner, on_primary]
   c = Client("blockdev_assemble", params)
   c.connect(node)
   c.run()
-- 
GitLab