From 6906a9d8e61fedad8779b0fe5e56756a403a2361 Mon Sep 17 00:00:00 2001
From: Guido Trotter <ultrotter@google.com>
Date: Wed, 21 Jan 2009 09:54:11 +0000
Subject: [PATCH] Add calls in the intra-node migration protocol

Currently the hypervisor is expected to do all the migration from the
source side. With this patch we also add the option of passing some
information to the target side, and starting some operation there.

As a bonus, a function to clean up any started operation is included.

Reviewed-by: iustinp
---
 daemons/ganeti-noded | 26 ++++++++++++++
 lib/backend.py       | 38 ++++++++++++++++++++
 lib/cmdlib.py        | 86 ++++++++++++++++++++++++++++++++++++++------
 lib/rpc.py           | 53 +++++++++++++++++++++++++++
 4 files changed, 192 insertions(+), 11 deletions(-)

diff --git a/daemons/ganeti-noded b/daemons/ganeti-noded
index c146e8b69..08984e957 100755
--- a/daemons/ganeti-noded
+++ b/daemons/ganeti-noded
@@ -402,6 +402,32 @@ class NodeHttpServer(http.server.HttpServer):
     extra_args = params[1]
     return backend.StartInstance(instance, extra_args)
 
+  @staticmethod
+  def perspective_migration_info(params):
+    """Gather information about an instance to be migrated.
+
+    """
+    instance = objects.Instance.FromDict(params[0])
+    return backend.MigrationInfo(instance)
+
+  @staticmethod
+  def perspective_accept_instance(params):
+    """Prepare the node to accept an instance.
+
+    """
+    instance, info, target = params
+    instance = objects.Instance.FromDict(instance)
+    return backend.AcceptInstance(instance, info, target)
+
+  @staticmethod
+  def perspective_finalize_migration(params):
+    """Finalize the instance migration.
+
+    """
+    instance, info, success = params
+    instance = objects.Instance.FromDict(instance)
+    return backend.FinalizeMigration(instance, info, success)
+
   @staticmethod
   def perspective_instance_migrate(params):
     """Migrates an instance.
diff --git a/lib/backend.py b/lib/backend.py
index cb9261a0d..6147a4684 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -973,6 +973,44 @@ def RebootInstance(instance, reboot_type, extra_args):
   return True
 
 
+def MigrationInfo(instance):
+  """Gather information about an instance to be migrated.
+
+  @type instance: L{objects.Instance}
+  @param instance: the instance definition
+
+  """
+  return (True, '')
+
+
+def AcceptInstance(instance, info, target):
+  """Prepare the node to accept an instance.
+
+  @type instance: L{objects.Instance}
+  @param instance: the instance definition
+  @type info: string/data (opaque)
+  @param info: migration information, from the source node
+  @type target: string
+  @param target: target host (usually ip), on this node
+
+  """
+  return (True, "Accept successfull")
+
+
+def FinalizeMigration(instance, info, success):
+  """Finalize any preparation to accept an instance.
+
+  @type instance: L{objects.Instance}
+  @param instance: the instance definition
+  @type info: string/data (opaque)
+  @param info: migration information, from the source node
+  @type success: boolean
+  @param success: whether the migration was a success or a failure
+
+  """
+  return (True, "Migration Finalized")
+
+
 def MigrateInstance(instance, target, live):
   """Migrates an instance to another node.
 
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index f1b4ab226..b06cddc17 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -3631,6 +3631,41 @@ class LUMigrateInstance(LogicalUnit):
 
     self.feedback_fn("* done")
 
+  def _RevertDiskStatus(self):
+    """Try to revert the disk status after a failed migration.
+
+    """
+    target_node = self.target_node
+    try:
+      self._EnsureSecondary(target_node)
+      self._GoStandalone()
+      self._GoReconnect(False)
+      self._WaitUntilSync()
+    except errors.OpExecError, err:
+      self.LogWarning("Migration failed and I can't reconnect the"
+                      " drives: error '%s'\n"
+                      "Please look and recover the instance status" %
+                      str(err))
+
+  def _AbortMigration(self):
+    """Call the hypervisor code to abort a started migration.
+
+    """
+    instance = self.instance
+    target_node = self.target_node
+    migration_info = self.migration_info
+
+    abort_result = self.rpc.call_finalize_migration(target_node,
+                                                    instance,
+                                                    migration_info,
+                                                    False)
+    abort_msg = abort_result.RemoteFailMsg()
+    if abort_msg:
+      logging.error("Aborting migration failed on target node %s: %s" %
+                    (target_node, abort_msg))
+      # Don't raise an exception here, as we still have to try to revert the
+      # disk status, even if this step failed.
+
   def _ExecMigration(self):
     """Migrate an instance.
 
@@ -3654,11 +3689,38 @@ class LUMigrateInstance(LogicalUnit):
                                  " synchronized on target node,"
                                  " aborting migrate." % dev.iv_name)
 
+    # First get the migration information from the source node
+    result = self.rpc.call_migration_info(source_node, instance)
+    msg = result.RemoteFailMsg()
+    if msg:
+      log_err = ("Failed fetching source migration information from %s: %s" %
+                  (source_node, msg))
+      logging.error(log_err)
+      raise errors.OpExecError(log_err)
+
+    self.migration_info = migration_info = result.data[1]
+
+    # Then switch the disks to master/master mode
     self._EnsureSecondary(target_node)
     self._GoStandalone()
     self._GoReconnect(True)
     self._WaitUntilSync()
 
+    self.feedback_fn("* preparing %s to accept the instance" % target_node)
+    result = self.rpc.call_accept_instance(target_node,
+                                           instance,
+                                           migration_info,
+                                           self.nodes_ip[target_node])
+
+    msg = result.RemoteFailMsg()
+    if msg:
+      logging.error("Instance pre-migration failed, trying to revert"
+                    " disk status: %s", msg)
+      self._AbortMigration()
+      self._RevertDiskStatus()
+      raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
+                               (instance.name, msg))
+
     self.feedback_fn("* migrating instance to %s" % target_node)
     time.sleep(10)
     result = self.rpc.call_instance_migrate(source_node, instance,
@@ -3668,17 +3730,8 @@ class LUMigrateInstance(LogicalUnit):
     if msg:
       logging.error("Instance migration failed, trying to revert"
                     " disk status: %s", msg)
-      try:
-        self._EnsureSecondary(target_node)
-        self._GoStandalone()
-        self._GoReconnect(False)
-        self._WaitUntilSync()
-      except errors.OpExecError, err:
-        self.LogWarning("Migration failed and I can't reconnect the"
-                        " drives: error '%s'\n"
-                        "Please look and recover the instance status" %
-                        str(err))
-
+      self._AbortMigration()
+      self._RevertDiskStatus()
       raise errors.OpExecError("Could not migrate instance %s: %s" %
                                (instance.name, msg))
     time.sleep(10)
@@ -3687,6 +3740,17 @@ class LUMigrateInstance(LogicalUnit):
     # distribute new instance config to the other nodes
     self.cfg.Update(instance)
 
+    result = self.rpc.call_finalize_migration(target_node,
+                                              instance,
+                                              migration_info,
+                                              True)
+    msg = result.RemoteFailMsg()
+    if msg:
+      logging.error("Instance migration succeeded, but finalization failed:"
+                    " %s" % msg)
+      raise errors.OpExecError("Could not finalize instance migration: %s" %
+                               msg)
+
     self._EnsureSecondary(source_node)
     self._WaitUntilSync()
     self._GoStandalone()
diff --git a/lib/rpc.py b/lib/rpc.py
index 467163e8b..fc71a9335 100644
--- a/lib/rpc.py
+++ b/lib/rpc.py
@@ -432,6 +432,59 @@ class RpcRunner(object):
     return self._SingleNodeCall(node, "instance_shutdown",
                                 [self._InstDict(instance)])
 
+  def call_migration_info(self, node, instance):
+    """Gather the information necessary to prepare an instance migration.
+
+    This is a single-node call.
+
+    @type node: string
+    @param node: the node on which the instance is currently running
+    @type instance: C{objects.Instance}
+    @param instance: the instance definition
+
+    """
+    return self._SingleNodeCall(node, "migration_info",
+                                [self._InstDict(instance)])
+
+  def call_accept_instance(self, node, instance, info, target):
+    """Prepare a node to accept an instance.
+
+    This is a single-node call.
+
+    @type node: string
+    @param node: the target node for the migration
+    @type instance: C{objects.Instance}
+    @param instance: the instance definition
+    @type info: opaque/hypervisor specific (string/data)
+    @param info: result for the call_migration_info call
+    @type target: string
+    @param target: target hostname (usually ip address) (on the node itself)
+
+    """
+    return self._SingleNodeCall(node, "accept_instance",
+                                [self._InstDict(instance), info, target])
+
+  def call_finalize_migration(self, node, instance, info, success):
+    """Finalize any target-node migration specific operation.
+
+    This is called both in case of a successful migration and in case of error
+    (in which case it should abort the migration).
+
+    This is a single-node call.
+
+    @type node: string
+    @param node: the target node for the migration
+    @type instance: C{objects.Instance}
+    @param instance: the instance definition
+    @type info: opaque/hypervisor specific (string/data)
+    @param info: result for the call_migration_info call
+    @type success: boolean
+    @param success: whether the migration was a success or a failure
+
+    """
+    return self._SingleNodeCall(node, "finalize_migration",
+                                [self._InstDict(instance), info, success])
+
   def call_instance_migrate(self, node, instance, target, live):
     """Migrate an instance.
 
-- 
GitLab