From 53c776b5b9204d16d0f1ae15c8e2bc9805d88e45 Mon Sep 17 00:00:00 2001
From: Iustin Pop <iustin@google.com>
Date: Tue, 13 Jan 2009 15:21:22 +0000
Subject: [PATCH] Forward port the live migration from 1.2 branch

This is forward port via copy (and not individual patches cherry-pick)
of the latest code on the 1.2 branch related to the migration.

The changes compared to 1.2 are the fact that we don't need the
IdentifyDisks step anymore (the drbd rpc calls are independent now), and
the rpc module improvements.

Reviewed-by: ultrotter
---
 lib/backend.py           |  12 +-
 lib/cmdlib.py            | 304 +++++++++++++++++++++++++++++++++++++++
 lib/hypervisor/hv_xen.py |  12 +-
 lib/mcpu.py              |   1 +
 lib/opcodes.py           |  13 ++
 scripts/gnt-instance     |  55 +++++++
 6 files changed, 388 insertions(+), 9 deletions(-)

diff --git a/lib/backend.py b/lib/backend.py
index f73d87ebc..75c4bdbdf 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -975,14 +975,14 @@ def MigrateInstance(instance, target, live):
       - msg is a string with details in case of failure
 
   """
-  hyper = hypervisor.GetHypervisor(instance.hypervisor_name)
+  hyper = hypervisor.GetHypervisor(instance.hypervisor)
 
   try:
     hyper.MigrateInstance(instance.name, target, live)
   except errors.HypervisorError, err:
-    msg = "Failed to migrate instance: %s" % str(err)
-    logging.error(msg)
-    return (False, msg)
+    msg = "Failed to migrate instance"
+    logging.exception(msg)
+    return (False, "%s: %s" % (msg, err))
   return (True, "Migration successfull")
 
 
@@ -2223,9 +2223,9 @@ def DrbdAttachNet(nodes_ip, disks, instance_name, multimaster):
     return status, bdevs
 
   if multimaster:
-    for cf, rd in zip(disks, bdevs):
+    for idx, rd in enumerate(bdevs):
       try:
-        _SymlinkBlockDev(instance_name, rd.dev_path, cf.iv_name)
+        _SymlinkBlockDev(instance_name, rd.dev_path, idx)
       except EnvironmentError, err:
         return (False, "Can't create symlink: %s" % str(err))
   # reconnect disks, switch to new master configuration and if
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index cfd031f47..f98204283 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -3387,6 +3387,310 @@ class LUFailoverInstance(LogicalUnit):
                                  (instance.name, target_node))
 
 
+class LUMigrateInstance(LogicalUnit):
+  """Migrate an instance.
+
+  This is migration without shutting down, compared to the failover,
+  which is done with shutdown.
+
+  """
+  HPATH = "instance-migrate"
+  HTYPE = constants.HTYPE_INSTANCE
+  _OP_REQP = ["instance_name", "live", "cleanup"]
+
+  REQ_BGL = False
+
+  def ExpandNames(self):
+    self._ExpandAndLockInstance()
+    self.needed_locks[locking.LEVEL_NODE] = []
+    self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_REPLACE
+
+  def DeclareLocks(self, level):
+    if level == locking.LEVEL_NODE:
+      self._LockInstancesNodes()
+
+  def BuildHooksEnv(self):
+    """Build hooks env.
+
+    This runs on master, primary and secondary nodes of the instance.
+
+    """
+    env = _BuildInstanceHookEnvByObject(self, self.instance)
+    nl = [self.cfg.GetMasterNode()] + list(self.instance.secondary_nodes)
+    return env, nl, nl
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    This checks that the instance is in the cluster.
+
+    """
+    instance = self.cfg.GetInstanceInfo(
+      self.cfg.ExpandInstanceName(self.op.instance_name))
+    if instance is None:
+      raise errors.OpPrereqError("Instance '%s' not known" %
+                                 self.op.instance_name)
+
+    if instance.disk_template != constants.DT_DRBD8:
+      raise errors.OpPrereqError("Instance's disk layout is not"
+                                 " drbd8, cannot migrate.")
+
+    secondary_nodes = instance.secondary_nodes
+    if not secondary_nodes:
+      raise errors.ProgrammerError("no secondary node but using "
+                                   "drbd8 disk template")
+
+    i_be = self.cfg.GetClusterInfo().FillBE(instance)
+
+    target_node = secondary_nodes[0]
+    # check memory requirements on the secondary node
+    _CheckNodeFreeMemory(self, target_node, "migrating instance %s" %
+                         instance.name, i_be[constants.BE_MEMORY],
+                         instance.hypervisor)
+
+    # check bridge existance
+    brlist = [nic.bridge for nic in instance.nics]
+    result = self.rpc.call_bridges_exist(target_node, brlist)
+    if result.failed or not result.data:
+      raise errors.OpPrereqError("One or more target bridges %s does not"
+                                 " exist on destination node '%s'" %
+                                 (brlist, target_node))
+
+    if not self.op.cleanup:
+      result = self.rpc.call_instance_migratable(instance.primary_node,
+                                                 instance)
+      msg = result.RemoteFailMsg()
+      if msg:
+        raise errors.OpPrereqError("Can't migrate: %s - please use failover" %
+                                   msg)
+
+    self.instance = instance
+
+  def _WaitUntilSync(self):
+    """Poll with custom rpc for disk sync.
+
+    This uses our own step-based rpc call.
+
+    """
+    self.feedback_fn("* wait until resync is done")
+    all_done = False
+    while not all_done:
+      all_done = True
+      result = self.rpc.call_drbd_wait_sync(self.all_nodes,
+                                            self.nodes_ip,
+                                            self.instance.disks)
+      min_percent = 100
+      for node, nres in result.items():
+        msg = nres.RemoteFailMsg()
+        if msg:
+          raise errors.OpExecError("Cannot resync disks on node %s: %s" %
+                                   (node, msg))
+        node_done, node_percent = nres.data[1]
+        all_done = all_done and node_done
+        if node_percent is not None:
+          min_percent = min(min_percent, node_percent)
+      if not all_done:
+        if min_percent < 100:
+          self.feedback_fn("   - progress: %.1f%%" % min_percent)
+        time.sleep(2)
+
+  def _EnsureSecondary(self, node):
+    """Demote a node to secondary.
+
+    """
+    self.feedback_fn("* switching node %s to secondary mode" % node)
+
+    for dev in self.instance.disks:
+      self.cfg.SetDiskID(dev, node)
+
+    result = self.rpc.call_blockdev_close(node, self.instance.name,
+                                          self.instance.disks)
+    msg = result.RemoteFailMsg()
+    if msg:
+      raise errors.OpExecError("Cannot change disk to secondary on node %s,"
+                               " error %s" % (node, msg))
+
+  def _GoStandalone(self):
+    """Disconnect from the network.
+
+    """
+    self.feedback_fn("* changing into standalone mode")
+    result = self.rpc.call_drbd_disconnect_net(self.all_nodes, self.nodes_ip,
+                                               self.instance.disks)
+    for node, nres in result.items():
+      msg = nres.RemoteFailMsg()
+      if msg:
+        raise errors.OpExecError("Cannot disconnect disks node %s,"
+                                 " error %s" % (node, msg))
+
+  def _GoReconnect(self, multimaster):
+    """Reconnect to the network.
+
+    """
+    if multimaster:
+      msg = "dual-master"
+    else:
+      msg = "single-master"
+    self.feedback_fn("* changing disks into %s mode" % msg)
+    result = self.rpc.call_drbd_attach_net(self.all_nodes, self.nodes_ip,
+                                           self.instance.disks,
+                                           self.instance.name, multimaster)
+    for node, nres in result.items():
+      msg = nres.RemoteFailMsg()
+      if msg:
+        raise errors.OpExecError("Cannot change disks config on node %s,"
+                                 " error: %s" % (node, msg))
+
+  def _ExecCleanup(self):
+    """Try to cleanup after a failed migration.
+
+    The cleanup is done by:
+      - check that the instance is running only on one node
+        (and update the config if needed)
+      - change disks on its secondary node to secondary
+      - wait until disks are fully synchronized
+      - disconnect from the network
+      - change disks into single-master mode
+      - wait again until disks are fully synchronized
+
+    """
+    instance = self.instance
+    target_node = self.target_node
+    source_node = self.source_node
+
+    # check running on only one node
+    self.feedback_fn("* checking where the instance actually runs"
+                     " (if this hangs, the hypervisor might be in"
+                     " a bad state)")
+    ins_l = self.rpc.call_instance_list(self.all_nodes, [instance.hypervisor])
+    for node, result in ins_l.items():
+      result.Raise()
+      if not isinstance(result.data, list):
+        raise errors.OpExecError("Can't contact node '%s'" % node)
+
+    runningon_source = instance.name in ins_l[source_node].data
+    runningon_target = instance.name in ins_l[target_node].data
+
+    if runningon_source and runningon_target:
+      raise errors.OpExecError("Instance seems to be running on two nodes,"
+                               " or the hypervisor is confused. You will have"
+                               " to ensure manually that it runs only on one"
+                               " and restart this operation.")
+
+    if not (runningon_source or runningon_target):
+      raise errors.OpExecError("Instance does not seem to be running at all."
+                               " In this case, it's safer to repair by"
+                               " running 'gnt-instance stop' to ensure disk"
+                               " shutdown, and then restarting it.")
+
+    if runningon_target:
+      # the migration has actually succeeded, we need to update the config
+      self.feedback_fn("* instance running on secondary node (%s),"
+                       " updating config" % target_node)
+      instance.primary_node = target_node
+      self.cfg.Update(instance)
+      demoted_node = source_node
+    else:
+      self.feedback_fn("* instance confirmed to be running on its"
+                       " primary node (%s)" % source_node)
+      demoted_node = target_node
+
+    self._EnsureSecondary(demoted_node)
+    try:
+      self._WaitUntilSync()
+    except errors.OpExecError:
+      # we ignore here errors, since if the device is standalone, it
+      # won't be able to sync
+      pass
+    self._GoStandalone()
+    self._GoReconnect(False)
+    self._WaitUntilSync()
+
+    self.feedback_fn("* done")
+
+  def _ExecMigration(self):
+    """Migrate an instance.
+
+    The migrate is done by:
+      - change the disks into dual-master mode
+      - wait until disks are fully synchronized again
+      - migrate the instance
+      - change disks on the new secondary node (the old primary) to secondary
+      - wait until disks are fully synchronized
+      - change disks into single-master mode
+
+    """
+    instance = self.instance
+    target_node = self.target_node
+    source_node = self.source_node
+
+    self.feedback_fn("* checking disk consistency between source and target")
+    for dev in instance.disks:
+      if not _CheckDiskConsistency(self, dev, target_node, False):
+        raise errors.OpExecError("Disk %s is degraded or not fully"
+                                 " synchronized on target node,"
+                                 " aborting migrate." % dev.iv_name)
+
+    self._EnsureSecondary(target_node)
+    self._GoStandalone()
+    self._GoReconnect(True)
+    self._WaitUntilSync()
+
+    self.feedback_fn("* migrating instance to %s" % target_node)
+    time.sleep(10)
+    result = self.rpc.call_instance_migrate(source_node, instance,
+                                            self.nodes_ip[target_node],
+                                            self.op.live)
+    msg = result.RemoteFailMsg()
+    if msg:
+      logging.error("Instance migration failed, trying to revert"
+                    " disk status: %s", msg)
+      try:
+        self._EnsureSecondary(target_node)
+        self._GoStandalone()
+        self._GoReconnect(False)
+        self._WaitUntilSync()
+      except errors.OpExecError, err:
+        self.LogWarning("Migration failed and I can't reconnect the"
+                        " drives: error '%s'\n"
+                        "Please look and recover the instance status" %
+                        str(err))
+
+      raise errors.OpExecError("Could not migrate instance %s: %s" %
+                               (instance.name, msg))
+    time.sleep(10)
+
+    instance.primary_node = target_node
+    # distribute new instance config to the other nodes
+    self.cfg.Update(instance)
+
+    self._EnsureSecondary(source_node)
+    self._WaitUntilSync()
+    self._GoStandalone()
+    self._GoReconnect(False)
+    self._WaitUntilSync()
+
+    self.feedback_fn("* done")
+
+  def Exec(self, feedback_fn):
+    """Perform the migration.
+
+    """
+    self.feedback_fn = feedback_fn
+
+    self.source_node = self.instance.primary_node
+    self.target_node = self.instance.secondary_nodes[0]
+    self.all_nodes = [self.source_node, self.target_node]
+    self.nodes_ip = {
+      self.source_node: self.cfg.GetNodeInfo(self.source_node).secondary_ip,
+      self.target_node: self.cfg.GetNodeInfo(self.target_node).secondary_ip,
+      }
+    if self.op.cleanup:
+      return self._ExecCleanup()
+    else:
+      return self._ExecMigration()
+
+
 def _CreateBlockDevOnPrimary(lu, node, instance, device, info):
   """Create a tree of block devices on the primary node.
 
diff --git a/lib/hypervisor/hv_xen.py b/lib/hypervisor/hv_xen.py
index a0a59ad34..0aa956032 100644
--- a/lib/hypervisor/hv_xen.py
+++ b/lib/hypervisor/hv_xen.py
@@ -51,11 +51,11 @@ class XenHypervisor(hv_base.BaseHypervisor):
     raise NotImplementedError
 
   @staticmethod
-  def _RemoveConfigFile(instance):
+  def _RemoveConfigFile(instance_name):
     """Remove the xen configuration file.
 
     """
-    utils.RemoveFile("/etc/xen/%s" % instance.name)
+    utils.RemoveFile("/etc/xen/%s" % instance_name)
 
   @staticmethod
   def _GetXMList(include_node):
@@ -155,7 +155,7 @@ class XenHypervisor(hv_base.BaseHypervisor):
     """Stop an instance.
 
     """
-    self._RemoveConfigFile(instance)
+    self._RemoveConfigFile(instance.name)
     if force:
       command = ["xm", "destroy", instance.name]
     else:
@@ -290,6 +290,12 @@ class XenHypervisor(hv_base.BaseHypervisor):
     if result.failed:
       raise errors.HypervisorError("Failed to migrate instance %s: %s" %
                                    (instance, result.output))
+    # remove old xen file after migration succeeded
+    try:
+      self._RemoveConfigFile(instance)
+    except EnvironmentError, err:
+      logger.Error("Failure while removing instance config file: %s" %
+                   str(err))
 
 
 class XenPvmHypervisor(XenHypervisor):
diff --git a/lib/mcpu.py b/lib/mcpu.py
index 1c3384da3..231070056 100644
--- a/lib/mcpu.py
+++ b/lib/mcpu.py
@@ -68,6 +68,7 @@ class Processor(object):
     opcodes.OpDeactivateInstanceDisks: cmdlib.LUDeactivateInstanceDisks,
     opcodes.OpReplaceDisks: cmdlib.LUReplaceDisks,
     opcodes.OpFailoverInstance: cmdlib.LUFailoverInstance,
+    opcodes.OpMigrateInstance: cmdlib.LUMigrateInstance,
     opcodes.OpConnectConsole: cmdlib.LUConnectConsole,
     opcodes.OpQueryInstances: cmdlib.LUQueryInstances,
     opcodes.OpQueryInstanceData: cmdlib.LUQueryInstanceData,
diff --git a/lib/opcodes.py b/lib/opcodes.py
index 32edfaa7e..6848c4e85 100644
--- a/lib/opcodes.py
+++ b/lib/opcodes.py
@@ -413,6 +413,19 @@ class OpFailoverInstance(OpCode):
   __slots__ = ["instance_name", "ignore_consistency"]
 
 
+class OpMigrateInstance(OpCode):
+  """Migrate an instance.
+
+  This migrates (without shutting down an instance) to its secondary
+  node.
+
+  @var instance_name: the name of the instance
+
+  """
+  OP_ID = "OP_INSTANCE_MIGRATE"
+  __slots__ = ["instance_name", "live", "cleanup"]
+
+
 class OpConnectConsole(OpCode):
   """Connect to an instance's console."""
   OP_ID = "OP_INSTANCE_CONSOLE"
diff --git a/scripts/gnt-instance b/scripts/gnt-instance
index b19717480..f1063d8b4 100755
--- a/scripts/gnt-instance
+++ b/scripts/gnt-instance
@@ -821,6 +821,41 @@ def FailoverInstance(opts, args):
   return 0
 
 
+def MigrateInstance(opts, args):
+  """Migrate an instance.
+
+  The migrate is done without shutdown.
+
+  Args:
+    opts - class with options as members
+    args - list with a single element, the instance name
+  Opts used:
+    force - whether to migrate without asking questions.
+
+  """
+  instance_name = args[0]
+  force = opts.force
+
+  if not force:
+    if opts.cleanup:
+      usertext = ("Instance %s will be recovered from a failed migration."
+                  " Note that the migration procedure (including cleanup)" %
+                  (instance_name,))
+    else:
+      usertext = ("Instance %s will be migrated. Note that migration" %
+                  (instance_name,))
+    usertext += (" is **experimental** in this version."
+                " This might impact the instance if anything goes wrong."
+                " Continue?")
+    if not AskUser(usertext):
+      return 1
+
+  op = opcodes.OpMigrateInstance(instance_name=instance_name, live=opts.live,
+                                 cleanup=opts.cleanup)
+  SubmitOpCode(op)
+  return 0
+
+
 def ConnectToInstanceConsole(opts, args):
   """Connect to the console of an instance.
 
@@ -1269,6 +1304,26 @@ commands = {
                "[-f] <instance>",
                "Stops the instance and starts it on the backup node, using"
                " the remote mirror (only for instances of type drbd)"),
+  'migrate': (MigrateInstance, ARGS_ONE,
+               [DEBUG_OPT, FORCE_OPT,
+                make_option("--non-live", dest="live",
+                            default=True, action="store_false",
+                            help="Do a non-live migration (this usually means"
+                            " freeze the instance, save the state,"
+                            " transfer and only then resume running on the"
+                            " secondary node)"),
+                make_option("--cleanup", dest="cleanup",
+                            default=False, action="store_true",
+                            help="Instead of performing the migration, try to"
+                            " recover from a failed cleanup. This is safe"
+                            " to run even if the instance is healthy, but it"
+                            " will create extra replication traffic and "
+                            " disrupt briefly the replication (like during the"
+                            " migration"),
+                ],
+               "[-f] <instance>",
+               "Migrate instance to its secondary node"
+               " (only for instances of type drbd)"),
   'info': (ShowInstanceConfig, ARGS_ANY,
            [DEBUG_OPT,
             make_option("-s", "--static", dest="static",
-- 
GitLab