Commit 266aea94 authored by Iustin Pop's avatar Iustin Pop
Browse files

Remove hn1 and related code

hn1 was deprecated for a while and this patch removes it altogether. The
support code in Cluster.hs is also removed.
parent 78ecfa8f
......@@ -13,7 +13,6 @@ TAGS
......@@ -31,9 +31,7 @@ module Ganeti.HTools.Cluster
-- * Types
, AllocSolution
, Solution(..)
, Table(..)
, Removal
, Score
, IMove(..)
, CStats(..)
......@@ -42,8 +40,6 @@ module Ganeti.HTools.Cluster
-- * First phase functions
, computeBadItems
-- * Second phase functions
, computeSolution
, applySolution
, printSolution
, printSolutionLine
, formatCmds
......@@ -83,14 +79,6 @@ type Placement = (Idx, Ndx, Ndx, Score)
-- | Allocation\/relocation solution.
type AllocSolution = [(Maybe Node.List, Instance.Instance, [Node.Node])]
-- | A cluster solution described as the solution delta and the list
-- of placements.
--
-- The derived 'Ord' instance compares the delta first and only then
-- the placement list, so the minimum of two solutions is the one with
-- the smaller delta.
data Solution = Solution Int [Placement]
    deriving (Eq, Ord, Show)
-- | A removal set: a node list paired with a list of instances.
-- 'checkRemoval' builds it from the node list obtained after removing
-- the instances, together with those removed instances.
data Removal = Removal Node.List [Instance.Instance]
-- | An instance move definition
data IMove = Failover -- ^ Failover the instance (f)
| ReplacePrimary Ndx -- ^ Replace primary (f, r:np, f)
......@@ -115,24 +103,6 @@ data CStats = CStats { cs_fmem :: Int -- ^ Cluster free mem
-- * Utility functions
-- | Extracts the delta of a solution; a missing solution ('Nothing')
-- is reported as -1.
solutionDelta :: Maybe Solution -> Int
solutionDelta Nothing = -1
solutionDelta (Just (Solution delta _)) = delta
-- | Cap the removal list if needed.
--
-- A positive @max_removals@ keeps at most that many leading entries;
-- zero or a negative value means \"no cap\" and returns the list
-- unchanged.
capRemovals :: [a] -> Int -> [a]
capRemovals removals max_removals
  | max_removals > 0 = take max_removals removals
  | otherwise        = removals
-- | Tells whether at least one node of the list is flagged by
-- 'Node.failN1'.
verifyN1Check :: [Node.Node] -> Bool
verifyN1Check = any Node.failN1
-- | Collects, in their original order, the nodes of the list that are
-- flagged by 'Node.failN1'.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 nodes = [node | node <- nodes, Node.failN1 node]
......@@ -224,260 +194,6 @@ compCV nl =
-- | Lists the nodes of the container that are not marked offline.
getOnline :: Node.List -> [Node.Node]
getOnline nl = [node | node <- Container.elems nl, not (Node.offline node)]
-- * hn1 functions
-- | Places an instance on the given primary and secondary nodes and
-- returns the updated node list.
--
-- The result is 'Nothing' as soon as either 'Node.addPri' or
-- 'Node.addSec' yields 'Nothing'; the primary is tried first.
addInstance :: Node.List -> Instance.Instance ->
               Node.Node -> Node.Node -> Maybe Node.List
addInstance nl idata pri sec = do
  newPri <- Node.addPri pri idata
  newSec <- Node.addSec sec idata priIdx
  return (Container.addTwo secIdx newSec priIdx newPri nl)
  where
    priIdx = Node.idx pri
    secIdx = Node.idx sec
-- | Remove an instance and return the new node list.
--
-- The instance is dropped from its current primary node (via
-- 'Node.removePri') and from its current secondary node (via
-- 'Node.removeSec'); both updated nodes are then written back into
-- the node list.
removeInstance :: Node.List -> Instance.Instance -> Node.List
removeInstance nl idata =
  let pnode = Instance.pnode idata
      snode = Instance.snode idata
      pn = Container.find pnode nl
      sn = Container.find snode nl
  in Container.addTwo
       pnode (Node.removePri pn idata)
       snode (Node.removeSec sn idata) nl
-- | Removes every instance of the list from the node list, one after
-- the other (strict left fold over 'removeInstance'), and returns the
-- resulting node list.
removeInstances :: Node.List -> [Instance.Instance] -> Node.List
removeInstances nl instances = foldl' removeInstance nl instances
{-| Compute a new version of a cluster given a solution.

This is not used for computing the solutions, but for applying a
(known-good) solution to the original cluster for final display.

It first removes the relocated instances after which it places them on
their new nodes. Since the solution is expected to be valid, a
placement that cannot be applied aborts with an error.

-}
applySolution :: Node.List -> Instance.List -> [Placement] -> Node.List
applySolution nl il sol =
  let -- Resolve each placement to (instance, new primary index,
      -- new secondary index); the score component is not needed here.
      odxes = map (\ (a, b, c, _) -> (Container.find a il,
                                      Node.idx (Container.find b nl),
                                      Node.idx (Container.find c nl))
                  ) sol
      idxes = map (\ (x, _, _) -> x) odxes
      -- Node list with all relocated instances taken off their old nodes.
      nc = removeInstances nl idxes
  in foldl' (\ nz (a, b, c) ->
               let new_p = Container.find b nz
                   new_s = Container.find c nz
               in case addInstance nz a new_p new_s of
                    Just nz' -> nz'
                    Nothing ->
                      error "applySolution: placement does not fit the cluster"
            ) nc odxes
-- ** First phase functions
{-| Given a list 1,2,3..n build a list of pairs [(1, [2..n]), (2,
[3..n]), ...]

Each element is paired with the elements that follow it. Generation
stops as soon as the remaining list holds fewer than @count@ elements,
so every emitted pair still leaves enough items to pick @count@ of
them (the head included).

-}
genParts :: [a] -> Int -> [(a, [a])]
genParts l count =
  case l of
    [] -> []
    x:xs
      | length l < count -> []
      | otherwise -> (x, xs) : genParts xs count
-- | Generates combinations of count items from the names list.
--
-- Every combination is built by consing the chosen item onto the
-- partial result, so the items of a combination appear in reverse
-- order of selection. A count of zero yields a single empty
-- combination.
genNames :: Int -> [b] -> [[b]]
genNames count1 names1 =
  let aux_fn count names current =
        case count of
          0 -> [current]
          _ ->
            concatMap
            (\ (x, xs) -> aux_fn (count - 1) xs (x:current))
            (genParts names count)
  in aux_fn count1 names1 []
{-| Checks if removal of instances results in N+1 pass.

Returns the resulting 'Removal' (node list after removal plus the
removed instances) when no node fails the N+1 check any more, and
'Nothing' otherwise.

Note: the check removal cannot optimize by scanning only the affected
nodes, since the cluster is known to be not healthy; only the check
placement can make this shortcut.

-}
checkRemoval :: Node.List -> [Instance.Instance] -> Maybe Removal
checkRemoval nl victims =
  let nx = removeInstances nl victims
      failN1 = verifyN1Check (Container.elems nx)
  in if failN1
       then Nothing
       else Just $ Removal nx victims
-- | Builds the removal candidates for a given depth: every
-- combination of @depth@ bad instances (as produced by 'genNames') is
-- run through 'checkRemoval', keeping one result per combination.
computeRemovals :: Node.List
                -> [Instance.Instance]
                -> Int
                -> [Maybe Removal]
computeRemovals nl bad_instances depth =
  [ checkRemoval nl victims | victims <- genNames depth bad_instances ]
-- ** Second phase functions
-- | Single-node relocation cost.
--
-- Placing an instance on node @i@ costs nothing when @i@ is already
-- one of the instance's current nodes (primary @p@ or secondary @s@),
-- and one unit otherwise.
nodeDelta :: Ndx -> Ndx -> Ndx -> Int
nodeDelta i p s
  | i == p || i == s = 0
  | otherwise        = 1
-- | Picks the better of two optional solutions.
--
-- When only one side holds a solution, that side wins; when both do,
-- the smaller one according to the 'Ord' instance of 'Solution' is
-- returned.
compareSolutions :: Maybe Solution -> Maybe Solution -> Maybe Solution
compareSolutions Nothing right = right
compareSolutions left Nothing = left
compareSolutions left right = min left right
-- | Check if a given delta is worse than an existing solution.
--
-- A delta is rejected outright when a maximum is configured
-- (@max_delta@ is non-negative) and the delta exceeds it. Otherwise it
-- is rejected only if a solution already exists whose delta is not
-- larger than the new one.
tooHighDelta :: Maybe Solution -> Int -> Int -> Bool
tooHighDelta sol new_delta max_delta
  | new_delta > max_delta && max_delta >= 0 = True
  | otherwise =
      case sol of
        Nothing -> False
        Just (Solution old_delta _) -> old_delta <= new_delta
{-| Check if placement of instances still keeps the cluster N+1 compliant.

This is the workhorse of the allocation algorithm: given the
current node and instance maps, the list of instances to be
placed, and the current solution, this will return the best of all
possible solutions by recursing until all target instances are placed.

For the first instance still to place, every (primary, secondary) node
pair is tried. A pair is skipped when its delta is already too high
(see 'tooHighDelta'), when primary and secondary are the same node, or
when either node refuses the instance. The best solution found so far
is threaded through both folds, so it also prunes later candidates.

The list of instances to place must not be empty.

-}
checkPlacement :: Node.List            -- ^ The current node list
               -> [Instance.Instance]  -- ^ List of instances still to place
               -> [Placement]          -- ^ Partial solution until now
               -> Int                  -- ^ The delta of the partial solution
               -> Maybe Solution       -- ^ The previous solution
               -> Int                  -- ^ Abort if we go above this delta
               -> Maybe Solution       -- ^ The new solution
checkPlacement nl victims current current_delta prev_sol max_delta =
  let target = head victims
      opdx = Instance.pnode target
      osdx = Instance.snode target
      vtail = tail victims
      have_tail = not (null vtail)
      nodes = Container.elems nl
      iidx = Instance.idx target
  in
    -- Outer fold: candidate primary nodes.
    foldl'
    (\ accu_p pri ->
       let pri_idx = Node.idx pri
           upri_delta = current_delta + nodeDelta pri_idx opdx osdx
           new_pri = Node.addPri pri target
           fail_delta1 = tooHighDelta accu_p upri_delta max_delta
       in
         if fail_delta1 || isNothing new_pri then accu_p
         else
           let pri_nl = Container.add pri_idx (fromJust new_pri) nl in
           -- Inner fold: candidate secondary nodes for this primary.
           foldl'
           (\ accu sec ->
              let sec_idx = Node.idx sec
                  upd_delta = upri_delta +
                              nodeDelta sec_idx opdx osdx
                  fail_delta2 = tooHighDelta accu upd_delta max_delta
                  new_sec = Node.addSec sec target pri_idx
              in
                if sec_idx == pri_idx || fail_delta2 ||
                   isNothing new_sec then accu
                else
                  let nx = Container.add sec_idx (fromJust new_sec) pri_nl
                      upd_cv = compCV nx
                      plc = (iidx, pri_idx, sec_idx, upd_cv)
                      c2 = plc:current
                      result =
                        if have_tail then
                          -- More instances to place: recurse on the
                          -- updated node list.
                          checkPlacement nx vtail c2 upd_delta
                                         accu max_delta
                        else
                          Just (Solution upd_delta c2)
                  in compareSolutions accu result
           ) accu_p nodes
    ) prev_sol nodes
{-| Auxiliary function for solution computation.

We write this in an explicit recursive fashion in order to control
early-abort in case we have met the min delta. We can't use foldr
instead of explicit recursion since we need the accumulator for the
abort decision.

Removal candidates that are 'Nothing' are skipped. For every real
removal the best placement is computed; if its delta is valid
(non-negative) and does not exceed the minimum delta parameter, the
search stops immediately with that solution.

-}
advanceSolution :: [Maybe Removal] -- ^ The removal to process
                -> Int             -- ^ Minimum delta parameter
                -> Int             -- ^ Maximum delta parameter
                -> Maybe Solution  -- ^ Current best solution
                -> Maybe Solution  -- ^ New best solution
advanceSolution [] _ _ sol = sol
advanceSolution (Nothing:xs) m n sol = advanceSolution xs m n sol
advanceSolution ((Just (Removal nx removed)):xs) min_d max_d prev_sol =
  let new_sol = checkPlacement nx removed [] 0 prev_sol max_d
      new_delta = solutionDelta $! new_sol
  in if new_delta >= 0 && new_delta <= min_d
       then new_sol
       else advanceSolution xs min_d max_d new_sol
-- | Searches the given removal candidates for the best placement
-- solution, starting the search without any previous solution.
solutionFromRemovals :: [Maybe Removal] -- ^ The list of (possible) removals
                     -> Int             -- ^ Minimum delta parameter
                     -> Int             -- ^ Maximum delta parameter
                     -> Maybe Solution  -- ^ The best solution found
solutionFromRemovals removals min_delta max_delta =
  advanceSolution removals min_delta max_delta noSolution
  where
    -- Nothing has been found before the search starts.
    noSolution = Nothing
{-| Computes the solution at the given depth.

This is a wrapper over both computeRemovals and
solutionFromRemovals. In case we have no solution, we return Nothing.

The removal candidates are capped with 'capRemovals' before the
placement search starts.

-}
computeSolution :: Node.List           -- ^ The original node data
                -> [Instance.Instance] -- ^ The list of /bad/ instances
                -> Int       -- ^ The /depth/ of removals
                -> Int       -- ^ Maximum number of removals to process
                -> Int       -- ^ Minimum delta parameter
                -> Int       -- ^ Maximum delta parameter
                -> Maybe Solution -- ^ The best solution found (or Nothing)
computeSolution nl bad_instances depth max_removals min_delta max_delta =
  let removals = computeRemovals nl bad_instances depth
      removals' = capRemovals removals max_removals
  in solutionFromRemovals removals' min_delta max_delta
-- * hbal functions
-- | Compute best table. Note that the ordering of the arguments is important.
HPROGS = hbal hn1 hscan hail hspace
HPROGS = hbal hscan hail hspace
HSRCS := $(wildcard Ganeti/HTools/*.hs)
HDDIR = apidoc
......@@ -37,18 +37,6 @@ becomes better. We stop when no further move can improve the score.
For algorithm details and usage, see the man page hbal(1).
Cluster N+1 solver
This program runs a very simple brute force algorithm over the instance
placement space in order to determine the smallest number of replace-disks
needed to fix the cluster. Note this means we won't get a balanced cluster,
just one that passes N+1 checks.
For algorithm details and usage, see the man page hn1(1).
.. note:: This program is deprecated, hbal should be used instead.
IAllocator plugin
......@@ -72,12 +60,12 @@ checks). For more details, see the man page hspace(1).
Integration with Ganeti
The ``hbal``, ``hspace`` and ``hn1`` programs can either get their
input from text files, or online from a cluster via RAPI. For online
collection via RAPI, the "-m" argument to both hn1 and hbal should
specify the cluster or master node name. ``hail`` uses the standard
iallocator API and thus doesn't need any special setup (just needs to
be installed in the right directory).
The ``hbal`` and ``hspace`` programs can either get their input from
text files, or online from a cluster via RAPI. For online collection
via RAPI, the "-m" argument to both hbal and hspace should specify the
cluster or master node name. ``hail`` uses the standard iallocator API
and thus doesn't need any special setup (just needs to be installed in
the right directory).
For generating the text files, a separate tool (``hscan``) is provided
to automate their gathering if RAPI is available, which is better
......@@ -43,8 +43,8 @@ The exit status of the command will be zero, unless for some reason
the algorithm fatally failed (e.g. wrong node or instance data).
.BR hn1 "(1), " hscan "(1), " ganeti "(7), " gnt-instance "(8), "
.BR gnt-node "(8)"
.BR hbal "(1), " hspace "(1), " hscan "(1), " ganeti "(7), "
.BR gnt-instance "(8), " gnt-node "(8)"
......@@ -545,8 +545,8 @@ changed in a way that the program will output a different solution
list (but hopefully will end in the same state).
.BR hn1 "(1), " hscan "(1), " ganeti "(7), " gnt-instance "(8), "
.BR gnt-node "(8)"
.BR hspace "(1), " hscan "(1), " hail "(1), "
.BR ganeti "(7), " gnt-instance "(8), " gnt-node "(8)"
.TH HN1 1 2009-03-23 htools "Ganeti H-tools"
hn1 \- N+1 fixer for Ganeti
.B hn1
.B "[-C]"
.B "[-p]"
.B "[-o]"
.BI "[ -m " cluster "]"
.BI "[-n " nodes-file " ]"
.BI "[ -i " instances-file "]"
.BI "[-d " depth "]"
.BI "[-r " max-removals "]"
.BI "[-L " max-delta "]"
.BI "[-l " min-delta "]"
.B hn1
.B --version
hn1 is a cluster N+1 fixer that tries to compute the minimum number of
moves needed for getting all nodes to be N+1 compliant.
The algorithm is designed to be a 'perfect' algorithm, so that we
always examine the entire solution space until we find the minimum
solution. The algorithm can be tweaked via the \fB-d\fR, \fB-r\fR,
\fB-L\fR and \fB-l\fR options.
By default, the program will show the solution in a somewhat cryptic
format; for getting the actual Ganeti command list, use the \fB-C\fR
option.
\fBNote:\fR this program is somewhat deprecated; \fBhbal(1)\fR usually
gives much faster results, and a better cluster. It is recommended
to use this program only when \fBhbal\fR doesn't give a N+1 compliant
cluster.
The algorithm works in multiple rounds, of increasing \fIdepth\fR,
until we have a solution.
First, before starting the solution computation, we compute all the
N+1-fail nodes and the instances they hold. These instances are
candidate for replacement (and only these!).
The program then starts with \fIdepth\fR one (unless overridden via the
\fB-d\fR option), and at each round:
.RS 4
.TP 3
it tries to remove from the cluster as many instances as the current
depth in order to make the cluster N+1 compliant
then, for each of the possible instance combinations that allow this
(unless the total size is reduced via the \fB-r\fR option), it tries
to put them back on the cluster while maintaining N+1 compliance
It might be that at a given round, the results are:
.RS 4
.TP 3
no instance combination that can be put back; this means it is not
possible to make the cluster N+1 compliant with this number of
instances being moved, so we increase the depth and go on to the next
round
one or more successful result, in which case we take the one that has
as few changes as possible (by change meaning a replace-disks needed)
The main problem with the algorithm is that, being an exhaustive
search, the CPU time required grows very very quickly based on
depth. On a 20-node, 80-instances cluster, depths up to 5-6 are
quickly computed, and depth 10 could already take days.
The main factors that influence the run time are:
.RS 4
.TP 3
the removal depth; each time the depth increases by one, we grow
the solution space by the number of nodes squared (since a new
instance can live on any two nodes as primary/secondary, therefore
(almost) N times N); i.e., depth=1 will create a N^2 solution space,
depth two will make this N^4, depth three will be N^6, etc.
the removal depth again; for each increase in the depth, there will be
more valid removal sets, and the space of solutions increases linearly
with the number of removal sets
Therefore, the smaller the depth the faster the algorithm will be; it doesn't
seem like this algorithm will work for clusters of 100 nodes and many many
small instances (e.g. 256MB instances on 16GB nodes).
As an optimisation, since the algorithm is designed to prune the
search space as quickly as possible, if by luck we find a good
solution early at a given depth, then the other solutions which would
result in a bigger delta (the number of changes) will not be
investigated, and the program will finish fast. Since this is random
and depends on where in the full solution space the good solution will
be, there are two options for cutting down the time needed:
.RS 4
.TP 3
\fB-l\fR makes any solution that has delta lower than its parameter
succeed instantly; the default value for this parameter is zero, so
once we find a "perfect" solution we finish early
\fB-L\fR causes any solution with a delta higher than its parameter to
be rejected instantly (without descending further into the search
tree); this can reduce the depth of the search tree, with sometimes
significant speedups; by default, this optimization is not used
The algorithm also has some other internal optimisations:
.RS 4
.TP 3
when choosing where to place an instance in phase two, there are
N*(N-1) possible primary/secondary options; however, if instead of
iterating over all p * s pairs, we first determine the set of primary
nodes that can hold this instance (without failing N+1), we can cut
(N-1) secondary placements for each primary node removed; and since
this applies at every iteration of phase 2 it linearly decreases the
solution space, and on full clusters, this can mean a four- to
five-fold reduction of the solution space
since the number of solutions is very high even for smaller depths (on
the test data, depth=4 results in 1.8M solutions) we can't compare
them at the end, so at each iteration in phase 2 we only promote the
best solution out of our own set of solutions
since the placement of instances can only increase the delta of the
solution (placing a new instance will add zero or more replace-disks
steps), it means the delta will only increase while recursing during
phase 2; therefore, if we know at one point that we have a current
delta that is equal to or higher than the delta of the best solution so
far, we can abort the recursion; this cuts a tremendous number of
branches; further promotion of the best solution from one removal set
to another can cut entire removal sets after a few recursions
The options that can be passed to the program are as follows:
.B -C, --print-commands
Print the command list at the end of the run. Without this, the
program will only show a shorter, but cryptic output.
.B -p, --print-nodes
Prints the before and after node status, in a format designed to allow
the user to understand the node's most important parameters.
The node list will contain the following information:
.B F
a character denoting the status of the node, with '-' meaning an
offline node, '*' meaning N+1 failure and blank meaning a good node
.B Name
the node name
.B t_mem
the total node memory
.B n_mem
the memory used by the node itself
.B i_mem
the memory used by instances
.B x_mem
amount memory which seems to be in use but cannot be determined why or
by which instance; usually this means that the hypervisor has some
overhead or that there are other reporting errors
.B f_mem
the free node memory
.B r_mem
the reserved node memory, which is the amount of free memory needed
for N+1 compliance
.B t_dsk
total disk
.B f_dsk
free disk
.B pcpu
the number of physical cpus on the node
.B vcpu
the number of virtual cpus allocated to primary instances
.B pri
number of primary instances
.B sec
number of secondary instances
.B p_fmem
percent of free memory
.B p_fdsk
percent of free disk
.B r_cpu
ratio of virtual to physical cpus
.BI "-n" nodefile ", --nodes=" nodefile
The name of the file holding node information (if not collecting via
RAPI), instead of the default \fInodes\fR file (but see below how to
customize the default value via the environment).
.BI "-i" instancefile ", --instances=" instancefile
The name of the file holding instance information (if not collecting
via RAPI), instead of the default \fIinstances\fR file (but see below
how to customize the default value via the environment).
.BI "-m" cluster
Collect data not from files but directly from the
.I cluster
given as an argument via RAPI. If the argument doesn't contain a colon
(:), then it is converted into a fully-built URL via prepending
https:// and appending the default RAPI port, otherwise it's
considered a fully-specified URL and is used unchanged.