Cluster.hs 63.2 KB
Newer Older
Iustin Pop's avatar
Iustin Pop committed
1 2 3
{-| Implementation of cluster-wide logic.

This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
Iustin Pop's avatar
Iustin Pop committed
5 6 7

-}

Iustin Pop's avatar
Iustin Pop committed
8 9
{-

10
Copyright (C) 2009, 2010, 2011 Google Inc.
Iustin Pop's avatar
Iustin Pop committed
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.

-}

29
module Ganeti.HTools.Cluster
Iustin Pop's avatar
Iustin Pop committed
30 31
    (
     -- * Types
32
      AllocSolution(..)
33
    , EvacSolution(..)
Iustin Pop's avatar
Iustin Pop committed
34
    , Table(..)
35
    , CStats(..)
36
    , AllocStats
Iustin Pop's avatar
Iustin Pop committed
37
    , AllocResult
38
    , AllocMethod
Iustin Pop's avatar
Iustin Pop committed
39 40
    -- * Generic functions
    , totalResources
41
    , computeAllocationDelta
Iustin Pop's avatar
Iustin Pop committed
42 43 44
    -- * First phase functions
    , computeBadItems
    -- * Second phase functions
45
    , printSolutionLine
46
    , formatCmds
47 48
    , involvedNodes
    , splitJobs
49 50 51
    -- * Display functions
    , printNodes
    , printInsts
Iustin Pop's avatar
Iustin Pop committed
52 53
    -- * Balacing functions
    , checkMove
54
    , doNextBalance
55
    , tryBalance
Iustin Pop's avatar
Iustin Pop committed
56
    , compCV
57
    , compCVNodes
58
    , compDetailedCV
Iustin Pop's avatar
Iustin Pop committed
59
    , printStats
60
    , iMoveToJob
61
    -- * IAllocator functions
62
    , genAllocNodes
63
    , tryAlloc
64
    , tryMGAlloc
65
    , tryReloc
66
    , tryNodeEvac
67
    , tryChangeGroup
68
    , collapseFailures
69 70 71
    -- * Allocation functions
    , iterateAlloc
    , tieredAlloc
72
     -- * Node group functions
73 74
    , instanceGroup
    , findSplitInstances
75
    , splitCluster
Iustin Pop's avatar
Iustin Pop committed
76 77
    ) where

78
import qualified Data.IntSet as IntSet
Iustin Pop's avatar
Iustin Pop committed
79
import Data.List
80
import Data.Maybe (fromJust, isNothing)
81
import Data.Ord (comparing)
Iustin Pop's avatar
Iustin Pop committed
82 83
import Text.Printf (printf)

84 85 86
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Node as Node
87
import qualified Ganeti.HTools.Group as Group
Iustin Pop's avatar
Iustin Pop committed
88
import Ganeti.HTools.Types
89
import Ganeti.HTools.Utils
90
import Ganeti.HTools.Compat
91
import qualified Ganeti.OpCodes as OpCodes
Iustin Pop's avatar
Iustin Pop committed
92

Iustin Pop's avatar
Iustin Pop committed
93 94
-- * Types

Iustin Pop's avatar
Iustin Pop committed
95
-- | Allocation\/relocation solution.
96
data AllocSolution = AllocSolution
97 98 99 100
  { asFailures :: [FailMode]              -- ^ Failure counts
  , asAllocs   :: Int                     -- ^ Good allocation count
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
  , asLog      :: [String]                -- ^ Informational messages
101 102
  }

103 104 105 106
-- | Node evacuation/group change iallocator result type. This result
-- type consists of actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti.
data EvacSolution = EvacSolution
107 108
    { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
    , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
109
                                        -- relocated
110
    , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
111 112
    }

Iustin Pop's avatar
Iustin Pop committed
113 114 115 116
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])

117
-- | A type denoting the valid allocation mode/pairs.
118
--
119 120 121 122 123 124
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx',
-- ['Ndx'])]@. In the latter case, the list is basically an
-- association list, grouped by primary node and holding the potential
-- secondary nodes in the sub-list.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
125

126
-- | The empty solution we start with when computing allocations.
127 128
emptyAllocSolution :: AllocSolution
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
129
                                   , asSolution = Nothing, asLog = [] }
130

131 132 133 134 135 136 137
-- | The empty evac solution.
emptyEvacSolution :: EvacSolution
emptyEvacSolution = EvacSolution { esMoved = []
                                 , esFailed = []
                                 , esOpCodes = []
                                 }

138
-- | The complete state for the balancing solution.
139
data Table = Table Node.List Instance.List Score [Placement]
140
             deriving (Show, Read)
Iustin Pop's avatar
Iustin Pop committed
141

Iustin Pop's avatar
Iustin Pop committed
142
-- | Cluster statistics data type.
Iustin Pop's avatar
Iustin Pop committed
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
data CStats = CStats { csFmem :: Integer -- ^ Cluster free mem
                     , csFdsk :: Integer -- ^ Cluster free disk
                     , csAmem :: Integer -- ^ Cluster allocatable mem
                     , csAdsk :: Integer -- ^ Cluster allocatable disk
                     , csAcpu :: Integer -- ^ Cluster allocatable cpus
                     , csMmem :: Integer -- ^ Max node allocatable mem
                     , csMdsk :: Integer -- ^ Max node allocatable disk
                     , csMcpu :: Integer -- ^ Max node allocatable cpu
                     , csImem :: Integer -- ^ Instance used mem
                     , csIdsk :: Integer -- ^ Instance used disk
                     , csIcpu :: Integer -- ^ Instance used cpu
                     , csTmem :: Double  -- ^ Cluster total mem
                     , csTdsk :: Double  -- ^ Cluster total disk
                     , csTcpu :: Double  -- ^ Cluster total cpus
                     , csVcpu :: Integer -- ^ Cluster virtual cpus (if
                                         -- node pCpu has been set,
                                         -- otherwise -1)
                     , csXmem :: Integer -- ^ Unnacounted for mem
                     , csNmem :: Integer -- ^ Node own memory
                     , csScore :: Score  -- ^ The cluster score
                     , csNinst :: Int    -- ^ The total number of instances
164
                     }
165
            deriving (Show, Read)
166

167
-- | Currently used, possibly to allocate, unallocable.
168 169
type AllocStats = (RSpec, RSpec, RSpec)

170 171 172 173 174 175 176 177 178 179
-- | A simple type for allocation functions.
type AllocMethod =  Node.List           -- ^ Node list
                 -> Instance.List       -- ^ Instance list
                 -> Maybe Int           -- ^ Optional allocation limit
                 -> Instance.Instance   -- ^ Instance spec for allocation
                 -> AllocNodes          -- ^ Which nodes we should allocate on
                 -> [Instance.Instance] -- ^ Allocated instances
                 -> [CStats]            -- ^ Running cluster stats
                 -> Result AllocResult  -- ^ Allocation result

Iustin Pop's avatar
Iustin Pop committed
180 181
-- * Utility functions

Iustin Pop's avatar
Iustin Pop committed
182 183
-- | Verifies the N+1 status and return the affected nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
Iustin Pop's avatar
Iustin Pop committed
184
verifyN1 = filter Node.failN1
Iustin Pop's avatar
Iustin Pop committed
185

Iustin Pop's avatar
Iustin Pop committed
186 187 188 189 190 191 192 193 194 195
{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
196
  let bad_nodes = verifyN1 $ getOnline nl
197
      bad_instances = map (`Container.find` il) .
Iustin Pop's avatar
Iustin Pop committed
198
                      sort . nub $
199
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
Iustin Pop's avatar
Iustin Pop committed
200 201 202
  in
    (bad_nodes, bad_instances)

203
-- | Zero-initializer for the CStats type.
204
emptyCStats :: CStats
205
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
206

207
-- | Update stats with data from a new node.
208 209
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
210 211 212 213 214
    let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
                 csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
                 csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
                 csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
                 csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
215
                 csVcpu = x_vcpu,
216
                 csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
Iustin Pop's avatar
Iustin Pop committed
217
               }
218
            = cs
219
        inc_amem = Node.fMem node - Node.rMem node
220
        inc_amem' = if inc_amem > 0 then inc_amem else 0
221
        inc_adsk = Node.availDisk node
222 223 224 225
        inc_imem = truncate (Node.tMem node) - Node.nMem node
                   - Node.xMem node - Node.fMem node
        inc_icpu = Node.uCpu node
        inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
226
        inc_vcpu = Node.hiCpu node
227
        inc_acpu = Node.availCpu node
Iustin Pop's avatar
Iustin Pop committed
228

Iustin Pop's avatar
Iustin Pop committed
229 230 231 232 233 234 235 236 237 238 239
    in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
          , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
          , csAmem = x_amem + fromIntegral inc_amem'
          , csAdsk = x_adsk + fromIntegral inc_adsk
          , csAcpu = x_acpu + fromIntegral inc_acpu
          , csMmem = max x_mmem (fromIntegral inc_amem')
          , csMdsk = max x_mdsk (fromIntegral inc_adsk)
          , csMcpu = max x_mcpu (fromIntegral inc_acpu)
          , csImem = x_imem + fromIntegral inc_imem
          , csIdsk = x_idsk + fromIntegral inc_idsk
          , csIcpu = x_icpu + fromIntegral inc_icpu
240 241 242
          , csTmem = x_tmem + Node.tMem node
          , csTdsk = x_tdsk + Node.tDsk node
          , csTcpu = x_tcpu + Node.tCpu node
Iustin Pop's avatar
Iustin Pop committed
243 244 245
          , csVcpu = x_vcpu + fromIntegral inc_vcpu
          , csXmem = x_xmem + fromIntegral (Node.xMem node)
          , csNmem = x_nmem + fromIntegral (Node.nMem node)
246
          , csNinst = x_ninst + length (Node.pList node)
247
          }
248

Iustin Pop's avatar
Iustin Pop committed
249
-- | Compute the total free disk and memory in the cluster.
250
totalResources :: Node.List -> CStats
251 252
totalResources nl =
    let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
253
    in cs { csScore = compCV nl }
Iustin Pop's avatar
Iustin Pop committed
254

255 256 257
-- | Compute the delta between two cluster state.
--
-- This is used when doing allocations, to understand better the
Iustin Pop's avatar
Iustin Pop committed
258 259 260
-- available cluster resources. The return value is a triple of the
-- current used values, the delta that was still allocated, and what
-- was left unallocated.
261 262 263 264 265
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
    let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu} = cini
        CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
                csTmem = t_mem, csTdsk = t_dsk, csVcpu = v_cpu } = cfin
Iustin Pop's avatar
Iustin Pop committed
266 267 268 269 270 271 272 273
        rini = RSpec (fromIntegral i_icpu) (fromIntegral i_imem)
               (fromIntegral i_idsk)
        rfin = RSpec (fromIntegral (f_icpu - i_icpu))
               (fromIntegral (f_imem - i_imem))
               (fromIntegral (f_idsk - i_idsk))
        un_cpu = fromIntegral (v_cpu - f_icpu)::Int
        runa = RSpec un_cpu (truncate t_mem - fromIntegral f_imem)
               (truncate t_dsk - fromIntegral f_idsk)
274 275
    in (rini, rfin, runa)

276
-- | The names and weights of the individual elements in the CV list.
Iustin Pop's avatar
Iustin Pop committed
277 278 279 280 281 282 283 284 285 286 287 288
detailedCVInfo :: [(Double, String)]
detailedCVInfo = [ (1,  "free_mem_cv")
                 , (1,  "free_disk_cv")
                 , (1,  "n1_cnt")
                 , (1,  "reserved_mem_cv")
                 , (4,  "offline_all_cnt")
                 , (16, "offline_pri_cnt")
                 , (1,  "vcpu_ratio_cv")
                 , (1,  "cpu_load_cv")
                 , (1,  "mem_load_cv")
                 , (1,  "disk_load_cv")
                 , (1,  "net_load_cv")
Iustin Pop's avatar
Iustin Pop committed
289
                 , (2,  "pri_tags_score")
Iustin Pop's avatar
Iustin Pop committed
290 291
                 ]

Iustin Pop's avatar
Iustin Pop committed
292
-- | Holds the weights used by 'compCVNodes' for each metric.
Iustin Pop's avatar
Iustin Pop committed
293 294
detailedCVWeights :: [Double]
detailedCVWeights = map fst detailedCVInfo
295

Iustin Pop's avatar
Iustin Pop committed
296
-- | Compute the mem and disk covariance.
297 298
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV all_nodes =
Iustin Pop's avatar
Iustin Pop committed
299 300
    let
        (offline, nodes) = partition Node.offline all_nodes
301 302
        mem_l = map Node.pMem nodes
        dsk_l = map Node.pDsk nodes
303
        -- metric: memory covariance
Iustin Pop's avatar
Iustin Pop committed
304
        mem_cv = stdDev mem_l
305
        -- metric: disk covariance
Iustin Pop's avatar
Iustin Pop committed
306
        dsk_cv = stdDev dsk_l
307 308 309 310
        -- metric: count of instances living on N1 failing nodes
        n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                                   length (Node.pList n)) .
                   filter Node.failN1 $ nodes :: Double
311
        res_l = map Node.pRem nodes
312
        -- metric: reserved memory covariance
Iustin Pop's avatar
Iustin Pop committed
313
        res_cv = stdDev res_l
314 315 316 317 318
        -- offline instances metrics
        offline_ipri = sum . map (length . Node.pList) $ offline
        offline_isec = sum . map (length . Node.sList) $ offline
        -- metric: count of instances on offline nodes
        off_score = fromIntegral (offline_ipri + offline_isec)::Double
319 320 321 322
        -- metric: count of primary instances on offline nodes (this
        -- helps with evacuation/failover of primary instances on
        -- 2-node clusters with one node offline)
        off_pri_score = fromIntegral offline_ipri::Double
323
        cpu_l = map Node.pCpu nodes
324
        -- metric: covariance of vcpu/pcpu ratio
Iustin Pop's avatar
Iustin Pop committed
325
        cpu_cv = stdDev cpu_l
326
        -- metrics: covariance of cpu, memory, disk and network load
327 328 329 330 331 332
        (c_load, m_load, d_load, n_load) = unzip4 $
            map (\n ->
                     let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                         DynUtil c2 m2 d2 n2 = Node.utilPool n
                     in (c1/c2, m1/m2, d1/d2, n1/n2)
                ) nodes
333 334 335
        -- metric: conflicting instance count
        pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
        pri_tags_score = fromIntegral pri_tags_inst::Double
336
    in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
Iustin Pop's avatar
Iustin Pop committed
337
       , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
338
       , pri_tags_score ]
Iustin Pop's avatar
Iustin Pop committed
339 340

-- | Compute the /total/ variance.
341 342 343 344
compCVNodes :: [Node.Node] -> Double
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV

-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
Iustin Pop's avatar
Iustin Pop committed
345
compCV :: Node.List -> Double
346 347
compCV = compCVNodes . Container.elems

348
-- | Compute online nodes from a 'Node.List'.
349 350 351
getOnline :: Node.List -> [Node.Node]
getOnline = filter (not . Node.offline) . Container.elems

352
-- * Balancing functions
Iustin Pop's avatar
Iustin Pop committed
353 354 355 356 357 358 359

-- | Compute best table. Note that the ordering of the arguments is important.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
    if a_cv > b_cv then b else a

-- | Applies an instance move to a given node list and instance.
360
applyMove :: Node.List -> Instance.Instance
361
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
Iustin Pop's avatar
Iustin Pop committed
362
-- Failover (f)
Iustin Pop's avatar
Iustin Pop committed
363
applyMove nl inst Failover =
364 365
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
Iustin Pop's avatar
Iustin Pop committed
366 367 368 369
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
370
        force_p = Node.offline old_p
371
        new_nl = do -- Maybe monad
372
          new_p <- Node.addPriEx force_p int_s inst
373
          new_s <- Node.addSec int_p inst old_sdx
374 375 376 377
          let new_inst = Instance.setBoth inst old_sdx old_pdx
          return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                  new_inst, old_sdx, old_pdx)
    in new_nl
Iustin Pop's avatar
Iustin Pop committed
378

Iustin Pop's avatar
Iustin Pop committed
379
-- Replace the primary (f:, r:np, f)
Iustin Pop's avatar
Iustin Pop committed
380
applyMove nl inst (ReplacePrimary new_pdx) =
381 382
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
Iustin Pop's avatar
Iustin Pop committed
383 384 385 386 387
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
388
        force_p = Node.offline old_p
389
        new_nl = do -- Maybe monad
390 391
          -- check that the current secondary can host the instance
          -- during the migration
392
          tmp_s <- Node.addPriEx force_p int_s inst
393
          let tmp_s' = Node.removePri tmp_s inst
394 395
          new_p <- Node.addPriEx force_p tgt_n inst
          new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
396 397 398 399 400
          let new_inst = Instance.setPri inst new_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx int_p old_sdx new_s nl,
                  new_inst, new_pdx, old_sdx)
    in new_nl
Iustin Pop's avatar
Iustin Pop committed
401

Iustin Pop's avatar
Iustin Pop committed
402
-- Replace the secondary (r:ns)
Iustin Pop's avatar
Iustin Pop committed
403
applyMove nl inst (ReplaceSecondary new_sdx) =
404 405
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
Iustin Pop's avatar
Iustin Pop committed
406 407 408
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_s = Node.removeSec old_s inst
409
        force_s = Node.offline old_s
410
        new_inst = Instance.setSec inst new_sdx
411
        new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
412 413 414 415
                 \new_s -> return (Container.addTwo new_sdx
                                   new_s old_sdx int_s nl,
                                   new_inst, old_pdx, new_sdx)
    in new_nl
Iustin Pop's avatar
Iustin Pop committed
416

Iustin Pop's avatar
Iustin Pop committed
417
-- Replace the secondary and failover (r:np, f)
Iustin Pop's avatar
Iustin Pop committed
418
applyMove nl inst (ReplaceAndFailover new_pdx) =
419 420
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
Iustin Pop's avatar
Iustin Pop committed
421 422 423 424 425
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_pdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
426
        force_s = Node.offline old_s
427 428
        new_nl = do -- Maybe monad
          new_p <- Node.addPri tgt_n inst
429
          new_s <- Node.addSecEx force_s int_p inst new_pdx
430 431 432 433 434
          let new_inst = Instance.setBoth inst new_pdx old_pdx
          return (Container.add new_pdx new_p $
                  Container.addTwo old_pdx new_s old_sdx int_s nl,
                  new_inst, new_pdx, old_pdx)
    in new_nl
Iustin Pop's avatar
Iustin Pop committed
435

Iustin Pop's avatar
Iustin Pop committed
436 437
-- Failver and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
438 439
    let old_pdx = Instance.pNode inst
        old_sdx = Instance.sNode inst
Iustin Pop's avatar
Iustin Pop committed
440 441 442 443 444
        old_p = Container.find old_pdx nl
        old_s = Container.find old_sdx nl
        tgt_n = Container.find new_sdx nl
        int_p = Node.removePri old_p inst
        int_s = Node.removeSec old_s inst
445
        force_p = Node.offline old_p
446
        new_nl = do -- Maybe monad
447 448
          new_p <- Node.addPriEx force_p int_s inst
          new_s <- Node.addSecEx force_p tgt_n inst old_sdx
449 450 451 452 453
          let new_inst = Instance.setBoth inst old_sdx new_sdx
          return (Container.add new_sdx new_s $
                  Container.addTwo old_sdx new_p old_pdx int_p nl,
                  new_inst, old_sdx, new_sdx)
    in new_nl
Iustin Pop's avatar
Iustin Pop committed
454

Iustin Pop's avatar
Iustin Pop committed
455
-- | Tries to allocate an instance on one given node.
456
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
457
                 -> OpResult Node.AllocElement
458 459
allocateOnSingle nl inst new_pdx =
    let p = Container.find new_pdx nl
460
        new_inst = Instance.setBoth inst new_pdx Node.noSecondary
461 462 463 464
    in  Node.addPri p inst >>= \new_p -> do
      let new_nl = Container.add new_pdx new_p nl
          new_score = compCV nl
      return (new_nl, new_inst, [new_p], new_score)
465

Iustin Pop's avatar
Iustin Pop committed
466
-- | Tries to allocate an instance on a given pair of nodes.
467
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
468
               -> OpResult Node.AllocElement
469 470 471
allocateOnPair nl inst new_pdx new_sdx =
    let tgt_p = Container.find new_pdx nl
        tgt_s = Container.find new_sdx nl
472 473 474 475 476 477
    in do
      new_p <- Node.addPri tgt_p inst
      new_s <- Node.addSec tgt_s inst new_pdx
      let new_inst = Instance.setBoth inst new_pdx new_sdx
          new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
      return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
478

Iustin Pop's avatar
Iustin Pop committed
479 480
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
Iustin Pop's avatar
Iustin Pop committed
481 482 483 484 485 486 487 488
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
    let
        Table ini_nl ini_il _ ini_plc = ini_tbl
489
        tmp_resu = applyMove ini_nl target move
Iustin Pop's avatar
Iustin Pop committed
490
    in
491
      case tmp_resu of
492
        OpFail _ -> cur_tbl
493
        OpGood (upd_nl, new_inst, pri_idx, sec_idx) ->
494 495 496
            let tgt_idx = Instance.idx target
                upd_cvar = compCV upd_nl
                upd_il = Container.add tgt_idx new_inst ini_il
497
                upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
498 499 500
                upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
            in
              compareTables cur_tbl upd_tbl
Iustin Pop's avatar
Iustin Pop committed
501

502 503 504 505
-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- a instance.
possibleMoves :: Bool      -- ^ Whether the secondary node is a valid new node
506
              -> Bool      -- ^ Whether we can change the primary node
507 508
              -> Ndx       -- ^ Target node candidate
              -> [IMove]   -- ^ List of valid result moves
509 510 511 512 513

possibleMoves _ False tdx =
    [ReplaceSecondary tdx]

possibleMoves True True tdx =
514 515 516 517 518
    [ReplaceSecondary tdx,
     ReplaceAndFailover tdx,
     ReplacePrimary tdx,
     FailoverAndReplace tdx]

519
possibleMoves False True tdx =
520 521 522 523
    [ReplaceSecondary tdx,
     ReplaceAndFailover tdx]

-- | Compute the best move for a given instance.
524 525
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
526
                  -> Bool              -- ^ Whether instance moves are allowed
527 528 529
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
530
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
Iustin Pop's avatar
Iustin Pop committed
531
    let
532 533
        opdx = Instance.pNode target
        osdx = Instance.sNode target
534
        nodes = filter (\idx -> idx /= opdx && idx /= osdx) nodes_idx
535
        use_secondary = elem osdx nodes_idx && inst_moves
536 537 538
        aft_failover = if use_secondary -- if allowed to failover
                       then checkSingleStep ini_tbl target ini_tbl Failover
                       else ini_tbl
539
        all_moves = if disk_moves
540 541
                    then concatMap
                         (possibleMoves use_secondary inst_moves) nodes
542
                    else []
Iustin Pop's avatar
Iustin Pop committed
543 544
    in
      -- iterate over the possible nodes for this instance
545
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
Iustin Pop's avatar
Iustin Pop committed
546

Iustin Pop's avatar
Iustin Pop committed
547
-- | Compute the best next move.
548
checkMove :: [Ndx]               -- ^ Allowed target node indices
549
          -> Bool                -- ^ Whether disk moves are allowed
550
          -> Bool                -- ^ Whether instance moves are allowed
551
          -> Table               -- ^ The current solution
Iustin Pop's avatar
Iustin Pop committed
552
          -> [Instance.Instance] -- ^ List of instances still to move
553
          -> Table               -- ^ The new solution
554
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
Iustin Pop's avatar
Iustin Pop committed
555
    let Table _ _ _ ini_plc = ini_tbl
556 557 558 559 560
        -- we're using rwhnf from the Control.Parallel.Strategies
        -- package; we don't need to use rnf as that would force too
        -- much evaluation in single-threaded cases, and in
        -- multi-threaded case the weak head normal form is enough to
        -- spark the evaluation
561 562
        tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
                               inst_moves ini_tbl)
563
                 victims
Iustin Pop's avatar
Iustin Pop committed
564
        -- iterate over all instances, computing the best move
565
        best_tbl = foldl' compareTables ini_tbl tables
Iustin Pop's avatar
Iustin Pop committed
566
        Table _ _ _ best_plc = best_tbl
567 568 569
    in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
       else best_tbl
Iustin Pop's avatar
Iustin Pop committed
570

571
-- | Check if we are allowed to go deeper in the balancing.
572 573 574 575
doNextBalance :: Table     -- ^ The starting table
              -> Int       -- ^ Remaining length
              -> Score     -- ^ Score at which to stop
              -> Bool      -- ^ The resulting table and commands
576 577 578 579 580
doNextBalance ini_tbl max_rounds min_score =
    let Table _ _ ini_cv ini_plc = ini_tbl
        ini_plc_len = length ini_plc
    in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score

581
-- | Run a balance move.
582 583
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
584
           -> Bool        -- ^ Allow instance moves
Iustin Pop's avatar
Iustin Pop committed
585
           -> Bool        -- ^ Only evacuate moves
586 587
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
588
           -> Maybe Table -- ^ The resulting table and commands
589
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
590 591
    let Table ini_nl ini_il ini_cv _ = ini_tbl
        all_inst = Container.elems ini_il
Iustin Pop's avatar
Iustin Pop committed
592 593 594
        all_inst' = if evac_mode
                    then let bad_nodes = map Node.idx . filter Node.offline $
                                         Container.elems ini_nl
595
                         in filter (any (`elem` bad_nodes) . Instance.allNodes)
Iustin Pop's avatar
Iustin Pop committed
596 597
                            all_inst
                    else all_inst
598
        reloc_inst = filter Instance.movable all_inst'
599 600
        node_idx = map Node.idx . filter (not . Node.offline) $
                   Container.elems ini_nl
601
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
602
        (Table _ _ fin_cv _) = fin_tbl
603
    in
604
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
605
      then Just fin_tbl -- this round made success, return the new table
606 607
      else Nothing

608 609
-- * Allocation functions

610
-- | Build failure stats out of a list of failures.
611 612
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
613 614
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
            [minBound..maxBound]
615

616 617 618 619 620 621 622 623 624
-- | Compares two Maybe AllocElement and chooses the besst score.
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a Nothing = a
bestAllocElement Nothing b = b
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
    if ascore < bscore then a else b

625
-- | Update current Allocation solution and failure stats with new
626
-- elements.
627
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
628
concatAllocs as (OpFail reason) = as { asFailures = reason : asFailures as }
629

630
concatAllocs as (OpGood ns) =
631
    let -- Choose the old or new solution, based on the cluster score
632
        cntok = asAllocs as
633
        osols = asSolution as
634
        nsols = bestAllocElement osols (Just ns)
Iustin Pop's avatar
Iustin Pop committed
635
        nsuc = cntok + 1
636 637 638 639
    -- Note: we force evaluation of nsols here in order to keep the
    -- memory profile low - we know that we will need nsols for sure
    -- in the next cycle, so we force evaluation of nsols, since the
    -- foldl' in the caller will only evaluate the tuple, but not the
Iustin Pop's avatar
Iustin Pop committed
640
    -- elements of the tuple
641
    in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
642

643 644 645 646 647 648 649 650 651 652 653 654 655
-- | Sums two 'AllocSolution' structures.
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
          (AllocSolution bFails bAllocs bSols bLog) =
    -- note: we add b first, since usually it will be smaller; when
    -- fold'ing, a will grow and grow whereas b is the per-group
    -- result, hence smaller
    let nFails  = bFails ++ aFails
        nAllocs = aAllocs + bAllocs
        nSols   = bestAllocElement aSols bSols
        nLog    = bLog ++ aLog
    in AllocSolution nFails nAllocs nSols nLog

656
-- | Given a solution, generates a reasonable description for it.
657 658 659
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
660
      sols = asSolution as
661 662 663
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
664 665 666 667 668 669 670
  in case sols of
     Nothing -> "No valid allocation solutions, failure reasons: " ++
                (if null fcnt then "unknown reasons" else freasons)
     Just (_, _, nodes, cv) ->
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
               (intercalate "/" . map Node.name $ nodes)
671

672
-- | Annotates a solution with the appropriate string.
673 674 675
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution as = as { asLog = describeSolution as : asLog as }

676 677 678 679 680 681 682 683
-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution (EvacSolution f m o) =
    EvacSolution (reverse f) (reverse m) (reverse o)

684
-- | Generate the valid node allocation singles or pairs for a new instance.
685 686
genAllocNodes :: Group.List        -- ^ Group list
              -> Node.List         -- ^ The node map
687
              -> Int               -- ^ The number of nodes required
688 689
              -> Bool              -- ^ Whether to drop or not
                                   -- unallocable nodes
690
              -> Result AllocNodes -- ^ The (monadic) result
691 692
genAllocNodes gl nl count drop_unalloc =
    let filter_fn = if drop_unalloc
693 694
                    then filter (Group.isAllocable .
                                 flip Container.find gl . Node.group)
695 696
                    else id
        all_nodes = filter_fn $ getOnline nl
697 698 699 700 701
        all_pairs = [(Node.idx p,
                      [Node.idx s | s <- all_nodes,
                                         Node.idx p /= Node.idx s,
                                         Node.group p == Node.group s]) |
                     p <- all_nodes]
702
    in case count of
703
         1 -> Ok (Left (map Node.idx all_nodes))
704
         2 -> Ok (Right (filter (not . null . snd) all_pairs))
705 706
         _ -> Bad "Unsupported number of nodes, only one or two  supported"

707 708 709 710 711
-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
712
         -> AllocNodes        -- ^ The allocation targets
713
         -> m AllocSolution   -- ^ Possible solution list
Iustin Pop's avatar
Iustin Pop committed
714
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
715
tryAlloc nl _ inst (Right ok_pairs) =
716 717 718 719 720
    let psols = parMap rwhnf (\(p, ss) ->
                                  foldl' (\cstate ->
                                          concatAllocs cstate .
                                          allocateOnPair nl inst p)
                                  emptyAllocSolution ss) ok_pairs
721
        sols = foldl' sumAllocs emptyAllocSolution psols
Iustin Pop's avatar
Iustin Pop committed
722
    in return $ annotateSolution sols
723

Iustin Pop's avatar
Iustin Pop committed
724
tryAlloc _  _ _    (Left []) = fail "No online nodes"
725 726
tryAlloc nl _ inst (Left all_nodes) =
    let sols = foldl' (\cstate ->
Iustin Pop's avatar
Iustin Pop committed
727
                           concatAllocs cstate . allocateOnSingle nl inst
728
                      ) emptyAllocSolution all_nodes
Iustin Pop's avatar
Iustin Pop committed
729
    in return $ annotateSolution sols
730

731
-- | Given a group/result, describe it as a nice (list of) messages.
732 733
solutionDescription :: Group.List -> (Gdx, Result AllocSolution) -> [String]
solutionDescription gl (groupId, result) =
734
  case result of
735
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
736
    Bad message -> [printf "Group %s: error %s" gname message]
737 738
  where grp = Container.find groupId gl
        gname = Group.name grp
739
        pol = allocPolicyToString (Group.allocPolicy grp)
740 741

-- | From a list of possibly bad and possibly empty solutions, filter
742
-- only the groups with a valid result. Note that the result will be
743
-- reversed compared to the original list.
744 745 746
filterMGResults :: Group.List
                -> [(Gdx, Result AllocSolution)]
                -> [(Gdx, AllocSolution)]
747 748 749 750 751
filterMGResults gl = foldl' fn []
    where unallocable = not . Group.isAllocable . flip Container.find gl
          fn accu (gdx, rasol) =
              case rasol of
                Bad _ -> accu
752
                Ok sol | isNothing (asSolution sol) -> accu
753 754
                       | unallocable gdx -> accu
                       | otherwise -> (gdx, sol):accu
755

756
-- | Sort multigroup results based on policy and score.
757 758 759 760
sortMGResults :: Group.List
             -> [(Gdx, AllocSolution)]
             -> [(Gdx, AllocSolution)]
sortMGResults gl sols =
761
    let extractScore (_, _, _, x) = x
762
        solScore (gdx, sol) = (Group.allocPolicy (Container.find gdx gl),
763
                               (extractScore . fromJust . asSolution) sol)
764 765
    in sortBy (comparing solScore) sols

Iustin Pop's avatar
Iustin Pop committed
766
-- | Finds the best group for an instance on a multi-group cluster.
767 768 769 770 771
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
Iustin Pop's avatar
Iustin Pop committed
772 773 774
findBestAllocGroup :: Group.List           -- ^ The group list
                   -> Node.List            -- ^ The node list
                   -> Instance.List        -- ^ The instance list
775
                   -> Maybe [Gdx]          -- ^ The allowed groups
Iustin Pop's avatar
Iustin Pop committed
776 777 778
                   -> Instance.Instance    -- ^ The instance to allocate
                   -> Int                  -- ^ Required number of nodes
                   -> Result (Gdx, AllocSolution, [String])
779
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
780
  let groups = splitCluster mgnl mgil
781 782
      groups' = maybe groups (\gs -> filter ((`elem` gs) . fst) groups)
                allowed_gdxs
783
      sols = map (\(gid, (nl, il)) ->
784 785
                   (gid, genAllocNodes mggl nl cnt False >>=
                       tryAlloc nl il inst))
786
             groups'::[(Gdx, Result AllocSolution)]
787
      all_msgs = concatMap (solutionDescription mggl) sols
788 789
      goodSols = filterMGResults mggl sols
      sortedSols = sortMGResults mggl goodSols
790 791 792
  in if null sortedSols
     then Bad $ intercalate ", " all_msgs
     else let (final_group, final_sol) = head sortedSols
Iustin Pop's avatar
Iustin Pop committed
793 794 795 796 797 798 799 800 801 802 803
          in return (final_group, final_sol, all_msgs)

-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt = do
  (best_group, solution, all_msgs) <-
804
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
Iustin Pop's avatar
Iustin Pop committed
805 806 807
  let group_name = Group.name $ Container.find best_group mggl