Cluster.hs 68.9 KB
Newer Older
Iustin Pop's avatar
Iustin Pop committed
1
2
3
{-| Implementation of cluster-wide logic.

This module holds all pure cluster-logic; I\/O related functionality
4
goes into the /Main/ module for the individual binaries.
Iustin Pop's avatar
Iustin Pop committed
5
6
7

-}

Iustin Pop's avatar
Iustin Pop committed
8
9
{-

Iustin Pop's avatar
Iustin Pop committed
10
Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
Iustin Pop's avatar
Iustin Pop committed
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.

-}

29
module Ganeti.HTools.Cluster
30
31
32
33
34
35
  (
    -- * Types
    AllocSolution(..)
  , EvacSolution(..)
  , Table(..)
  , CStats(..)
36
  , AllocNodes
37
38
  , AllocResult
  , AllocMethod
René Nussbaumer's avatar
René Nussbaumer committed
39
  , AllocSolutionList
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
  -- * Generic functions
  , totalResources
  , computeAllocationDelta
  -- * First phase functions
  , computeBadItems
  -- * Second phase functions
  , printSolutionLine
  , formatCmds
  , involvedNodes
  , splitJobs
  -- * Display functions
  , printNodes
  , printInsts
  -- * Balacing functions
  , checkMove
  , doNextBalance
  , tryBalance
  , compCV
  , compCVNodes
  , compDetailedCV
  , printStats
  , iMoveToJob
  -- * IAllocator functions
  , genAllocNodes
  , tryAlloc
  , tryMGAlloc
  , tryNodeEvac
  , tryChangeGroup
  , collapseFailures
René Nussbaumer's avatar
René Nussbaumer committed
69
  , allocList
70
71
72
73
74
75
76
77
  -- * Allocation functions
  , iterateAlloc
  , tieredAlloc
  -- * Node group functions
  , instanceGroup
  , findSplitInstances
  , splitCluster
  ) where
Iustin Pop's avatar
Iustin Pop committed
78

79
import Control.Applicative (liftA2)
80
import qualified Data.IntSet as IntSet
Iustin Pop's avatar
Iustin Pop committed
81
import Data.List
82
import Data.Maybe (fromJust, fromMaybe, isJust, isNothing)
83
import Data.Ord (comparing)
Iustin Pop's avatar
Iustin Pop committed
84
85
import Text.Printf (printf)

86
import Ganeti.BasicTypes
87
88
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
89
import qualified Ganeti.HTools.Nic as Nic
90
import qualified Ganeti.HTools.Node as Node
91
import qualified Ganeti.HTools.Group as Group
Iustin Pop's avatar
Iustin Pop committed
92
import Ganeti.HTools.Types
93
import Ganeti.Compat
94
import qualified Ganeti.OpCodes as OpCodes
95
import Ganeti.Utils
96
import Ganeti.Types (mkNonEmpty)
Iustin Pop's avatar
Iustin Pop committed
97

Iustin Pop's avatar
Iustin Pop committed
98
99
-- * Types

Iustin Pop's avatar
Iustin Pop committed
100
-- | Allocation\/relocation solution.
101
data AllocSolution = AllocSolution
102
103
104
105
  { asFailures :: [FailMode]              -- ^ Failure counts
  , asAllocs   :: Int                     -- ^ Good allocation count
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
  , asLog      :: [String]                -- ^ Informational messages
106
107
  }

108
109
110
111
-- | Node evacuation/group change iallocator result type. This result
-- type consists of actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti.
data EvacSolution = EvacSolution
112
113
114
115
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
                                      -- relocated
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs
Iustin Pop's avatar
Iustin Pop committed
116
  } deriving (Show)
117

Iustin Pop's avatar
Iustin Pop committed
118
119
120
121
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])

René Nussbaumer's avatar
René Nussbaumer committed
122
123
124
-- | Type alias for easier handling.
type AllocSolutionList = [(Instance.Instance, AllocSolution)]

125
-- | A type denoting the valid allocation mode/pairs.
126
--
127
128
129
130
131
132
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx',
-- ['Ndx'])]@. In the latter case, the list is basically an
-- association list, grouped by primary node and holding the potential
-- secondary nodes in the sub-list.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
133

134
-- | The empty solution we start with when computing allocations.
135
136
emptyAllocSolution :: AllocSolution
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
137
                                   , asSolution = Nothing, asLog = [] }
138

139
140
141
142
143
144
145
-- | The empty evac solution.
emptyEvacSolution :: EvacSolution
emptyEvacSolution = EvacSolution { esMoved = []
                                 , esFailed = []
                                 , esOpCodes = []
                                 }

146
-- | The complete state for the balancing solution.
147
data Table = Table Node.List Instance.List Score [Placement]
148
             deriving (Show)
Iustin Pop's avatar
Iustin Pop committed
149

Iustin Pop's avatar
Iustin Pop committed
150
-- | Cluster statistics data type.
Iustin Pop's avatar
Iustin Pop committed
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
data CStats = CStats
  { csFmem :: Integer -- ^ Cluster free mem
  , csFdsk :: Integer -- ^ Cluster free disk
  , csAmem :: Integer -- ^ Cluster allocatable mem
  , csAdsk :: Integer -- ^ Cluster allocatable disk
  , csAcpu :: Integer -- ^ Cluster allocatable cpus
  , csMmem :: Integer -- ^ Max node allocatable mem
  , csMdsk :: Integer -- ^ Max node allocatable disk
  , csMcpu :: Integer -- ^ Max node allocatable cpu
  , csImem :: Integer -- ^ Instance used mem
  , csIdsk :: Integer -- ^ Instance used disk
  , csIcpu :: Integer -- ^ Instance used cpu
  , csTmem :: Double  -- ^ Cluster total mem
  , csTdsk :: Double  -- ^ Cluster total disk
  , csTcpu :: Double  -- ^ Cluster total cpus
166
167
168
  , csVcpu :: Integer -- ^ Cluster total virtual cpus
  , csNcpu :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
                      -- physical CPUs, i.e. normalised used phys CPUs
Iustin Pop's avatar
Iustin Pop committed
169
170
171
172
  , csXmem :: Integer -- ^ Unnacounted for mem
  , csNmem :: Integer -- ^ Node own memory
  , csScore :: Score  -- ^ The cluster score
  , csNinst :: Int    -- ^ The total number of instances
173
  } deriving (Show)
174

175
176
177
178
179
180
181
182
183
184
-- | A simple type for allocation functions.
type AllocMethod =  Node.List           -- ^ Node list
                 -> Instance.List       -- ^ Instance list
                 -> Maybe Int           -- ^ Optional allocation limit
                 -> Instance.Instance   -- ^ Instance spec for allocation
                 -> AllocNodes          -- ^ Which nodes we should allocate on
                 -> [Instance.Instance] -- ^ Allocated instances
                 -> [CStats]            -- ^ Running cluster stats
                 -> Result AllocResult  -- ^ Allocation result

185
186
187
188
-- | A simple type for the running solution of evacuations.
type EvacInnerState =
  Either String (Node.List, Instance.Instance, Score, Ndx)

Iustin Pop's avatar
Iustin Pop committed
189
190
-- * Utility functions

Iustin Pop's avatar
Iustin Pop committed
191
192
-- | Verifies the N+1 status and return the affected nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
Iustin Pop's avatar
Iustin Pop committed
193
verifyN1 = filter Node.failN1
Iustin Pop's avatar
Iustin Pop committed
194

Iustin Pop's avatar
Iustin Pop committed
195
196
197
198
199
200
201
202
203
204
{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
205
  let bad_nodes = verifyN1 $ getOnline nl
206
      bad_instances = map (`Container.find` il) .
Iustin Pop's avatar
Iustin Pop committed
207
                      sort . nub $
208
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
Iustin Pop's avatar
Iustin Pop committed
209
210
211
  in
    (bad_nodes, bad_instances)

212
213
214
215
216
217
218
219
220
221
222
223
-- | Extracts the node pairs for an instance. This can fail if the
-- instance is single-homed. FIXME: this needs to be improved,
-- together with the general enhancement for handling non-DRBD moves.
instanceNodes :: Node.List -> Instance.Instance ->
                 (Ndx, Ndx, Node.Node, Node.Node)
instanceNodes nl inst =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
  in (old_pdx, old_sdx, old_p, old_s)

224
-- | Zero-initializer for the CStats type.
225
emptyCStats :: CStats
226
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
227

228
-- | Update stats with data from a new node.
229
230
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
231
232
233
234
235
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
236
               csVcpu = x_vcpu, csNcpu = x_ncpu,
237
238
239
240
241
242
243
244
245
246
247
248
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst
             }
        = cs
      inc_amem = Node.fMem node - Node.rMem node
      inc_amem' = if inc_amem > 0 then inc_amem else 0
      inc_adsk = Node.availDisk node
      inc_imem = truncate (Node.tMem node) - Node.nMem node
                 - Node.xMem node - Node.fMem node
      inc_icpu = Node.uCpu node
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
      inc_vcpu = Node.hiCpu node
      inc_acpu = Node.availCpu node
249
250
      inc_ncpu = fromIntegral (Node.uCpu node) /
                 iPolicyVcpuRatio (Node.iPolicy node)
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
        , csAmem = x_amem + fromIntegral inc_amem'
        , csAdsk = x_adsk + fromIntegral inc_adsk
        , csAcpu = x_acpu + fromIntegral inc_acpu
        , csMmem = max x_mmem (fromIntegral inc_amem')
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
        , csImem = x_imem + fromIntegral inc_imem
        , csIdsk = x_idsk + fromIntegral inc_idsk
        , csIcpu = x_icpu + fromIntegral inc_icpu
        , csTmem = x_tmem + Node.tMem node
        , csTdsk = x_tdsk + Node.tDsk node
        , csTcpu = x_tcpu + Node.tCpu node
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
266
        , csNcpu = x_ncpu + inc_ncpu
267
268
269
270
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
        , csNinst = x_ninst + length (Node.pList node)
        }
271

Iustin Pop's avatar
Iustin Pop committed
272
-- | Compute the total free disk and memory in the cluster.
273
totalResources :: Node.List -> CStats
274
totalResources nl =
275
276
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
  in cs { csScore = compCV nl }
Iustin Pop's avatar
Iustin Pop committed
277

278
279
280
-- | Compute the delta between two cluster state.
--
-- This is used when doing allocations, to understand better the
Iustin Pop's avatar
Iustin Pop committed
281
282
283
-- available cluster resources. The return value is a triple of the
-- current used values, the delta that was still allocated, and what
-- was left unallocated.
284
285
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
286
287
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
              csNcpu = i_ncpu } = cini
288
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
              csNcpu = f_ncpu, csTcpu = f_tcpu } = cfin
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
                       , allocInfoNCpus = i_ncpu
                       , allocInfoMem   = fromIntegral i_imem
                       , allocInfoDisk  = fromIntegral i_idsk
                       }
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
                       , allocInfoNCpus = f_ncpu - i_ncpu
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
                       }
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
                       , allocInfoNCpus = f_tcpu - f_ncpu
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
                       }
306
  in (rini, rfin, runa)
307

308
-- | The names and weights of the individual elements in the CV list.
Iustin Pop's avatar
Iustin Pop committed
309
310
311
312
313
314
315
316
317
318
319
320
detailedCVInfo :: [(Double, String)]
detailedCVInfo = [ (1,  "free_mem_cv")
                 , (1,  "free_disk_cv")
                 , (1,  "n1_cnt")
                 , (1,  "reserved_mem_cv")
                 , (4,  "offline_all_cnt")
                 , (16, "offline_pri_cnt")
                 , (1,  "vcpu_ratio_cv")
                 , (1,  "cpu_load_cv")
                 , (1,  "mem_load_cv")
                 , (1,  "disk_load_cv")
                 , (1,  "net_load_cv")
Iustin Pop's avatar
Iustin Pop committed
321
                 , (2,  "pri_tags_score")
322
                 , (1,  "spindles_cv")
Iustin Pop's avatar
Iustin Pop committed
323
324
                 ]

Iustin Pop's avatar
Iustin Pop committed
325
-- | Holds the weights used by 'compCVNodes' for each metric.
Iustin Pop's avatar
Iustin Pop committed
326
327
detailedCVWeights :: [Double]
detailedCVWeights = map fst detailedCVInfo
328

Iustin Pop's avatar
Iustin Pop committed
329
-- | Compute the mem and disk covariance.
330
331
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV all_nodes =
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
  let (offline, nodes) = partition Node.offline all_nodes
      mem_l = map Node.pMem nodes
      dsk_l = map Node.pDsk nodes
      -- metric: memory covariance
      mem_cv = stdDev mem_l
      -- metric: disk covariance
      dsk_cv = stdDev dsk_l
      -- metric: count of instances living on N1 failing nodes
      n1_score = fromIntegral . sum . map (\n -> length (Node.sList n) +
                                                 length (Node.pList n)) .
                 filter Node.failN1 $ nodes :: Double
      res_l = map Node.pRem nodes
      -- metric: reserved memory covariance
      res_cv = stdDev res_l
      -- offline instances metrics
      offline_ipri = sum . map (length . Node.pList) $ offline
      offline_isec = sum . map (length . Node.sList) $ offline
      -- metric: count of instances on offline nodes
      off_score = fromIntegral (offline_ipri + offline_isec)::Double
      -- metric: count of primary instances on offline nodes (this
      -- helps with evacuation/failover of primary instances on
      -- 2-node clusters with one node offline)
      off_pri_score = fromIntegral offline_ipri::Double
      cpu_l = map Node.pCpu nodes
      -- metric: covariance of vcpu/pcpu ratio
      cpu_cv = stdDev cpu_l
      -- metrics: covariance of cpu, memory, disk and network load
      (c_load, m_load, d_load, n_load) =
        unzip4 $ map (\n ->
                      let DynUtil c1 m1 d1 n1 = Node.utilLoad n
                          DynUtil c2 m2 d2 n2 = Node.utilPool n
                      in (c1/c2, m1/m2, d1/d2, n1/n2)) nodes
      -- metric: conflicting instance count
      pri_tags_inst = sum $ map Node.conflictingPrimaries nodes
      pri_tags_score = fromIntegral pri_tags_inst::Double
367
368
      -- metric: spindles %
      spindles_cv = map (\n -> Node.instSpindles n / Node.hiSpindles n) nodes
369
370
  in [ mem_cv, dsk_cv, n1_score, res_cv, off_score, off_pri_score, cpu_cv
     , stdDev c_load, stdDev m_load , stdDev d_load, stdDev n_load
371
     , pri_tags_score, stdDev spindles_cv ]
Iustin Pop's avatar
Iustin Pop committed
372
373

-- | Compute the /total/ variance.
374
375
376
377
compCVNodes :: [Node.Node] -> Double
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV

-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
Iustin Pop's avatar
Iustin Pop committed
378
compCV :: Node.List -> Double
379
380
compCV = compCVNodes . Container.elems

381
-- | Compute online nodes from a 'Node.List'.
382
383
384
getOnline :: Node.List -> [Node.Node]
getOnline = filter (not . Node.offline) . Container.elems

385
-- * Balancing functions
Iustin Pop's avatar
Iustin Pop committed
386
387
388
389

-- | Compute best table. Note that the ordering of the arguments is important.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
390
  if a_cv > b_cv then b else a
Iustin Pop's avatar
Iustin Pop committed
391
392

-- | Applies an instance move to a given node list and instance.
393
applyMove :: Node.List -> Instance.Instance
394
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
Iustin Pop's avatar
Iustin Pop committed
395
-- Failover (f)
Iustin Pop's avatar
Iustin Pop committed
396
applyMove nl inst Failover =
397
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
398
399
400
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      new_nl = do -- Maybe monad
Iustin Pop's avatar
Iustin Pop committed
401
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
402
403
404
405
406
        new_s <- Node.addSec int_p inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx old_pdx
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                new_inst, old_sdx, old_pdx)
  in new_nl
Iustin Pop's avatar
Iustin Pop committed
407

408
409
410
411
412
413
414
415
416
417
418
-- Failover to any (fa)
applyMove nl inst (FailoverToAny new_pdx) = do
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
      new_pnode = Container.find new_pdx nl
      force_failover = Node.offline old_pnode
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
  let old_pnode' = Node.removePri old_pnode inst
      inst' = Instance.setPri inst new_pdx
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
  return (nl', inst', new_pdx, old_sdx)

Iustin Pop's avatar
Iustin Pop committed
419
-- Replace the primary (f:, r:np, f)
Iustin Pop's avatar
Iustin Pop committed
420
applyMove nl inst (ReplacePrimary new_pdx) =
421
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
                  -- check that the current secondary can host the instance
                  -- during the migration
        tmp_s <- Node.addPriEx force_p int_s inst
        let tmp_s' = Node.removePri tmp_s inst
        new_p <- Node.addPriEx force_p tgt_n inst
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
        let new_inst = Instance.setPri inst new_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx int_p old_sdx new_s nl,
                new_inst, new_pdx, old_sdx)
  in new_nl
Iustin Pop's avatar
Iustin Pop committed
438

Iustin Pop's avatar
Iustin Pop committed
439
-- Replace the secondary (r:ns)
Iustin Pop's avatar
Iustin Pop committed
440
applyMove nl inst (ReplaceSecondary new_sdx) =
441
442
443
444
445
446
447
448
449
450
451
452
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_inst = Instance.setSec inst new_sdx
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
               \new_s -> return (Container.addTwo new_sdx
                                 new_s old_sdx int_s nl,
                                 new_inst, old_pdx, new_sdx)
  in new_nl
Iustin Pop's avatar
Iustin Pop committed
453

Iustin Pop's avatar
Iustin Pop committed
454
-- Replace the secondary and failover (r:np, f)
Iustin Pop's avatar
Iustin Pop committed
455
applyMove nl inst (ReplaceAndFailover new_pdx) =
456
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
457
458
459
460
461
462
463
464
465
466
467
468
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_nl = do -- Maybe monad
        new_p <- Node.addPri tgt_n inst
        new_s <- Node.addSecEx force_s int_p inst new_pdx
        let new_inst = Instance.setBoth inst new_pdx old_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx new_s old_sdx int_s nl,
                new_inst, new_pdx, old_pdx)
  in new_nl
Iustin Pop's avatar
Iustin Pop committed
469

Iustin Pop's avatar
Iustin Pop committed
470
471
-- Failver and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
472
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
473
474
475
476
477
478
479
480
481
482
483
484
      tgt_n = Container.find new_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        new_p <- Node.addPriEx force_p int_s inst
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx new_sdx
        return (Container.add new_sdx new_s $
                Container.addTwo old_sdx new_p old_pdx int_p nl,
                new_inst, old_sdx, new_sdx)
  in new_nl
Iustin Pop's avatar
Iustin Pop committed
485

Iustin Pop's avatar
Iustin Pop committed
486
-- | Tries to allocate an instance on one given node.
487
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
488
                 -> OpResult Node.AllocElement
489
allocateOnSingle nl inst new_pdx =
490
491
  let p = Container.find new_pdx nl
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
492
493
494
  in do
    Instance.instMatchesPolicy inst (Node.iPolicy p)
    new_p <- Node.addPri p inst
495
    let new_nl = Container.add new_pdx new_p nl
496
        new_score = compCV new_nl
497
    return (new_nl, new_inst, [new_p], new_score)
498

Iustin Pop's avatar
Iustin Pop committed
499
-- | Tries to allocate an instance on a given pair of nodes.
500
allocateOnPair :: Node.List -> Instance.Instance -> Ndx -> Ndx
501
               -> OpResult Node.AllocElement
502
allocateOnPair nl inst new_pdx new_sdx =
503
504
505
  let tgt_p = Container.find new_pdx nl
      tgt_s = Container.find new_sdx nl
  in do
506
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
507
508
509
510
511
    new_p <- Node.addPri tgt_p inst
    new_s <- Node.addSec tgt_s inst new_pdx
    let new_inst = Instance.setBoth inst new_pdx new_sdx
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
    return (new_nl, new_inst, [new_p, new_s], compCV new_nl)
512

Iustin Pop's avatar
Iustin Pop committed
513
514
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
Iustin Pop's avatar
Iustin Pop committed
515
516
517
518
519
520
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
521
522
523
  let Table ini_nl ini_il _ ini_plc = ini_tbl
      tmp_resu = applyMove ini_nl target move
  in case tmp_resu of
Iustin Pop's avatar
Iustin Pop committed
524
525
       Bad _ -> cur_tbl
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
526
527
528
529
530
531
         let tgt_idx = Instance.idx target
             upd_cvar = compCV upd_nl
             upd_il = Container.add tgt_idx new_inst ini_il
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
         in compareTables cur_tbl upd_tbl
Iustin Pop's avatar
Iustin Pop committed
532

533
534
535
-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- a instance.
536
537
538
539
540
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
              -> Bool       -- ^ Whether the secondary node is a valid new node
              -> Bool       -- ^ Whether we can change the primary node
              -> Ndx        -- ^ Target node candidate
              -> [IMove]    -- ^ List of valid result moves
541

542
possibleMoves MirrorNone _ _ _ = []
543

544
545
546
547
possibleMoves MirrorExternal _ False _ = []

possibleMoves MirrorExternal _ True tdx =
  [ FailoverToAny tdx ]
548
549
550
551
552

possibleMoves MirrorInternal _ False tdx =
  [ ReplaceSecondary tdx ]

possibleMoves MirrorInternal True True tdx =
553
554
555
556
557
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  , ReplacePrimary tdx
  , FailoverAndReplace tdx
  ]
558

559
possibleMoves MirrorInternal False True tdx =
560
561
562
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  ]
563
564

-- | Compute the best move for a given instance.
565
566
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
567
                  -> Bool              -- ^ Whether instance moves are allowed
568
569
570
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
571
checkInstanceMove nodes_idx disk_moves inst_moves ini_tbl target =
572
573
  let opdx = Instance.pNode target
      osdx = Instance.sNode target
Iustin Pop's avatar
Iustin Pop committed
574
575
      bad_nodes = [opdx, osdx]
      nodes = filter (`notElem` bad_nodes) nodes_idx
576
      mir_type = Instance.mirrorType target
577
      use_secondary = elem osdx nodes_idx && inst_moves
578
579
      aft_failover = if mir_type == MirrorInternal && use_secondary
                       -- if drbd and allowed to failover
580
581
                       then checkSingleStep ini_tbl target ini_tbl Failover
                       else ini_tbl
582
583
584
585
586
      all_moves =
        if disk_moves
          then concatMap (possibleMoves mir_type use_secondary inst_moves)
               nodes
          else []
Iustin Pop's avatar
Iustin Pop committed
587
588
    in
      -- iterate over the possible nodes for this instance
589
      foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
Iustin Pop's avatar
Iustin Pop committed
590

Iustin Pop's avatar
Iustin Pop committed
591
-- | Compute the best next move.
592
checkMove :: [Ndx]               -- ^ Allowed target node indices
593
          -> Bool                -- ^ Whether disk moves are allowed
594
          -> Bool                -- ^ Whether instance moves are allowed
595
          -> Table               -- ^ The current solution
Iustin Pop's avatar
Iustin Pop committed
596
          -> [Instance.Instance] -- ^ List of instances still to move
597
          -> Table               -- ^ The new solution
598
checkMove nodes_idx disk_moves inst_moves ini_tbl victims =
599
600
601
602
603
604
605
606
607
608
609
610
611
  let Table _ _ _ ini_plc = ini_tbl
      -- we're using rwhnf from the Control.Parallel.Strategies
      -- package; we don't need to use rnf as that would force too
      -- much evaluation in single-threaded cases, and in
      -- multi-threaded case the weak head normal form is enough to
      -- spark the evaluation
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
                             inst_moves ini_tbl)
               victims
      -- iterate over all instances, computing the best move
      best_tbl = foldl' compareTables ini_tbl tables
      Table _ _ _ best_plc = best_tbl
  in if length best_plc == length ini_plc
612
613
       then ini_tbl -- no advancement
       else best_tbl
Iustin Pop's avatar
Iustin Pop committed
614

615
-- | Check if we are allowed to go deeper in the balancing.
616
617
618
619
doNextBalance :: Table     -- ^ The starting table
              -> Int       -- ^ Remaining length
              -> Score     -- ^ Score at which to stop
              -> Bool      -- ^ The resulting table and commands
620
doNextBalance ini_tbl max_rounds min_score =
621
622
623
  let Table _ _ ini_cv ini_plc = ini_tbl
      ini_plc_len = length ini_plc
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
624

625
-- | Run a balance move.
626
627
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
628
           -> Bool        -- ^ Allow instance moves
Iustin Pop's avatar
Iustin Pop committed
629
           -> Bool        -- ^ Only evacuate moves
630
631
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
632
           -> Maybe Table -- ^ The resulting table and commands
633
tryBalance ini_tbl disk_moves inst_moves evac_mode mg_limit min_gain =
634
635
    let Table ini_nl ini_il ini_cv _ = ini_tbl
        all_inst = Container.elems ini_il
636
637
        all_nodes = Container.elems ini_nl
        (offline_nodes, online_nodes) = partition Node.offline all_nodes
Iustin Pop's avatar
Iustin Pop committed
638
        all_inst' = if evac_mode
639
640
641
642
                      then let bad_nodes = map Node.idx offline_nodes
                           in filter (any (`elem` bad_nodes) .
                                          Instance.allNodes) all_inst
                      else all_inst
643
644
        reloc_inst = filter (\i -> Instance.movable i &&
                                   Instance.autoBalance i) all_inst'
645
        node_idx = map Node.idx online_nodes
646
        fin_tbl = checkMove node_idx disk_moves inst_moves ini_tbl reloc_inst
647
        (Table _ _ fin_cv _) = fin_tbl
648
    in
649
      if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
650
      then Just fin_tbl -- this round made success, return the new table
651
652
      else Nothing

653
654
-- * Allocation functions

655
-- | Build failure stats out of a list of failures.
656
657
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
658
659
    map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
            [minBound..maxBound]
660

Iustin Pop's avatar
Iustin Pop committed
661
-- | Compares two Maybe AllocElement and chooses the best score.
662
663
664
665
666
667
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a Nothing = a
bestAllocElement Nothing b = b
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
668
  if ascore < bscore then a else b
669

670
-- | Update current Allocation solution and failure stats with new
671
-- elements.
672
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
Iustin Pop's avatar
Iustin Pop committed
673
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }
674

Iustin Pop's avatar
Iustin Pop committed
675
concatAllocs as (Ok ns) =
676
677
678
679
680
  let -- Choose the old or new solution, based on the cluster score
    cntok = asAllocs as
    osols = asSolution as
    nsols = bestAllocElement osols (Just ns)
    nsuc = cntok + 1
681
682
683
684
    -- Note: we force evaluation of nsols here in order to keep the
    -- memory profile low - we know that we will need nsols for sure
    -- in the next cycle, so we force evaluation of nsols, since the
    -- foldl' in the caller will only evaluate the tuple, but not the
Iustin Pop's avatar
Iustin Pop committed
685
    -- elements of the tuple
686
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
687

688
689
690
691
-- | Sums two 'AllocSolution' structures.
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
          (AllocSolution bFails bAllocs bSols bLog) =
692
693
694
695
696
697
698
699
  -- note: we add b first, since usually it will be smaller; when
  -- fold'ing, a will grow and grow whereas b is the per-group
  -- result, hence smaller
  let nFails  = bFails ++ aFails
      nAllocs = aAllocs + bAllocs
      nSols   = bestAllocElement aSols bSols
      nLog    = bLog ++ aLog
  in AllocSolution nFails nAllocs nSols nLog
700

701
-- | Given a solution, generates a reasonable description for it.
702
703
704
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
705
      sols = asSolution as
706
707
708
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
709
710
711
712
713
714
715
  in case sols of
     Nothing -> "No valid allocation solutions, failure reasons: " ++
                (if null fcnt then "unknown reasons" else freasons)
     Just (_, _, nodes, cv) ->
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
               (intercalate "/" . map Node.name $ nodes)
716

717
-- | Annotates a solution with the appropriate string.
718
719
720
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution as = as { asLog = describeSolution as : asLog as }

721
722
723
724
725
726
-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution (EvacSolution f m o) =
727
  EvacSolution (reverse f) (reverse m) (reverse o)
728

729
-- | Generate the valid node allocation singles or pairs for a new instance.
730
731
genAllocNodes :: Group.List        -- ^ Group list
              -> Node.List         -- ^ The node map
732
              -> Int               -- ^ The number of nodes required
733
734
              -> Bool              -- ^ Whether to drop or not
                                   -- unallocable nodes
735
              -> Result AllocNodes -- ^ The (monadic) result
736
genAllocNodes gl nl count drop_unalloc =
737
  let filter_fn = if drop_unalloc
738
739
                    then filter (Group.isAllocable .
                                 flip Container.find gl . Node.group)
740
                    else id
741
742
743
744
745
746
747
748
749
750
      all_nodes = filter_fn $ getOnline nl
      all_pairs = [(Node.idx p,
                    [Node.idx s | s <- all_nodes,
                                       Node.idx p /= Node.idx s,
                                       Node.group p == Node.group s]) |
                   p <- all_nodes]
  in case count of
       1 -> Ok (Left (map Node.idx all_nodes))
       2 -> Ok (Right (filter (not . null . snd) all_pairs))
       _ -> Bad "Unsupported number of nodes, only one or two  supported"
751

752
753
754
755
756
-- | Try to allocate an instance on the cluster.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
757
         -> AllocNodes        -- ^ The allocation targets
758
         -> m AllocSolution   -- ^ Possible solution list
Iustin Pop's avatar
Iustin Pop committed
759
tryAlloc _  _ _    (Right []) = fail "Not enough online nodes"
760
tryAlloc nl _ inst (Right ok_pairs) =
761
762
763
764
765
766
767
  let psols = parMap rwhnf (\(p, ss) ->
                              foldl' (\cstate ->
                                        concatAllocs cstate .
                                        allocateOnPair nl inst p)
                              emptyAllocSolution ss) ok_pairs
      sols = foldl' sumAllocs emptyAllocSolution psols
  in return $ annotateSolution sols
768

Iustin Pop's avatar
Iustin Pop committed
769
tryAlloc _  _ _    (Left []) = fail "No online nodes"
770
tryAlloc nl _ inst (Left all_nodes) =
771
772
773
774
  let sols = foldl' (\cstate ->
                       concatAllocs cstate . allocateOnSingle nl inst
                    ) emptyAllocSolution all_nodes
  in return $ annotateSolution sols
775

776
-- | Given a group/result, describe it as a nice (list of) messages.
777
778
779
solutionDescription :: (Group.Group, Result AllocSolution)
                    -> [String]
solutionDescription (grp, result) =
780
  case result of
781
    Ok solution -> map (printf "Group %s (%s): %s" gname pol) (asLog solution)
782
    Bad message -> [printf "Group %s: error %s" gname message]
783
  where gname = Group.name grp
784
        pol = allocPolicyToRaw (Group.allocPolicy grp)
785
786

-- | From a list of possibly bad and possibly empty solutions, filter
787
-- only the groups with a valid result. Note that the result will be
788
-- reversed compared to the original list.
789
790
791
792
793
filterMGResults :: [(Group.Group, Result AllocSolution)]
                -> [(Group.Group, AllocSolution)]
filterMGResults = foldl' fn []
  where unallocable = not . Group.isAllocable
        fn accu (grp, rasol) =
794
795
796
          case rasol of
            Bad _ -> accu
            Ok sol | isNothing (asSolution sol) -> accu
797
798
                   | unallocable grp -> accu
                   | otherwise -> (grp, sol):accu
799

800
-- | Sort multigroup results based on policy and score.
801
802
803
sortMGResults :: [(Group.Group, AllocSolution)]
              -> [(Group.Group, AllocSolution)]
sortMGResults sols =
804
  let extractScore (_, _, _, x) = x
805
      solScore (grp, sol) = (Group.allocPolicy grp,
806
807
                             (extractScore . fromJust . asSolution) sol)
  in sortBy (comparing solScore) sols
808

809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
-- | Removes node groups which can't accommodate the instance
filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
                  -> Instance.Instance
                  -> ([(Group.Group, (Node.List, Instance.List))], [String])
filterValidGroups [] _ = ([], [])
filterValidGroups (ng:ngs) inst =
  let (valid_ngs, msgs) = filterValidGroups ngs inst
      hasNetwork nic = case Nic.network nic of
        Just net -> net `elem` Group.networks (fst ng)
        Nothing -> True
      hasRequiredNetworks = all hasNetwork (Instance.nics inst)
  in if hasRequiredNetworks
      then (ng:valid_ngs, msgs)
      else (valid_ngs,
            ("group " ++ Group.name (fst ng) ++
             " is not connected to a network required by instance " ++
             Instance.name inst):msgs)

Iustin Pop's avatar
Iustin Pop committed
827
-- | Finds the best group for an instance on a multi-group cluster.
828
829
830
831
832
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
Iustin Pop's avatar
Iustin Pop committed
833
834
835
findBestAllocGroup :: Group.List           -- ^ The group list
                   -> Node.List            -- ^ The node list
                   -> Instance.List        -- ^ The instance list
836
                   -> Maybe [Gdx]          -- ^ The allowed groups
Iustin Pop's avatar
Iustin Pop committed
837
838
                   -> Instance.Instance    -- ^ The instance to allocate
                   -> Int                  -- ^ Required number of nodes
839
                   -> Result (Group.Group, AllocSolution, [String])
840
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
841
842
843
844
  let groups_by_idx = splitCluster mgnl mgil
      groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
      groups' = maybe groups
                (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
845
                allowed_gdxs
846
847
848
849
850
      (groups'', filter_group_msgs) = filterValidGroups groups' inst
      sols = map (\(gr, (nl, il)) ->
                   (gr, genAllocNodes mggl nl cnt False >>=
                        tryAlloc nl il inst))
             groups''::[(Group.Group, Result AllocSolution)]
851
      all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
852
853
      goodSols = filterMGResults sols
      sortedSols = sortMGResults goodSols
854
855
856
857
858
859
860
  in case sortedSols of
       [] -> Bad $ if null groups'
                     then "no groups for evacuation: allowed groups was" ++
                          show allowed_gdxs ++ ", all groups: " ++
                          show (map fst groups)
                     else intercalate ", " all_msgs
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
Iustin Pop's avatar
Iustin Pop committed
861
862
863
864
865
866
867
868
869
870

-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt = do
  (best_group, solution, all_msgs) <-
871
      findBestAllocGroup mggl mgnl mgil Nothing inst cnt
872
  let group_name = Group.name best_group
Iustin Pop's avatar
Iustin Pop committed
873
874
      selmsg = "Selected group: " ++ group_name
  return $ solution { asLog = selmsg:all_msgs }
875

René Nussbaumer's avatar
René Nussbaumer committed
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
-- | Calculate the new instance list after allocation solution.
updateIl :: Instance.List           -- ^ The original instance list
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
         -> Instance.List           -- ^ The updated instance list
updateIl il Nothing = il
updateIl il (Just (_, xi, _, _)) = Container.add (Container.size il) xi il

-- | Extract the the new node list from the allocation solution.
extractNl :: Node.List               -- ^ The original node list
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
          -> Node.List               -- ^ The new node list
extractNl nl Nothing = nl
extractNl _ (Just (xnl, _, _, _)) = xnl

-- | Try to allocate a list of instances on a multi-group cluster.
allocList :: Group.List                  -- ^ The group list
          -> Node.List                   -- ^ The node list
          -> Instance.List               -- ^ The instance list
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
          -> AllocSolutionList           -- ^ Possible solution list
          -> Result (Node.List, Instance.List,
                     AllocSolutionList)  -- ^ The final solution list
allocList _  nl il [] result = Ok (nl, il, result)
allocList gl nl il ((xi, xicnt):xies) result = do
  ares <- tryMGAlloc gl nl il xi xicnt
  let sol = asSolution ares
      nl' = extractNl nl sol
      il' = updateIl il sol
  allocList gl nl' il' xies ((xi, ares):result)

906
907
908
909
910
911
912
913
-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since except DRBD, no other disk template can
-- execute change secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange ChangeSecondary dt =
914
  fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++
915
916
917
918
         "' can't execute change secondary"
failOnSecondaryChange _ _ = return ()

-- | Run evacuation for a single instance.
919
920
921
922
923
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
924
925
926
927
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
928
                 -> Gdx               -- ^ The group we're targetting
929
930
931
                 -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
932
933
934
935
936
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTDiskless})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes
937
938

nodeEvacInstance _ _ _ (Instance.Instance
939
                        {Instance.diskTemplate = DTPlain}) _ _ =
940
941
942
                  fail "Instances of type plain cannot be relocated"

nodeEvacInstance _ _ _ (Instance.Instance
943
                        {Instance.diskTemplate = DTFile}) _ _ =
944
945
                  fail "Instances of type file cannot be relocated"

946
947
948
949
950
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTSharedFile})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes
951

952
953
954
955
956
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTBlock})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes
957

958
959
960
961
962
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTRbd})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes
963

964
965
966
967
968
969
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTExt})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

970
nodeEvacInstance nl il ChangePrimary
971
972
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 _ _ =
973
974
975
976
977
978
979
  do
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx Failover
    return (nl', il', ops)

980
981
nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
982
                 gdx avail_nodes =
983
  evacOneNodeOnly nl il inst gdx avail_nodes
984

985
986
987
988
989
990
991
-- The algorithm for ChangeAll is as follows:
--
-- * generate all (primary, secondary) node pairs for the target groups
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
--   the final node list state and group score
-- * select the best choice via a foldl that uses the same Either
--   String solution as the ChangeSecondary mode
992
993
nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
994
                 gdx avail_nodes =
995
  do
996
997
998
    let no_nodes = Left "no nodes available"
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
    (nl', il', ops, _) <-
Iustin Pop's avatar
Iustin Pop committed
999
        annotateResult "Can't find any good nodes for relocation" .
1000
        eitherToResult $
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
        foldl'
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
                          Bad msg ->
                              case accu of
                                Right _ -> accu
                                -- we don't need more details (which
                                -- nodes, etc.) as we only selected
                                -- this group if we can allocate on
                                -- it, hence failures will not
                                -- propagate out of this fold loop
                                Left _ -> Left $ "Allocation failed: " ++ msg
                          Ok result@(_, _, _, new_cv) ->
                              let new_accu = Right result in
                              case accu of
                                Left _ -> new_accu
                                Right (_, _, _, old_cv) ->
                                    if old_cv < new_cv
                                    then accu
                                    else new_accu
        ) no_nodes node_pairs

    return (nl', il', ops)
1023

1024
1025
1026
-- | Generic function for changing one node of an instance.
--
-- This is similar to 'nodeEvacInstance' but will be used in a few of
1027
1028
1029
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
-- over the list of available nodes, which results in the best choice
-- for relocation.
1030
1031
1032
1033
1034
1035
1036
1037
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
                -> Instance.List     -- ^ Instance list (cluster-wide)
                -> Instance.Instance -- ^ The instance to be evacuated
                -> Gdx               -- ^ The group we're targetting
                -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
evacOneNodeOnly nl il inst gdx avail_nodes = do
1038
  op_fn <- case Instance.mirrorType inst of
1039
1040
1041
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
             MirrorInternal -> Ok ReplaceSecondary
             MirrorExternal -> Ok FailoverToAny
Iustin Pop's avatar
Iustin Pop committed
1042
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
1043
                          eitherToResult $
1044
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
1045
1046
1047
1048
1049
1050
                          (Left "no nodes available") avail_nodes
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx (op_fn ndx)
  return (nl', il', ops)

1051
1052
1053
1054
1055
-- | Inner fold function for changing one node of an instance.
--
-- Depending on the instance disk template, this will either change
-- the secondary (for DRBD) or the primary node (for shared
-- storage). However, the operation is generic otherwise.
1056
--
1057
-- The running solution is either a @Left String@, which means we
1058
1059
1060
1061
-- don't have yet a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
1062
1063
1064
1065
1066
1067
1068
1069
evacOneNodeInner :: Node.List         -- ^ Cluster node list
                 -> Instance.Instance -- ^ Instance being evacuated
                 -> Gdx               -- ^ The group index of the instance
                 -> (Ndx -> IMove)    -- ^ Operation constructor
                 -> EvacInnerState    -- ^ Current best solution
                 -> Ndx               -- ^ Node we're evaluating as target
                 -> EvacInnerState    -- ^ New best solution
evacOneNodeInner nl inst gdx op_fn accu ndx =
1070
  case applyMove nl inst (op_fn ndx) of
Iustin Pop's avatar
Iustin Pop committed
1071
1072
1073
1074
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
                             " failed: " ++ show fm
              in either (const $ Left fail_msg) (const accu) accu
    Ok (nl', inst', _, _) ->
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
      let nodes = Container.elems nl'
          -- The fromJust below is ugly (it can fail nastily), but
          -- at this point we should have any internal mismatches,
          -- and adding a monad here would be quite involved
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
          new_cv = compCVNodes grpnodes
          new_accu = Right (nl', inst', new_cv, ndx)
      in case accu of
           Left _ -> new_accu
           Right (_, _, old_cv, _) ->
             if old_cv < new_cv
               then accu
               else new_accu
1088