Hcheck.hs 11.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
{-| Cluster checker.

-}

{-

Copyright (C) 2012 Google Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.

-}

26
27
28
29
30
module Ganeti.HTools.Program.Hcheck
  ( main
  , options
  , arguments
  ) where
31
32

import Control.Monad
Iustin Pop's avatar
Iustin Pop committed
33
import Data.List (transpose)
34
import System.Exit
35
36
37
38
import Text.Printf (printf)

import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Cluster as Cluster
39
import qualified Ganeti.HTools.Group as Group
40
41
42
43
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Instance as Instance

import qualified Ganeti.HTools.Program.Hbal as Hbal
44

45
import Ganeti.Common
46
import Ganeti.HTools.CLI
47
48
49
import Ganeti.HTools.ExtLoader
import Ganeti.HTools.Loader
import Ganeti.HTools.Types
50
import Ganeti.Utils
51
52

-- | Command-line options accepted by the program.
--
-- The list is a plain value lifted into 'IO' with 'return'; building it
-- performs no effects.
options :: IO [OptType]
options = return supported
  where
    supported =
      [ oDataFile
      , oDiskMoves
      , oDynuFile
      , oEvacMode
      , oExInst
      , oExTags
      , oIAllocSrc
      , oInstMoves
      , oLuxiSocket
      , oMachineReadable
      , oMaxCpu
      , oMaxSolLength
      , oMinDisk
      , oMinGain
      , oMinGainLim
      , oMinScore
      , oNoSimulation
      , oOfflineNode
      , oQuiet
      , oRapiMaster
      , oSelInst
      , oVerbose
      ]
79

80
81
82
83
-- | The list of arguments supported by the program.
--
-- The list is empty: the program declares no positional-argument
-- completions (and 'main' rejects any positional argument it is given).
arguments :: [ArgCompletion]
arguments = []

84
85
86
87
-- | Check phase: whether statistics describe the cluster before the
-- rebalance ('Initial') or after it ('Rebalanced').
data Phase
  = Initial
  | Rebalanced

88
89
90
91
-- | Level at which statistics are presented: for a single node group
-- ('GroupLvl') or for the cluster as a whole ('ClusterLvl').
data Level
  = GroupLvl
  | ClusterLvl

92
93
94
95
96
97
-- | A type alias for a group index and node\/instance lists.
--
-- The first component is the group index; the second pairs the node list
-- with the instance list for that group.
type GroupInfo = (Gdx, (Node.List, Instance.List))

-- | A type alias for group stats.
--
-- The layout is @((group, score), counters)@; as built by
-- 'perGroupChecks' the counters are, in order: N+1 failures, nodes with
-- conflicting primaries, and the two offline-node related counts.
type GroupStats = ((Group.Group, Double), [Int])

Agata Murawska's avatar
Agata Murawska committed
98
-- | Prefix passed to the key-printing helpers for every machine readable
-- name emitted by this program.
htcPrefix :: String
htcPrefix = "HCHECK"

-- | Statistics shown both per group and per cluster, as pairs of
-- (machine readable key, human readable description).
--
-- The order matters: keys and descriptions are zipped positionally with
-- the computed values when printing, so it has to follow the order of the
-- counters produced by the per-group checks.
commonData :: [(String, String)]
commonData =
  [ ("N1_FAIL",       "Nodes not N+1 happy")
  , ("CONFLICT_TAGS", "Nodes with conflicting instances")
  , ("OFFLINE_PRI",   "Instances having the primary node offline")
  , ("OFFLINE_SEC",   "Instances having a secondary node offline")
  ]

-- | Statistics shown per group: the common entries followed by the
-- group score entry.
groupData :: [(String, String)]
groupData = commonData ++ groupOnly
  where
    groupOnly = [("SCORE", "Group score")]

-- | Statistics shown per cluster: the common entries followed by the
-- overall health flag.
clusterData :: [(String, String)]
clusterData = commonData ++ clusterOnly
  where
    clusterOnly = [("NEED_REBALANCE", "Cluster is not healthy")]
118

119
120
121
122
123
124
125
126
127
128
-- | Key prefix identifying the phase in the machine readable output.
phasePrefix :: Phase -> String
phasePrefix phase =
  case phase of
    Initial    -> "INIT"
    Rebalanced -> "FINAL"

-- | Key prefix identifying the level in the machine readable output.
levelPrefix :: Level -> String
levelPrefix level =
  case level of
    GroupLvl   -> "GROUP"
    ClusterLvl -> "CLUSTER"

Agata Murawska's avatar
Agata Murawska committed
129
130
131
132
133
-- | Machine readable keys to show for the given level, i.e. the first
-- components of the level's data table.
keysData :: Level -> [String]
keysData level = map fst table
  where
    table = case level of
      GroupLvl   -> groupData
      ClusterLvl -> clusterData

134
135
136
137
138
-- | Wording used for each phase in the human readable output.
phaseDescr :: Phase -> String
phaseDescr phase =
  case phase of
    Initial    -> "initially"
    Rebalanced -> "after rebalancing"

Agata Murawska's avatar
Agata Murawska committed
139
140
141
142
143
144
145
146
147
148
149
150
151
-- | Human readable descriptions to show for the given level, i.e. the
-- second components of the level's data table.
descrData :: Level -> [String]
descrData level = map snd table
  where
    table = case level of
      GroupLvl   -> groupData
      ClusterLvl -> clusterData

-- | Heading line (newline-terminated) that introduces a block of human
-- readable statistics.
--
-- For the group level the optional name, when present, is included in
-- the heading; for the cluster level the name is ignored.
phaseLevelDescr :: Phase -> Level -> Maybe String -> String
phaseLevelDescr phase level mname =
  case (level, mname) of
    (GroupLvl, Just name) -> printf "Statistics for group %s %s\n" name when_
    (GroupLvl, Nothing)   -> printf "Statistics for group %s\n" when_
    (ClusterLvl, _)       -> printf "Cluster statistics %s\n" when_
  where
    when_ = phaseDescr phase
152

153
154
155
156
-- | Print a list of (key, value) pairs through 'printKeys', using this
-- program's prefix ('htcPrefix'); per the original note the output is
-- formatted as a shell fragment.
printKeysHTC :: [(String, String)] -> IO ()
printKeysHTC pairs = printKeys htcPrefix pairs

157
158
159
160
161
162
163
164
-- | Render a boolean as a string: @\"1\"@ \/ @\"0\"@ in machine readable
-- mode, the regular 'show' form otherwise.
printBool :: Bool    -- ^ Whether the result should be machine readable
          -> Bool    -- ^ Value to be converted to string
          -> String
printBool machineread value
  | not machineread = show value
  | value           = "1"
  | otherwise       = "0"

Iustin Pop's avatar
Iustin Pop committed
165
166
-- | Print the mapping from group index to group uuid, one
-- @GROUP_UUID_\<idx\>@ key per group in the list. The caller only invokes
-- this in machine readable mode.
printGroupsMappings :: Group.List -> IO ()
printGroupsMappings gl = printKeysHTC $ map uuidEntry (Container.elems gl)
  where
    -- The annotation pins printf's polymorphic result to a String key.
    uuidEntry grp =
      ( printf "GROUP_UUID_%d" (Group.idx grp) :: String
      , Group.uuid grp
      )

Agata Murawska's avatar
Agata Murawska committed
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
-- | Build one machine readable key as @PHASE_LEVEL_SUFFIX@, or
-- @PHASE_LEVEL_IDX_SUFFIX@ when an extra index string is supplied.
prepareKey :: Level -> Phase -> Maybe String -> String -> String
prepareKey level phase midx suffix =
  case midx of
    Nothing  -> printf "%s_%s_%s" phaseTag levelTag suffix
    Just idx -> printf "%s_%s_%s_%s" phaseTag levelTag idx suffix
  where
    phaseTag = phasePrefix phase
    levelTag = levelPrefix level

-- | Print all the statistics for the given level and phase.
--
-- In machine readable mode the values are paired positionally with the
-- level's keys and printed regardless of verbosity. Otherwise nothing is
-- printed at verbosity 0; at any other verbosity a blank line, a heading
-- and one indented @description: value@ line per statistic are written.
printStats :: Int            -- ^ Verbosity level
           -> Bool           -- ^ If the output should be machine readable
           -> Level          -- ^ Level on which we are printing
           -> Phase          -- ^ Current phase of simulation
           -> [String]       -- ^ Values to print
           -> Maybe String   -- ^ Additional data for groups
           -> IO ()
printStats verbose machineread level phase values extra
  | machineread  = printKeysHTC $ zip (map mkKey (keysData level)) values
  | verbose == 0 = return ()
  | otherwise    = do
      putStrLn ""
      putStr $ phaseLevelDescr phase level extra
      zipWithM_ (printf "    %s: %s\n") (descrData level) values
  where
    mkKey = prepareKey level phase extra
Agata Murawska's avatar
Agata Murawska committed
200
201
202

-- | Pick the string identifying a group: its index (via 'show') when the
-- flag is set, its name otherwise. Callers pass the machine readable flag.
extractGroupData :: Bool -> Group.Group -> String
extractGroupData useIdx grp
  | useIdx    = show (Group.idx grp)
  | otherwise = Group.name grp

-- | Render the values for a group: every counter via 'show', followed by
-- the score formatted with eight decimal places.
prepareGroupValues :: [Int] -> Double -> [String]
prepareGroupValues stats score = counters ++ [formattedScore]
  where
    counters = map show stats
    formattedScore = printf "%.8f" score
Agata Murawska's avatar
Agata Murawska committed
210
211
212
213

-- | Render the values for the cluster: every counter via 'show', followed
-- by the boolean flags rendered by 'printBool' for the requested mode.
prepareClusterValues :: Bool -> [Int] -> [Bool] -> [String]
prepareClusterValues machineread stats bstats = counters ++ flags
  where
    counters = map show stats
    flags = map (printBool machineread) bstats
Agata Murawska's avatar
Agata Murawska committed
215
216

-- | Print all the statistics on a group level.
--
-- The group is identified in the output by whatever 'extractGroupData'
-- selects for the current output mode.
printGroupStats :: Int -> Bool -> Phase -> GroupStats -> IO ()
printGroupStats verbose machineread phase ((grp, score), stats) =
  printStats verbose machineread GroupLvl phase
    (prepareGroupValues stats score)
    (Just $ extractGroupData machineread grp)
222
223

-- | Print all the statistics on a cluster (global) level: the summed
-- counters followed by the needs-rebalance flag.
printClusterStats :: Int -> Bool -> Phase -> [Int] -> Bool -> IO ()
printClusterStats verbose machineread phase stats needhbal =
  printStats verbose machineread ClusterLvl phase
    (prepareClusterValues machineread stats [needhbal])
    Nothing
228
229
230
231

-- | Decide whether the cluster needs rebalancing: true when the counters
-- add up to more than zero.
clusterNeedsRebalance :: [Int] -> Bool
clusterNeedsRebalance = (> 0) . sum
232
233
234
235
236

{- | Check group for N+1 happiness, conflicts of primaries on nodes and
instances residing on offline nodes.

The result pairs the group (looked up by index in the group list) and
its score with four counters, in this order:

  1. number of nodes reported as bad by 'Cluster.computeBadItems';
  2. number of nodes with a positive 'Node.conflictingPrimaries' count;
  3. number of instances having an offline node as primary;
  4. number of instances having an offline node as secondary.

-}
perGroupChecks :: Group.List -> GroupInfo -> GroupStats
perGroupChecks gl (gidx, (nl, il)) =
  let grp = Container.find gidx gl
      nodes = Container.elems nl
      offnl = filter Node.offline nodes
      n1violated = length . fst $ Cluster.computeBadItems nl il
      conflicttags = length . filter (> 0) $
                     map Node.conflictingPrimaries nodes
      offline_pri = sum $ map (length . Node.pList) offnl
      -- Count the instances hosted as secondaries on offline nodes, just
      -- as is done for primaries; taking the length of the mapped list
      -- would merely count the offline nodes themselves.
      offline_sec = sum $ map (length . Node.sList) offnl
      score = Cluster.compCV nl
      groupstats = [ n1violated
                   , conflicttags
                   , offline_pri
                   , offline_sec
                   ]
  in ((grp, score), groupstats)
253
254

-- | Use Hbal's iterateDepth to simulate group rebalance.
--
-- Runs 'Hbal.iterateDepth' on the initial table with the limits taken
-- from the command line options and returns the group index together
-- with the node and instance lists of the resulting table.
executeSimulation :: Options -> Cluster.Table -> Double
                  -> Gdx -> Node.List -> Instance.List
                  -> IO GroupInfo
executeSimulation opts ini_tbl min_cv gidx nl il = do
  -- Longest alias lengths, handed to iterateDepth (presumably for output
  -- alignment -- confirm against Hbal). The leading 0 keeps 'maximum'
  -- total when a group has no instances or no nodes; it cannot change the
  -- result for non-empty lists because lengths are never negative.
  let imlen = maximum . (0 :) . map (length . Instance.alias) $
              Container.elems il
      nmlen = maximum . (0 :) . map (length . Node.alias) $
              Container.elems nl

  (fin_tbl, _) <- Hbal.iterateDepth False ini_tbl
                                    (optMaxLength opts)
                                    (optDiskMoves opts)
                                    (optInstMoves opts)
                                    nmlen imlen [] min_cv
                                    (optMinGainLim opts) (optMinGain opts)
                                    (optEvacMode opts)

  let (Cluster.Table fin_nl fin_il _ _) = fin_tbl
  return (gidx, (fin_nl, fin_il))

-- | Simulate a rebalance of one group unless its score is already below
-- the minimum score option, in which case the group is returned as is.
maybeSimulateGroupRebalance :: Options -> GroupInfo -> IO GroupInfo
maybeSimulateGroupRebalance opts grpinfo@(gidx, (nl, il))
  | ini_cv < min_cv = return grpinfo
  | otherwise       = executeSimulation opts ini_tbl min_cv gidx nl il
  where
    ini_cv  = Cluster.compCV nl
    min_cv  = optMinScore opts
    ini_tbl = Cluster.Table nl il ini_cv []
282

283
284
285
-- | Decide whether to simulate rebalance: when requested, every group is
-- passed through 'maybeSimulateGroupRebalance'; otherwise the groups are
-- returned untouched.
maybeSimulateRebalance :: Bool             -- ^ Whether to simulate rebalance
                       -> Options          -- ^ Command line options
                       -> [GroupInfo]      -- ^ Group data
                       -> IO [GroupInfo]
maybeSimulateRebalance simulate opts cluster
  | simulate  = mapM (maybeSimulateGroupRebalance opts) cluster
  | otherwise = return cluster

292
293
294
295
-- | Delegate to 'printFinal' with this program's prefix ('htcPrefix');
-- per the original note this prints the final @OK@ marker in machine
-- readable output. The flag is passed through unchanged.
printFinalHTC :: Bool -> IO ()
printFinalHTC machineread = printFinal htcPrefix machineread

296
297
-- | Main function.
--
-- Loads the cluster data, computes the per-group and cluster-wide
-- statistics and prints them for the initial state. Unless simulation is
-- disabled or no problem was found, a rebalance is simulated per group
-- and the statistics are computed and printed again for the resulting
-- state (in machine readable mode the second set is always printed).
--
-- Exits with failure code 1 when a simulation was run, i.e. when the
-- initial state needed rebalancing and simulation was not disabled.
main :: Options -> [String] -> IO ()
main opts args = do
  unless (null args) $ exitErr "This program doesn't take any arguments."

  let verbose = optVerbose opts
      machineread = optMachineReadable opts
      nosimulation = optNoSimulation opts

  (ClusterData gl fixed_nl ilf _ _) <- loadExternalData opts
  nlf <- setNodeStatus opts fixed_nl

  let splitcluster = Cluster.splitCluster nlf ilf

  when machineread $ printGroupsMappings gl

  let groupsstats = map (perGroupChecks gl) splitcluster
      -- Column-wise sum of the per-group counters.
      clusterstats = map sum . transpose . map snd $ groupsstats
      needrebalance = clusterNeedsRebalance clusterstats

  unless (verbose == 0 || machineread) .
    putStrLn $ if nosimulation
                 then "Running in no-simulation mode."
                 else if needrebalance
                        then "Cluster needs rebalancing."
                        else "No need to rebalance cluster, no problems found."

  mapM_ (printGroupStats verbose machineread Initial) groupsstats

  printClusterStats verbose machineread Initial clusterstats needrebalance

  let exitOK = nosimulation || not needrebalance
      simulate = not nosimulation && needrebalance

  rebalancedcluster <- maybeSimulateRebalance simulate opts splitcluster

  when (simulate || machineread) $ do
    let newgroupstats = map (perGroupChecks gl) rebalancedcluster
        newclusterstats = map sum . transpose . map snd $ newgroupstats
        -- Judge the final state by the counters computed after the
        -- simulation, not by the initial ones.
        newneedrebalance = clusterNeedsRebalance newclusterstats

    mapM_ (printGroupStats verbose machineread Rebalanced) newgroupstats

    printClusterStats verbose machineread Rebalanced newclusterstats
                           newneedrebalance

  printFinalHTC machineread

  unless exitOK . exitWith $ ExitFailure 1