Commit aee68bb1 authored by Klaus Aehlig's avatar Klaus Aehlig

Make maintd wait for its jobs and also expose them

As described in the design, the maintenance daemon will only start
a new round once all jobs from the old round are finished. Add this
functionality now. As the list of jobs of the current round is also
relevant information, we expose it over HTTP. In order to do so, we
keep an in-memory copy of the job list (while the authoritative copy
is still kept in the configuration, as per our design).
Signed-off-by: 's avatarKlaus Aehlig <aehlig@google.com>
Reviewed-by: 's avatarPetr Pudlak <pudlak@google.com>
parent 2bbd580d
......@@ -975,6 +975,7 @@ HS_LIB_SRCS = \
src/Ganeti/Logging/WriterLog.hs \
src/Ganeti/Luxi.hs \
src/Ganeti/MaintD/Autorepairs.hs \
src/Ganeti/MaintD/MemoryState.hs \
src/Ganeti/MaintD/Server.hs \
src/Ganeti/MaintD/Utils.hs \
src/Ganeti/Network.hs \
......
{-| Memory copy of the state of the maintenance daemon.
While the autoritative state of the maintenance daemon is
stored in the configuration, the daemon keeps a copy of some
values at run time, so that they can easily be exposed over
HTTP.
This module also provides functions for the mirrored information
to update both, the authoritative state and the in-memory copy.
-}
{-
Copyright (C) 2015 Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-}
module Ganeti.MaintD.MemoryState
( MemoryState(..)
, emptyMemoryState
, getJobs
, clearJobs
, appendJobs
) where
import Control.Monad.IO.Class (liftIO)
import Data.IORef (IORef, atomicModifyIORef)
import Ganeti.BasicTypes (ResultT, withErrorT)
import Ganeti.Types (JobId)
import Ganeti.Utils (ordNub)
import Ganeti.WConfd.Client ( runNewWConfdClient, maintenanceJobs, runModifyRpc
, clearMaintdJobs, appendMaintdJobs )
-- | In-memory copy of parts of the state of the maintenance
-- daemon.
data MemoryState = MemoryState
{ msJobs :: [ JobId ]
}
-- | Inital state of the in-memory copy. All parts will be updated
-- before use, after one round at the latest this copy is up to date.
emptyMemoryState :: MemoryState
emptyMemoryState = MemoryState {
msJobs = []
}
-- | Get the list of jobs from the authoritative copy, and update the
-- in-memory copy as well.
getJobs :: IORef MemoryState -> ResultT String IO [JobId]
getJobs memstate = do
jobs <- withErrorT show $ runNewWConfdClient maintenanceJobs
liftIO . atomicModifyIORef memstate $ \ s -> (s { msJobs = jobs }, ())
return jobs
-- | Reset the list of active jobs.
clearJobs :: IORef MemoryState -> IO ()
clearJobs memstate = do
runModifyRpc clearMaintdJobs
atomicModifyIORef memstate $ \ s -> ( s { msJobs = [] }, ())
-- | Append jobs to the list of active jobs, if not present already
appendJobs :: IORef MemoryState -> [JobId] -> IO ()
appendJobs memstate jobs = do
runModifyRpc $ appendMaintdJobs jobs
atomicModifyIORef memstate
$ \ s -> ( s { msJobs = ordNub $ msJobs s ++ jobs }, ())
......@@ -43,14 +43,17 @@ module Ganeti.MaintD.Server
import Control.Applicative ((<|>))
import Control.Concurrent (forkIO)
import Control.Monad (forever, void, unless, when)
import Control.Exception.Lifted (bracket)
import Control.Monad (forever, void, unless, when, liftM)
import Control.Monad.IO.Class (liftIO)
import Data.IORef (IORef, newIORef, readIORef)
import qualified Data.Set as Set
import Snap.Core (Snap, method, Method(GET), ifTop)
import Snap.Core (Snap, method, Method(GET), ifTop, dir, route)
import Snap.Http.Server (httpServe)
import Snap.Http.Server.Config (Config)
import System.IO.Error (tryIOError)
import System.Time (getClockTime)
import qualified Text.JSON as J
import Ganeti.BasicTypes ( GenericResult(..), ResultT, runResultT, mkResultT
, withErrorT, isBad)
......@@ -61,8 +64,11 @@ import Ganeti.Daemon.Utils (handleMasterVerificationOptions)
import qualified Ganeti.HTools.Backend.Luxi as Luxi
import qualified Ganeti.HTools.Container as Container
import Ganeti.HTools.Loader (ClusterData(..), mergeData, checkData)
import Ganeti.Jobs (waitForJobs)
import Ganeti.Logging.Lifted
import qualified Ganeti.Luxi as L
import Ganeti.MaintD.Autorepairs (harepTasks)
import Ganeti.MaintD.MemoryState
import qualified Ganeti.Path as Path
import Ganeti.Runtime (GanetiDaemon(GanetiMaintd))
import Ganeti.Types (JobId(..))
......@@ -109,22 +115,42 @@ loadClusterData = do
return $ cdata { cdNodes = nl }
-- | Perform one round of maintenance
maintenance :: ResultT String IO ()
maintenance = do
maintenance :: IORef MemoryState -> ResultT String IO ()
maintenance memstate = do
delay <- withErrorT show $ runNewWConfdClient maintenanceRoundDelay
liftIO $ threadDelaySeconds delay
oldjobs <- getJobs memstate
logDebug $ "Jobs submitted in the last round: "
++ show (map fromJobId oldjobs)
luxiSocket <- liftIO Path.defaultQuerySocket
bracket (mkResultT . liftM (either (Bad . show) Ok)
. tryIOError $ L.getLuxiClient luxiSocket)
(liftIO . L.closeClient)
$ void . mkResultT . waitForJobs oldjobs
liftIO $ clearJobs memstate
logDebug "New round of maintenance started"
cData <- loadClusterData
let il = cdInstances cData
nl = cdNodes cData
nidxs = Set.fromList $ Container.keys nl
(nidxs', jobs) <- harepTasks (nl, il) nidxs
liftIO $ appendJobs memstate jobs
logDebug $ "Unaffected nodes " ++ show (Set.toList nidxs')
++ ", jobs submitted " ++ show (map fromJobId jobs)
-- | Expose a part of the memory state
exposeState :: J.JSON a => (MemoryState -> a) -> IORef MemoryState -> Snap ()
exposeState selector ref = do
state <- liftIO $ readIORef ref
plainJSON $ selector state
-- | The information to serve via HTTP
httpInterface :: Snap ()
httpInterface = ifTop (method GET $ plainJSON [1 :: Int])
httpInterface :: IORef MemoryState -> Snap ()
httpInterface memstate =
ifTop (method GET $ plainJSON [1 :: Int])
<|> dir "1" (ifTop (plainJSON J.JSNull)
<|> route [ ("jobs", exposeState msJobs memstate)
])
<|> error404
-- | Check function for luxid.
......@@ -138,10 +164,11 @@ prepMain opts _ = httpConfFromOpts GanetiMaintd opts
-- | Main function.
main :: MainFn CheckResult PrepResult
main _ _ httpConf = do
memstate <- newIORef emptyMemoryState
void . forkIO . forever $ do
res <- runResultT maintenance
res <- runResultT $ maintenance memstate
logDebug $ "Maintenance round done, result is " ++ show res
when (isBad res) $ do
logInfo "Backing off after a round with internal errors"
threadDelaySeconds C.maintdDefaultRoundDelay
httpServe httpConf httpInterface
httpServe httpConf $ httpInterface memstate
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment