Commit 58d29849 authored by Klaus Aehlig
Browse files

Add reason-trail entry on failing jobs



When failing a job, add an entry to the reason trail, indicating
what made the job fail (e.g., failed to fork or detected job death).
Signed-off-by: Klaus Aehlig <aehlig@google.com>
Reviewed-by: Petr Pudlak <pudlak@google.com>
parent 4c67ea74
...@@ -320,7 +320,10 @@ checkForDeath state jobWS = do ...@@ -320,7 +320,10 @@ checkForDeath state jobWS = do
jobWS' <- mkResultT $ readJobFromDisk jid jobWS' <- mkResultT $ readJobFromDisk jid
now <- liftIO currentTimestamp now <- liftIO currentTimestamp
qDir <- liftIO queueDir qDir <- liftIO queueDir
let failedJob = failQueuedJob now $ jJob jobWS' let reason = ( "gnt:daemon:wconfd:deathdetection"
, "detected death of job " ++ sjid
, reasonTrailTimestamp now )
failedJob = failQueuedJob reason now $ jJob jobWS'
cfg <- mkResultT . readIORef $ jqConfig state cfg <- mkResultT . readIORef $ jqConfig state
writeAndReplicateJob cfg qDir failedJob writeAndReplicateJob cfg qDir failedJob
......
...@@ -76,6 +76,7 @@ import Control.Arrow (first, second) ...@@ -76,6 +76,7 @@ import Control.Arrow (first, second)
import Control.Concurrent (forkIO, threadDelay) import Control.Concurrent (forkIO, threadDelay)
import Control.Concurrent.MVar import Control.Concurrent.MVar
import Control.Exception import Control.Exception
import Control.Lens (over)
import Control.Monad import Control.Monad
import Control.Monad.IO.Class import Control.Monad.IO.Class
import Control.Monad.Trans (lift) import Control.Monad.Trans (lift)
...@@ -99,12 +100,14 @@ import Ganeti.BasicTypes ...@@ -99,12 +100,14 @@ import Ganeti.BasicTypes
import qualified Ganeti.Config as Config import qualified Ganeti.Config as Config
import qualified Ganeti.Constants as C import qualified Ganeti.Constants as C
import Ganeti.Errors (ErrorResult, ResultG) import Ganeti.Errors (ErrorResult, ResultG)
import Ganeti.JQueue.Lens (qoInputL, validOpCodeL)
import Ganeti.JQueue.Objects import Ganeti.JQueue.Objects
import Ganeti.JSON import Ganeti.JSON
import Ganeti.Logging import Ganeti.Logging
import Ganeti.Luxi import Ganeti.Luxi
import Ganeti.Objects (ConfigData, Node) import Ganeti.Objects (ConfigData, Node)
import Ganeti.OpCodes import Ganeti.OpCodes
import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
import Ganeti.Path import Ganeti.Path
import Ganeti.Query.Exec as Exec import Ganeti.Query.Exec as Exec
import Ganeti.Rpc (executeRpcCall, ERpcError, logRpcErrors, import Ganeti.Rpc (executeRpcCall, ERpcError, logRpcErrors,
...@@ -268,15 +271,15 @@ cancelQueuedJob now job = ...@@ -268,15 +271,15 @@ cancelQueuedJob now job =
in job { qjOps = ops', qjEndTimestamp = Just now } in job { qjOps = ops', qjEndTimestamp = Just now }
-- | Set the state of a QueuedOpCode to failed. -- | Set the state of a QueuedOpCode to failed.
failOpCode :: Timestamp -> QueuedOpCode -> QueuedOpCode failOpCode :: ReasonElem -> Timestamp -> QueuedOpCode -> QueuedOpCode
failOpCode now op = failOpCode reason now op =
over (qoInputL . validOpCodeL . metaParamsL . opReasonL) (++ [reason])
op { qoStatus = OP_STATUS_ERROR, qoEndTimestamp = Just now } op { qoStatus = OP_STATUS_ERROR, qoEndTimestamp = Just now }
-- | Transform a QueuedJob that has not been started into its failed form. -- | Transform a QueuedJob that has not been started into its failed form.
failQueuedJob :: Timestamp -> QueuedJob -> QueuedJob failQueuedJob :: ReasonElem -> Timestamp -> QueuedJob -> QueuedJob
failQueuedJob now job = failQueuedJob reason now job =
-- TODO: Add a reason trail message let ops' = map (failOpCode reason now) $ qjOps job
let ops' = map (failOpCode now) $ qjOps job
in job { qjOps = ops', qjEndTimestamp = Just now } in job { qjOps = ops', qjEndTimestamp = Just now }
-- | Job file prefix. -- | Job file prefix.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment