Commit 58d29849 authored by Klaus Aehlig

Add reason-trail entry on failing jobs

When failing a job, add an entry to the reason trail, indicating
what made the job fail (e.g., failed to fork or detected job death).
Signed-off-by: Klaus Aehlig <aehlig@google.com>
Reviewed-by: Petr Pudlak <pudlak@google.com>
parent 4c67ea74
......@@ -320,7 +320,10 @@ checkForDeath state jobWS = do
jobWS' <- mkResultT $ readJobFromDisk jid
now <- liftIO currentTimestamp
qDir <- liftIO queueDir
let failedJob = failQueuedJob now $ jJob jobWS'
let reason = ( "gnt:daemon:wconfd:deathdetection"
, "detected death of job " ++ sjid
, reasonTrailTimestamp now )
failedJob = failQueuedJob reason now $ jJob jobWS'
cfg <- mkResultT . readIORef $ jqConfig state
writeAndReplicateJob cfg qDir failedJob
......
......@@ -76,6 +76,7 @@ import Control.Arrow (first, second)
import Control.Concurrent (forkIO, threadDelay)
import Control.Concurrent.MVar
import Control.Exception
import Control.Lens (over)
import Control.Monad
import Control.Monad.IO.Class
import Control.Monad.Trans (lift)
......@@ -99,12 +100,14 @@ import Ganeti.BasicTypes
import qualified Ganeti.Config as Config
import qualified Ganeti.Constants as C
import Ganeti.Errors (ErrorResult, ResultG)
import Ganeti.JQueue.Lens (qoInputL, validOpCodeL)
import Ganeti.JQueue.Objects
import Ganeti.JSON
import Ganeti.Logging
import Ganeti.Luxi
import Ganeti.Objects (ConfigData, Node)
import Ganeti.OpCodes
import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
import Ganeti.Path
import Ganeti.Query.Exec as Exec
import Ganeti.Rpc (executeRpcCall, ERpcError, logRpcErrors,
......@@ -268,15 +271,15 @@ cancelQueuedJob now job =
in job { qjOps = ops', qjEndTimestamp = Just now }
-- | Set the state of a QueuedOpCode to failed.
failOpCode :: Timestamp -> QueuedOpCode -> QueuedOpCode
failOpCode now op =
failOpCode :: ReasonElem -> Timestamp -> QueuedOpCode -> QueuedOpCode
failOpCode reason now op =
over (qoInputL . validOpCodeL . metaParamsL . opReasonL) (++ [reason])
op { qoStatus = OP_STATUS_ERROR, qoEndTimestamp = Just now }
-- | Transform a QueuedJob that has not been started into its failed form.
failQueuedJob :: Timestamp -> QueuedJob -> QueuedJob
failQueuedJob now job =
-- TODO: Add a reason trail message
let ops' = map (failOpCode now) $ qjOps job
failQueuedJob :: ReasonElem -> Timestamp -> QueuedJob -> QueuedJob
failQueuedJob reason now job =
let ops' = map (failOpCode reason now) $ qjOps job
in job { qjOps = ops', qjEndTimestamp = Just now }
-- | Job file prefix.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment