Commit c666e6aa authored by Petr Pudlak
Browse files

When checking job death, check if its lock is the Luxi lock



In this case, the call trying to acquire a shared lock always succeeds,
because the daemon itself already holds the exclusive lock, and so the
job is falsely reported as dead.
Signed-off-by: Petr Pudlak <pudlak@google.com>
Reviewed-by: Klaus Aehlig <aehlig@google.com>
parent cab9400a
...@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA ...@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
module Ganeti.JQScheduler module Ganeti.JQScheduler
( JQStatus ( JQStatus
, jqLivelock
, emptyJQStatus , emptyJQStatus
, initJQScheduler , initJQScheduler
, enqueueNewJobs , enqueueNewJobs
......
...@@ -560,8 +560,10 @@ waitForJob jid tmout = do ...@@ -560,8 +560,10 @@ waitForJob jid tmout = do
-- | Try to cancel a job that has already been handed over to execution, -- | Try to cancel a job that has already been handed over to execution,
-- by terminating the process. -- by terminating the process.
cancelJob :: JobId -> IO (ErrorResult (Bool, String)) cancelJob :: Livelock -- ^ Luxi's livelock path
cancelJob jid = runResultT $ do -> JobId -- ^ the job to cancel
-> IO (ErrorResult (Bool, String))
cancelJob luxiLivelock jid = runResultT $ do
-- we can't terminate the job if it's just being started, so -- we can't terminate the job if it's just being started, so
-- retry several times in such a case -- retry several times in such a case
result <- runMaybeT . msum . flip map [0..5 :: Int] $ \tryNo -> do result <- runMaybeT . msum . flip map [0..5 :: Int] $ \tryNo -> do
...@@ -573,7 +575,9 @@ cancelJob jid = runResultT $ do ...@@ -573,7 +575,9 @@ cancelJob jid = runResultT $ do
qDir <- liftIO queueDir qDir <- liftIO queueDir
(job, _) <- lift . mkResultT $ loadJobFromDisk qDir True jid (job, _) <- lift . mkResultT $ loadJobFromDisk qDir True jid
let jName = ("Job " ++) . show . fromJobId . qjId $ job let jName = ("Job " ++) . show . fromJobId . qjId $ job
dead <- maybe (return False) (liftIO . isDead) (qjLivelock job) dead <- maybe (return False) (liftIO . isDead)
. mfilter (/= luxiLivelock)
$ qjLivelock job
case qjProcessId job of case qjProcessId job of
_ | dead -> _ | dead ->
return (True, jName ++ " has been already dead") return (True, jName ++ " has been already dead")
......
...@@ -347,7 +347,7 @@ handleCall _ qstat cfg (CancelJob jid) = do ...@@ -347,7 +347,7 @@ handleCall _ qstat cfg (CancelJob jid) = do
writeAndReplicateJob cfg qDir job' writeAndReplicateJob cfg qDir job'
Ok False -> do Ok False -> do
logDebug $ jName ++ " not queued; trying to cancel directly" logDebug $ jName ++ " not queued; trying to cancel directly"
fmap showJSON <$> cancelJob jid fmap showJSON <$> cancelJob (jqLivelock qstat) jid
Bad s -> return . Ok . showJSON $ (False, s) Bad s -> return . Ok . showJSON $ (False, s)
handleCall qlock _ cfg (ArchiveJob jid) = do handleCall qlock _ cfg (ArchiveJob jid) = do
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment