#
#

# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Module implementing the job queue handling."""

import os
import logging
import threading
import errno
import re
import time

from ganeti import constants
from ganeti import serializer
from ganeti import workerpool
from ganeti import opcodes
from ganeti import errors
from ganeti import mcpu
from ganeti import utils
from ganeti import rpc


JOBQUEUE_THREADS = 5


class _QueuedOpCode(object):
  """Encasulates an opcode object.

47
  Access is synchronized by the '_lock' attribute.
Michael Hanselmann's avatar
Michael Hanselmann committed
48

49
50
51
  The 'log' attribute holds the execution log and consists of tuples
  of the form (timestamp, level, message).

Michael Hanselmann's avatar
Michael Hanselmann committed
52
53
  """
  def __init__(self, op):
54
    self.__Setup(op, constants.OP_STATUS_QUEUED, None, [])
55

56
  def __Setup(self, input_, status, result, log):
57
    self._lock = threading.Lock()
58
    self.input = input_
59
60
    self.status = status
    self.result = result
61
    self.log = log

  @classmethod
  def Restore(cls, state):
    obj = object.__new__(cls)
    obj.__Setup(opcodes.OpCode.LoadOpCode(state["input"]),
                state["status"], state["result"], state["log"])
    return obj

  @utils.LockedMethod
  def Serialize(self):
    return {
      "input": self.input.__getstate__(),
      "status": self.status,
      "result": self.result,
      "log": self.log,
      }
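
  # Serialize/Restore give a JSON-friendly round trip for a queued opcode.
  # Rough sketch (illustrative only):
  #
  #   state = qop.Serialize()
  #   qop2 = _QueuedOpCode.Restore(state)
  #   assert qop2.status == qop.status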

  @utils.LockedMethod
  def GetInput(self):
    """Returns the original opcode.

    """
    return self.input

  @utils.LockedMethod
  def SetStatus(self, status, result):
    """Update the opcode status and result.

    """
    self.status = status
    self.result = result

  @utils.LockedMethod
  def GetStatus(self):
    """Get the opcode status.

    """
    return self.status

  @utils.LockedMethod
  def GetResult(self):
    """Get the opcode result.

    """
    return self.result

  @utils.LockedMethod
  def Log(self, *args):
    """Append a log entry.

    """
    assert len(args) < 3

    if len(args) == 1:
      log_type = constants.ELOG_MESSAGE
      log_msg = args[0]
    else:
      log_type, log_msg = args
    self.log.append((time.time(), log_type, log_msg))

  @utils.LockedMethod
  def RetrieveLog(self, start_at=0):
    """Retrieve (a part of) the execution log.

    """
    return self.log[start_at:]
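
  # Feedback callers therefore pass either a bare message or an explicit
  # (type, message) pair, e.g. (sketch):
  #
  #   qop.Log("waiting for disk sync")
  #   qop.Log(constants.ELOG_MESSAGE, "waiting for disk sync")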


class _QueuedJob(object):
  """In-memory job representation.

  This is what we use to track the user-submitted jobs.

  """
  def __init__(self, storage, job_id, ops):
    if not ops:
      # TODO
      raise Exception("No opcodes")

    self.__Setup(storage, job_id, [_QueuedOpCode(op) for op in ops], -1)

  def __Setup(self, storage, job_id, ops, run_op_index):
    self._lock = threading.Lock()
    self.storage = storage
    self.id = job_id
    self._ops = ops
    self.run_op_index = run_op_index

  @classmethod
  def Restore(cls, storage, state):
    obj = object.__new__(cls)
    op_list = [_QueuedOpCode.Restore(op_state) for op_state in state["ops"]]
    obj.__Setup(storage, state["id"], op_list, state["run_op_index"])
    return obj

  def Serialize(self):
    return {
      "id": self.id,
      "ops": [op.Serialize() for op in self._ops],
      "run_op_index": self.run_op_index,
      }
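
  # On disk (see DiskJobStorage below) this dict is dumped as JSON, roughly
  # of the form (sketch; the "input" contents depend on the opcode):
  #
  #   {"id": 42, "run_op_index": -1,
  #    "ops": [{"input": {...}, "status": "queued",
  #             "result": null, "log": []}]}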

  def SetUnclean(self, msg):
    try:
      for op in self._ops:
        op.SetStatus(constants.OP_STATUS_ERROR, msg)
    finally:
      self.storage.UpdateJob(self)

  def GetStatus(self):
    status = constants.JOB_STATUS_QUEUED

    all_success = True
    for op in self._ops:
      op_status = op.GetStatus()
      if op_status == constants.OP_STATUS_SUCCESS:
        continue

      all_success = False

      if op_status == constants.OP_STATUS_QUEUED:
        pass
      elif op_status == constants.OP_STATUS_RUNNING:
        status = constants.JOB_STATUS_RUNNING
      elif op_status == constants.OP_STATUS_ERROR:
        status = constants.JOB_STATUS_ERROR
        # The whole job fails if one opcode failed
        break

    if all_success:
      status = constants.JOB_STATUS_SUCCESS

    return status
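
  # Aggregation example (sketch): opcode statuses (success, queued) yield a
  # queued job, (success, running) yields running, a single error makes the
  # whole job an error, and all-success yields success.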

  @utils.LockedMethod
  def GetRunOpIndex(self):
    return self.run_op_index

  def Run(self, proc):
    """Job executor.

    This function processes this job in the context of the given
    processor instance.

    Args:
    - proc: Ganeti Processor to run the job with

    """
    try:
      count = len(self._ops)
      for idx, op in enumerate(self._ops):
        try:
          logging.debug("Op %s/%s: Starting %s", idx + 1, count, op)

          self._lock.acquire()
          try:
            self.run_op_index = idx
          finally:
            self._lock.release()

          op.SetStatus(constants.OP_STATUS_RUNNING, None)
          self.storage.UpdateJob(self)

          result = proc.ExecOpCode(op.input, op.Log)

          op.SetStatus(constants.OP_STATUS_SUCCESS, result)
          self.storage.UpdateJob(self)
          logging.debug("Op %s/%s: Successfully finished %s",
                        idx + 1, count, op)
        except Exception, err:
          try:
            op.SetStatus(constants.OP_STATUS_ERROR, str(err))
            logging.debug("Op %s/%s: Error in %s", idx + 1, count, op)
          finally:
            self.storage.UpdateJob(self)
          raise

    except errors.GenericError, err:
      logging.exception("ganeti exception %s", err)
    except Exception, err:
      logging.exception("unhandled exception %s", err)
    except:
      logging.exception("unhandled unknown exception")


class _JobQueueWorker(workerpool.BaseWorker):
  def RunTask(self, job):
    logging.debug("Worker %s processing job %s",
                  self.worker_id, job.id)
    # TODO: feedback function
    proc = mcpu.Processor(self.pool.context)
    try:
      job.Run(proc)
    finally:
      logging.debug("Worker %s finished job %s, status = %s",
                    self.worker_id, job.id, job.GetStatus())


class _JobQueueWorkerPool(workerpool.WorkerPool):
  def __init__(self, context):
    super(_JobQueueWorkerPool, self).__init__(JOBQUEUE_THREADS,
                                              _JobQueueWorker)
    self.context = context


class DiskJobStorage(object):
  _RE_JOB_FILE = re.compile(r"^job-(%s)$" % constants.JOB_ID_TEMPLATE)

  def __init__(self):
    self._lock = threading.Lock()
    self._memcache = {}
    self._my_hostname = utils.HostInfo().name

    # Make sure our directory exists
    try:
      os.mkdir(constants.QUEUE_DIR, 0700)
    except OSError, err:
      if err.errno not in (errno.EEXIST, ):
        raise

    # Get queue lock
    self.lock_fd = open(constants.JOB_QUEUE_LOCK_FILE, "w")
    try:
      utils.LockFile(self.lock_fd)
    except:
      self.lock_fd.close()
      raise

    # Read version
    try:
      version_fd = open(constants.JOB_QUEUE_VERSION_FILE, "r")
    except IOError, err:
      if err.errno not in (errno.ENOENT, ):
        raise

      # Setup a new queue
      self._InitQueueUnlocked()

      # Try to open again
      version_fd = open(constants.JOB_QUEUE_VERSION_FILE, "r")

    try:
      # Try to read version
      version = int(version_fd.read(128))

      # Verify version
      if version != constants.JOB_QUEUE_VERSION:
        raise errors.JobQueueError("Found version %s, expected %s" %
                                   (version, constants.JOB_QUEUE_VERSION))
    finally:
      version_fd.close()

    self._last_serial = self._ReadSerial()
    if self._last_serial is None:
      raise errors.ConfigurationError("Can't read/parse the job queue serial"
                                      " file")

  @staticmethod
  def _ReadSerial():
    """Try to read the job serial file.

    @rtype: None or int
    @return: If the serial can be read, then it is returned. Otherwise None
             is returned.

    """
    try:
      serial_fd = open(constants.JOB_QUEUE_SERIAL_FILE, "r")
      try:
        # Read last serial
        serial = int(serial_fd.read(1024).strip())
      finally:
        serial_fd.close()
    except (ValueError, EnvironmentError):
      serial = None

    return serial
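
  # The serial file holds just the last used job number followed by a
  # newline, e.g. "42\n"; _NewSerialUnlocked below increments and rewrites it.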

  def Close(self):
    assert self.lock_fd, "Queue should be open"

    self.lock_fd.close()
    self.lock_fd = None

  def _InitQueueUnlocked(self):
    assert self.lock_fd, "Queue should be open"

    utils.WriteFile(constants.JOB_QUEUE_VERSION_FILE,
                    data="%s\n" % constants.JOB_QUEUE_VERSION)
    if self._ReadSerial() is None:
      utils.WriteFile(constants.JOB_QUEUE_SERIAL_FILE,
                      data="%s\n" % 0)

  def _NewSerialUnlocked(self, nodes):
    """Generates a new job identifier.

    Job identifiers are unique during the lifetime of a cluster.

    Returns: A string representing the job identifier.

    """
    assert self.lock_fd, "Queue should be open"

    # New number
    serial = self._last_serial + 1

    # Write to file
    utils.WriteFile(constants.JOB_QUEUE_SERIAL_FILE,
                    data="%s\n" % serial)

    # Keep it only if we were able to write the file
    self._last_serial = serial

    # Distribute the serial to the other nodes
    try:
      nodes.remove(self._my_hostname)
    except ValueError:
      pass

    result = rpc.call_upload_file(nodes, constants.JOB_QUEUE_SERIAL_FILE)
    for node in nodes:
      if not result[node]:
        logging.error("copy of job queue file to node %s failed", node)

    return str(serial)

  def _GetJobPath(self, job_id):
    return os.path.join(constants.QUEUE_DIR, "job-%s" % job_id)

  def _GetJobIDsUnlocked(self, archived=False):
    """Return all known job IDs.

    If the parameter archived is True, archived job IDs will be
    included. Currently this argument is unused.

    The method only looks at disk because it's a requirement that all
    jobs are present on disk (so in the _memcache we don't have any
    extra IDs).

    """
    jfiles = self._ListJobFiles()
    jlist = [int(m.group(1)) for m in
             [self._RE_JOB_FILE.match(name) for name in jfiles]]
    jlist.sort()
    return jlist
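
  # For example (sketch): job files ["job-1", "job-10", "job-3"] in QUEUE_DIR
  # yield the numerically sorted ID list [1, 3, 10].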

  def _ListJobFiles(self):
    assert self.lock_fd, "Queue should be open"

    return [name for name in utils.ListVisibleFiles(constants.QUEUE_DIR)
            if self._RE_JOB_FILE.match(name)]

  def _LoadJobUnlocked(self, job_id):
    assert self.lock_fd, "Queue should be open"

    if job_id in self._memcache:
      logging.debug("Found job %s in memcache", job_id)
      return self._memcache[job_id]

    filepath = self._GetJobPath(job_id)
    logging.debug("Loading job from %s", filepath)
    try:
      fd = open(filepath, "r")
    except IOError, err:
      if err.errno in (errno.ENOENT, ):
        return None
      raise
    try:
      data = serializer.LoadJson(fd.read())
    finally:
      fd.close()

    job = _QueuedJob.Restore(self, data)
    self._memcache[job_id] = job
    logging.debug("Added job %s to the cache", job_id)
    return job

  def _GetJobsUnlocked(self, job_ids):
    if not job_ids:
      job_ids = self._GetJobIDsUnlocked()

    return [self._LoadJobUnlocked(job_id) for job_id in job_ids]

  @utils.LockedMethod
  def GetJobs(self, job_ids):
    return self._GetJobsUnlocked(job_ids)

  @utils.LockedMethod
  def AddJob(self, ops, nodes):
    """Create and store on disk a new job.

    @type ops: list
    @param ops: The list of OpCodes that will become the new job.
    @type nodes: list
    @param nodes: The list of nodes to which the new job serial will be
                  distributed.

    """
    assert self.lock_fd, "Queue should be open"

    # Get job identifier
    job_id = self._NewSerialUnlocked(nodes)
    job = _QueuedJob(self, job_id, ops)

    # Write to disk
    self._UpdateJobUnlocked(job)

    logging.debug("Added new job %s to the cache", job_id)
    self._memcache[job_id] = job

    return job

  def _UpdateJobUnlocked(self, job):
    assert self.lock_fd, "Queue should be open"

    filename = self._GetJobPath(job.id)
    logging.debug("Writing job %s to %s", job.id, filename)
    utils.WriteFile(filename,
                    data=serializer.DumpJson(job.Serialize(), indent=False))
    self._CleanCacheUnlocked([job.id])

  def _CleanCacheUnlocked(self, exclude):
    """Clean the memory cache.

    The exclude argument contains job IDs that should not be
    cleaned.

    """
    assert isinstance(exclude, list)
    for job in self._memcache.values():
      if job.id in exclude:
        continue
      if job.GetStatus() not in (constants.JOB_STATUS_QUEUED,
                                 constants.JOB_STATUS_RUNNING):
        logging.debug("Cleaning job %s from the cache", job.id)
        try:
          del self._memcache[job.id]
        except KeyError:
          pass

  @utils.LockedMethod
  def UpdateJob(self, job):
    return self._UpdateJobUnlocked(job)

  def ArchiveJob(self, job_id):
    raise NotImplementedError()


class JobQueue:
  """The job queue.

   """
  def __init__(self, context):
    self._lock = threading.Lock()
    self._jobs = DiskJobStorage()
    self._wpool = _JobQueueWorkerPool(context)

    for job in self._jobs.GetJobs(None):
      status = job.GetStatus()
      if status in (constants.JOB_STATUS_QUEUED, ):
        self._wpool.AddTask(job)

      elif status in (constants.JOB_STATUS_RUNNING, ):
        logging.warning("Unfinished job %s found: %s", job.id, job)
        job.SetUnclean("Unclean master daemon shutdown")

  @utils.LockedMethod
  def SubmitJob(self, ops, nodes):
    """Add a new job to the queue.

    This enters the job into our job queue and also puts it on the new
    queue, in order for it to be picked up by the queue processors.

    @type ops: list
    @param ops: the sequence of opcodes that will become the new job
    @type nodes: list
    @param nodes: the list of nodes to which the queue should be
                  distributed

    """
    job = self._jobs.AddJob(ops, nodes)

    # Add to worker pool
    self._wpool.AddTask(job)

    return job.id

  def ArchiveJob(self, job_id):
    raise NotImplementedError()

  def CancelJob(self, job_id):
    raise NotImplementedError()

  def _GetJobInfo(self, job, fields):
    row = []
    for fname in fields:
      if fname == "id":
        row.append(job.id)
      elif fname == "status":
        row.append(job.GetStatus())
      elif fname == "ops":
        row.append([op.GetInput().__getstate__() for op in job._ops])
      elif fname == "opresult":
        row.append([op.GetResult() for op in job._ops])
      elif fname == "opstatus":
        row.append([op.GetStatus() for op in job._ops])
      elif fname == "ticker":
        ji = job.GetRunOpIndex()
        if ji < 0:
          lmsg = None
        else:
          lmsg = job._ops[ji].RetrieveLog(-1)
          # message might be empty here
          if lmsg:
            lmsg = lmsg[0]
          else:
            lmsg = None
        row.append(lmsg)
      else:
        raise errors.OpExecError("Invalid job query field '%s'" % fname)
    return row

  def QueryJobs(self, job_ids, fields):
    """Returns a list of jobs in queue.

    Args:
    - job_ids: Sequence of job identifiers or None for all
    - fields: Names of fields to return

    """
    self._lock.acquire()
    try:
      jobs = []

      for job in self._jobs.GetJobs(job_ids):
        if job is None:
          jobs.append(None)
        else:
          jobs.append(self._GetJobInfo(job, fields))

      return jobs
    finally:
      self._lock.release()
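
  # Example result shape (sketch, assuming the usual status string values):
  #
  #   QueryJobs(None, ["id", "status"]) -> [[1, "success"], [2, "running"]]
  #
  # with a None entry for any job ID that could not be loaded.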

  @utils.LockedMethod
  def Shutdown(self):
    """Stops the job queue.

    """
    self._wpool.TerminateWorkers()
    self._jobs.Close()