backend.py 111 KB
Newer Older
Iustin Pop's avatar
Iustin Pop committed
1
#
Iustin Pop's avatar
Iustin Pop committed
2
3
#

4
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
Iustin Pop's avatar
Iustin Pop committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


22
23
24
25
"""Functions used by the node daemon

@var _ALLOWED_UPLOAD_FILES: denotes which files are accepted in
     the L{UploadFile} function
@var _ALLOWED_CLEAN_DIRS: denotes which directories are accepted
     in the L{_CleanDirectory} function

"""
Iustin Pop's avatar
Iustin Pop committed
30

31
# pylint: disable=E1103
Iustin Pop's avatar
Iustin Pop committed
32
33
34
35
36

# E1103: %s %r has no %r member (but some types could not be
# inferred), because the _TryOSFromDisk returns either (True, os_obj)
# or (False, "string") which confuses pylint

Iustin Pop's avatar
Iustin Pop committed
37
38
39
40
41
42
43
44

import os
import os.path
import shutil
import time
import stat
import errno
import re
45
import random
46
import logging
47
import tempfile
48
49
import zlib
import base64
50
import signal
Iustin Pop's avatar
Iustin Pop committed
51
52
53
54
55
56
57
58

from ganeti import errors
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti import constants
from ganeti import bdev
from ganeti import objects
59
from ganeti import ssconf
60
from ganeti import serializer
61
from ganeti import netutils
62
from ganeti import runtime
63
from ganeti import mcpu
Iustin Pop's avatar
Iustin Pop committed
64
65


66
#: File read by L{GetNodeInfo} to fill in the "bootid" entry of its result
_BOOT_ID_PATH = "/proc/sys/kernel/random/boot_id"
67
68
69
70
#: Directories which L{_CleanDirectory} accepts as targets; any other
#: path makes it fail
_ALLOWED_CLEAN_DIRS = frozenset((
  constants.DATA_DIR,
  constants.JOB_QUEUE_ARCHIVE_DIR,
  constants.QUEUE_DIR,
  constants.CRYPTO_KEYS_DIR,
  ))
73
74
75
# Upper bound for SSL certificate validity; the expression equals one week
# when read as seconds (NOTE(review): the unit is presumed, the users of
# this constant are not visible in this part of the file - confirm)
_MAX_SSL_CERT_VALIDITY = 7 * 24 * 60 * 60
# File names for an X509 key and its certificate (users not shown here)
_X509_KEY_FILE = "key"
_X509_CERT_FILE = "cert"
76
77
78
# File names for status, PID and CA files; the code using these _IES_*
# names is not visible in this part of the file
_IES_STATUS_FILE = "status"
_IES_PID_FILE = "pid"
_IES_CA_FILE = "ca"
79

80
#: Valid LVS output line regex
81
_LVSLINE_REGEX = re.compile("^ *([^|]+)\|([^|]+)\|([0-9.]+)\|([^|]{6})\|?$")
82

83

84
85
86
87
88
89
90
class RPCFail(Exception):
  """Class denoting RPC failure.

  Its argument is the error message. Within this module it is raised
  through the L{_Fail} helper.

  """

91

92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
def _Fail(msg, *args, **kwargs):
  """Log an error and then raise an RPCFail exception.

  This exception is then handled specially in the ganeti daemon and
  turned into a 'failed' return type. As such, this function is a
  useful shortcut for logging the error and returning it to the master
  daemon.

  @type msg: string
  @param msg: the text of the exception; when extra positional
      arguments are passed, it is used as a format string for them
  @keyword log: if passed with a false value, nothing is logged
  @keyword exc: if passed with a true value, the message is logged via
      C{logging.exception} instead of C{logging.error}
  @raise RPCFail

  """
  if args:
    msg = msg % args

  # Logging is on by default and only skipped on explicit request
  if kwargs.get("log", True):
    if kwargs.get("exc"):
      logging.exception(msg)
    else:
      logging.error(msg)

  raise RPCFail(msg)


Michael Hanselmann's avatar
Michael Hanselmann committed
115
def _GetConfig():
  """Create and return a new SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a newly built SimpleStore instance

  """
  store = ssconf.SimpleStore()
  return store
Michael Hanselmann's avatar
Michael Hanselmann committed
123
124


125
def _GetSshRunner(cluster_name):
  """Create and return a new SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, passed on to the
      SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: a newly built SshRunner instance

  """
  runner = ssh.SshRunner(cluster_name)
  return runner
136
137


138
139
140
141
142
143
144
145
146
def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  The input is an (encoding, content) pair; unencoded content is
  returned as is, while zlib+base64 content is base64-decoded and then
  decompressed.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data
  @raise AssertionError: if the input is not a two-element list or
      tuple, or if the encoding is not a known one

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2

  encoding = data[0]
  content = data[1]

  if encoding == constants.RPC_ENCODING_NONE:
    return content

  if encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    raw = base64.b64decode(content)
    return zlib.decompress(raw)

  raise AssertionError("Unknown data encoding")


158
def _CleanDirectory(path, exclude=None):
  """Removes all regular files in a directory.

  Only the visible files directly inside the directory are considered;
  symbolic links and anything that is not a regular file are left in
  place. A path which is not an existing directory is silently ignored.

  @type path: str
  @param path: the directory to clean; must be one of
      L{_ALLOWED_CLEAN_DIRS}
  @type exclude: list
  @param exclude: list of files to be excluded, defaults
      to the empty list
  @raise RPCFail: if the path is not an allowed clean target

  """
  if path not in _ALLOWED_CLEAN_DIRS:
    _Fail("Path passed to _CleanDirectory not in allowed clean targets: '%s'",
          path)

  if not os.path.isdir(path):
    return

  if exclude is None:
    protected = []
  else:
    # Normalize excluded paths, so they compare equal to the joined names
    protected = [os.path.normpath(name) for name in exclude]

  for entry in utils.ListVisibleFiles(path):
    candidate = utils.PathJoin(path, entry)
    if candidate in protected:
      continue
    if not os.path.isfile(candidate) or os.path.islink(candidate):
      continue
    utils.RemoveFile(candidate)


188
189
190
191
192
193
def _BuildUploadFileList():
  """Build the list of allowed upload files.

  This is abstracted so that it's built only once at module import time.

  @rtype: frozenset
  @return: the fixed set of cluster files plus, for every hypervisor
      type, the first element of what its GetAncillaryFiles returns

  """
  uploadable = [
    constants.CLUSTER_CONF_FILE,
    constants.ETC_HOSTS,
    constants.SSH_KNOWN_HOSTS_FILE,
    constants.VNC_PASSWORD_FILE,
    constants.RAPI_CERT_FILE,
    constants.SPICE_CERT_FILE,
    constants.SPICE_CACERT_FILE,
    constants.RAPI_USERS_FILE,
    constants.CONFD_HMAC_KEY,
    constants.CLUSTER_DOMAIN_SECRET_FILE,
    ]

  for hv_name in constants.HYPER_TYPES:
    ancillary = hypervisor.GetHypervisorClass(hv_name).GetAncillaryFiles()
    uploadable.extend(ancillary[0])

  return frozenset(uploadable)
212
213
214
215
216


#: Set of allowed upload files, computed once when the module is imported
_ALLOWED_UPLOAD_FILES = _BuildUploadFileList()


217
def JobQueuePurge():
  """Removes job queue files and archived jobs.

  The queue directory is cleaned except for the job queue lock file,
  then the archive directory is cleaned completely.

  @rtype: None

  """
  kept_files = [constants.JOB_QUEUE_LOCK_FILE]
  _CleanDirectory(constants.QUEUE_DIR, exclude=kept_files)
  _CleanDirectory(constants.JOB_QUEUE_ARCHIVE_DIR)


228
229
230
231
232
233
234
def GetMasterInfo():
  """Returns master information.

  This is an utility function to compute master information, either
  for consumption here or from the node daemon.

  @rtype: tuple
235
236
  @return: master_netdev, master_ip, master_name, primary_ip_family,
    master_netmask
237
  @raise RPCFail: in case of errors
238
239
240

  """
  try:
Michael Hanselmann's avatar
Michael Hanselmann committed
241
242
243
    cfg = _GetConfig()
    master_netdev = cfg.GetMasterNetdev()
    master_ip = cfg.GetMasterIP()
244
    master_netmask = cfg.GetMasterNetmask()
Michael Hanselmann's avatar
Michael Hanselmann committed
245
    master_node = cfg.GetMasterNode()
246
    primary_ip_family = cfg.GetPrimaryIPFamily()
247
  except errors.ConfigurationError, err:
Iustin Pop's avatar
Iustin Pop committed
248
    _Fail("Cluster configuration incomplete: %s", err, exc=True)
249
250
  return (master_netdev, master_ip, master_node, primary_ip_family,
      master_netmask)
251
252


253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
def RunLocalHooks(hook_opcode, hooks_path, env_builder_fn):
  """Decorator that runs hooks before and after the decorated function.

  The hooks are run on the local node only. The post phase is run after
  the decorated function has returned; it is not run if the function
  raises an exception.

  @type hook_opcode: string
  @param hook_opcode: opcode of the hook
  @type hooks_path: string
  @param hooks_path: path of the hooks
  @type env_builder_fn: function
  @param env_builder_fn: function that returns a dictionary containing the
    environment variables for the hooks.
  @raise RPCFail: in case of pre-hook failure

  """
  def decorator(fn):
    def wrapper(*args, **kwargs):
      local_node = ssconf.GetMasterAndMyself()[1]
      # these hooks run locally, for both node lists
      target_nodes = ([local_node], [local_node])

      store = _GetConfig()
      runner = HooksRunner()
      master = mcpu.HooksMaster(hook_opcode, hooks_path, target_nodes,
                                runner.RunLocalHooks, None, env_builder_fn,
                                logging.warning, store.GetClusterName(),
                                store.GetMasterNode())

      master.RunPhase(constants.HOOKS_PHASE_PRE)
      outcome = fn(*args, **kwargs)
      master.RunPhase(constants.HOOKS_PHASE_POST)
      return outcome

    return wrapper

  return decorator


286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
def _BuildMasterIpHookEnv():
  """Builds environment variables for master IP hooks.

  @rtype: dict
  @return: the master network device and the master IP, as read from
      the configuration, under the keys MASTER_NETDEV and MASTER_IP

  """
  store = _GetConfig()
  return {
    "MASTER_NETDEV": store.GetMasterNetdev(),
    "MASTER_IP": store.GetMasterIP(),
    }


@RunLocalHooks(constants.FAKE_OP_MASTER_TURNUP, "master-ip-turnup",
               _BuildMasterIpHookEnv)
def ActivateMasterIp(master_ip, master_netmask, master_netdev, family):
  """Activate the IP address of the master daemon.

  If the master IP already answers on the node daemon port and is owned
  by this node, nothing is done; if it answers but is not owned by this
  node, the call fails. Otherwise the address is added to the master
  network device and then announced (arping for IPv4, ndisc6 for IPv6);
  the outcome of the announcement is ignored.

  @param master_ip: the master IP
  @param master_netmask: the master IP netmask
  @param master_netdev: the master network device
  @param family: the IP family
  @raise RPCFail: if someone else holds the master IP or if the address
      cannot be added to the network device

  """
  # GetMasterInfo will raise an exception if not able to return data
  # NOTE(review): this overwrites all four arguments with the values from
  # the cluster configuration, so whatever the caller passed is ignored;
  # confirm whether that is still intended
  master_netdev, master_ip, _, family, master_netmask = GetMasterInfo()

  if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
    if not netutils.IPAddress.Own(master_ip):
      # _Fail logs the message itself, so no separate logging call here
      _Fail("Someone else has the master ip, not activating")
    # we already have the ip:
    logging.debug("Master IP already configured, doing nothing")
    return

  ipcls = netutils.IPAddress.GetClassFromIpFamily(family)

  result = utils.RunCmd([constants.IP_COMMAND_PATH, "address", "add",
                         "%s/%s" % (master_ip, master_netmask),
                         "dev", master_netdev, "label",
                         "%s:0" % master_netdev])
  if result.failed:
    _Fail("Can't activate master IP: %s", result.output)

  # we ignore the exit code of the following cmds
  if ipcls == netutils.IP4Address:
    utils.RunCmd(["arping", "-q", "-U", "-c 3", "-I", master_netdev, "-s",
                  master_ip, master_ip])
  elif ipcls == netutils.IP6Address:
    try:
      utils.RunCmd(["ndisc6", "-q", "-r 3", master_ip, master_netdev])
    except errors.OpExecError:
      # TODO: Better error reporting
      logging.warning("Can't execute ndisc6, please install if missing")


def StartMasterDaemons(no_voting):
  """Activate local node as master node.

  The function will start the master daemons (ganeti-masterd and ganeti-rapi).

  @type no_voting: boolean
  @param no_voting: whether to start ganeti-masterd without a node vote
      but still non-interactively
  @rtype: None
  @raise RPCFail: if the daemon-util command fails

  """
  if no_voting:
    masterd_args = "--no-voting --yes-do-it"
  else:
    masterd_args = ""

  # The extra arguments are handed over through the environment
  env = {
    "EXTRA_MASTERD_ARGS": masterd_args,
    }

  result = utils.RunCmd([constants.DAEMON_UTIL, "start-master"], env=env)
  if result.failed:
    # _Fail already logs the message, so it must not be logged here too
    _Fail("Can't start Ganeti master: %s", result.output)
374

375

376
377
@RunLocalHooks(constants.FAKE_OP_MASTER_TURNDOWN, "master-ip-turndown",
               _BuildMasterIpHookEnv)
def DeactivateMasterIp(master_ip, master_netmask, master_netdev):
  """Deactivate the master IP on this node.

  A failure to remove the address is logged but not reported back.

  @param master_ip: the master IP
  @param master_netmask: the master IP netmask
  @param master_netdev: the master network device

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  del_cmd = [constants.IP_COMMAND_PATH, "address", "del",
             "%s/%s" % (master_ip, master_netmask),
             "dev", master_netdev]
  outcome = utils.RunCmd(del_cmd)
  if outcome.failed:
    # the error is only logged; otherwise the failure is ignored
    logging.error("Can't remove the master IP, error: %s", outcome.output)

396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412

def StopMasterDaemons():
  """Stop the master daemons on this node.

  Stop the master daemons (ganeti-masterd and ganeti-rapi) on this node.
  A failure of the stop command is logged but not reported back.

  @rtype: None

  """
  # TODO: log and report back to the caller the error failures; we
  # need to decide in which case we fail the RPC for this

  stop_cmd = [constants.DAEMON_UTIL, "stop-master"]
  outcome = utils.RunCmd(stop_cmd)
  if outcome.failed:
    logging.error("Could not stop Ganeti master, command %s had exitcode %s"
                  " and error %s",
                  outcome.cmd, outcome.exit_code, outcome.output)
Iustin Pop's avatar
Iustin Pop committed
413
414


415
def ChangeMasterNetmask(old_netmask, netmask, master_ip, master_netdev):
  """Change the netmask of the master IP.

  The address is first added with the new netmask and only then removed
  with the old one. Nothing is done if both netmasks are equal.

  @param old_netmask: the old value of the netmask
  @param netmask: the new value of the netmask
  @param master_ip: the master IP
  @param master_netdev: the master network device
  @raise RPCFail: if either of the two commands fails

  """
  if old_netmask == netmask:
    return

  label = "%s:0" % master_netdev

  # Same command line for both steps, only the action and netmask differ
  for (action, mask) in (("add", netmask), ("del", old_netmask)):
    outcome = utils.RunCmd([constants.IP_COMMAND_PATH, "address", action,
                            "%s/%s" % (master_ip, mask),
                            "dev", master_netdev, "label", label])
    if outcome.failed:
      _Fail("Could not change the master IP netmask")


442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
def EtcHostsModify(mode, host, ip):
  """Modify a host entry in /etc/hosts.

  @param mode: The mode to operate. Either add or remove entry
  @param host: The host to operate on
  @param ip: The ip associated with the entry; required when adding,
      must be empty when removing
  @raise RPCFail: if the mode is not supported or if the C{ip}
      parameter does not fit the mode

  """
  # The failures must actually be raised (which _Fail does); merely
  # instantiating RPCFail would let invalid requests go through
  if mode == constants.ETC_HOSTS_ADD:
    if not ip:
      _Fail("Mode 'add' needs 'ip' parameter, but parameter not"
            " present")
    utils.AddHostToEtcHosts(host, ip)
  elif mode == constants.ETC_HOSTS_REMOVE:
    if ip:
      _Fail("Mode 'remove' does not allow 'ip' parameter, but"
            " parameter is present")
    utils.RemoveHostFromEtcHosts(host)
  else:
    _Fail("Mode not supported")


464
def LeaveCluster(modify_ssh_setup):
  """Cleans up and remove the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster: the data and crypto key directories and the job
  queue are emptied, optionally the ssh keys are removed, the cluster
  secrets are deleted and the confd daemon is stopped. Errors in the
  ssh, secrets and daemon steps are logged and otherwise ignored.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  @param modify_ssh_setup: boolean

  """
  for directory in (constants.DATA_DIR, constants.CRYPTO_KEYS_DIR):
    _CleanDirectory(directory)
  JobQueuePurge()

  if modify_ssh_setup:
    try:
      (priv_key, pub_key, auth_keys) = \
        ssh.GetUserFiles(constants.GANETI_RUNAS)

      own_key = utils.ReadFile(pub_key)
      utils.RemoveAuthorizedKey(auth_keys, own_key)

      utils.RemoveFile(priv_key)
      utils.RemoveFile(pub_key)
    except errors.OpExecError:
      logging.exception("Error while processing ssh files")

  try:
    for secret in (constants.CONFD_HMAC_KEY,
                   constants.RAPI_CERT_FILE,
                   constants.SPICE_CERT_FILE,
                   constants.SPICE_CACERT_FILE,
                   constants.NODED_CERT_FILE):
      utils.RemoveFile(secret)
  except: # pylint: disable=W0702
    logging.exception("Error while removing cluster secrets")

  stop_confd = utils.RunCmd([constants.DAEMON_UTIL, "stop", constants.CONFD])
  if stop_confd.failed:
    logging.error("Command %s failed with exitcode %s and error %s",
                  stop_confd.cmd, stop_confd.exit_code, stop_confd.output)

  # Raise a custom exception (handled in ganeti-noded)
  raise errors.QuitGanetiException(True, "Shutdown scheduled")
508

Iustin Pop's avatar
Iustin Pop committed
509

510
def GetNodeInfo(vgname, hypervisor_type):
  """Gives back a hash with different information about the node.

  @type vgname: C{string}
  @param vgname: the name of the volume group to ask for disk space
      information; if None, no volume group data is returned
  @type hypervisor_type: C{str}
  @param hypervisor_type: the name of the hypervisor to ask for
      memory information; if None, the hypervisor is not queried
  @rtype: C{dict}
  @return: dictionary with the following keys:
      - vg_size is the size of the configured volume group in MiB
        (None if no volume group information was found)
      - vg_free is the free size of the volume group in MiB
        (None if no volume group information was found)
      - whatever the hypervisor's GetNodeInfo reports, merged in as is;
        according to the previous documentation this covers
        memory_dom0, memory_free, memory_total (in MiB) and hv_version
      - bootid is the content of L{_BOOT_ID_PATH}, without the
        trailing newline

  """
  info = {}

  if vgname is not None:
    vg_data = bdev.LogicalVolume.GetVGInfo([vgname])
    if vg_data:
      first_vg = vg_data[0]
      free_space = int(round(first_vg[0], 0))
      total_space = int(round(first_vg[1], 0))
    else:
      free_space = total_space = None
    info["vg_size"] = total_space
    info["vg_free"] = free_space

  if hypervisor_type is not None:
    hv_data = hypervisor.GetHypervisor(hypervisor_type).GetNodeInfo()
    if hv_data is not None:
      info.update(hv_data)

  boot_id = utils.ReadFile(_BOOT_ID_PATH, size=128)
  info["bootid"] = boot_id.rstrip("\n")

  return info
Iustin Pop's avatar
Iustin Pop committed
548
549


550
def VerifyNode(what, cluster_name):
Iustin Pop's avatar
Iustin Pop committed
551
552
  """Verify the status of the local node.

553
554
555
556
557
558
559
560
561
  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).
Iustin Pop's avatar
Iustin Pop committed
562

563
564
565
566
567
568
569
570
571
572
573
  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
      - filelist: list of files for which to compute checksums
      - nodelist: list of nodes we should check ssh communication with
      - node-net-test: list of nodes we should check node daemon port
        connectivity with
      - hypervisor: list with hypervisors to run the verify for
Iustin Pop's avatar
Iustin Pop committed
574
575
576
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks
Iustin Pop's avatar
Iustin Pop committed
577
578
579

  """
  result = {}
580
  my_name = netutils.Hostname.GetSysName()
581
  port = netutils.GetDaemonPort(constants.NODED)
582
  vm_capable = my_name not in what.get(constants.NV_VMNODES, [])
Iustin Pop's avatar
Iustin Pop committed
583

584
  if constants.NV_HYPERVISOR in what and vm_capable:
585
586
    result[constants.NV_HYPERVISOR] = tmp = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
587
588
589
590
591
      try:
        val = hypervisor.GetHypervisor(hv_name).Verify()
      except errors.HypervisorError, err:
        val = "Error while checking hypervisor: %s" % str(err)
      tmp[hv_name] = val
592

593
594
595
596
597
598
599
600
601
  if constants.NV_HVPARAMS in what and vm_capable:
    result[constants.NV_HVPARAMS] = tmp = []
    for source, hv_name, hvparms in what[constants.NV_HVPARAMS]:
      try:
        logging.info("Validating hv %s, %s", hv_name, hvparms)
        hypervisor.GetHypervisor(hv_name).ValidateParameters(hvparms)
      except errors.HypervisorError, err:
        tmp.append((source, hv_name, str(err)))

602
603
604
605
606
  if constants.NV_FILELIST in what:
    result[constants.NV_FILELIST] = utils.FingerprintFiles(
      what[constants.NV_FILELIST])

  if constants.NV_NODELIST in what:
607
608
609
610
611
612
613
614
615
616
617
618
619
620
    (nodes, bynode) = what[constants.NV_NODELIST]

    # Add nodes from other groups (different for each node)
    try:
      nodes.extend(bynode[my_name])
    except KeyError:
      pass

    # Use a random order
    random.shuffle(nodes)

    # Try to contact all nodes
    val = {}
    for node in nodes:
621
      success, message = _GetSshRunner(cluster_name).VerifyNodeHostname(node)
Iustin Pop's avatar
Iustin Pop committed
622
      if not success:
623
624
625
        val[node] = message

    result[constants.NV_NODELIST] = val
626
627
628

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
629
    my_pip = my_sip = None
630
    for name, pip, sip in what[constants.NV_NODENETTEST]:
631
632
633
634
635
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
636
637
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
638
    else:
639
      for name, pip, sip in what[constants.NV_NODENETTEST]:
640
        fail = []
641
        if not netutils.TcpPing(pip, port, source=my_pip):
642
643
          fail.append("primary")
        if sip != pip:
644
          if not netutils.TcpPing(sip, port, source=my_sip):
645
646
            fail.append("secondary")
        if fail:
647
648
649
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

650
651
652
653
654
  if constants.NV_MASTERIP in what:
    # FIXME: add checks on incoming data structures (here and in the
    # rest of the function)
    master_name, master_ip = what[constants.NV_MASTERIP]
    if master_name == my_name:
655
      source = constants.IP4_ADDRESS_LOCALHOST
656
657
    else:
      source = None
658
    result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
659
660
                                                  source=source)

661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
  if constants.NV_OOB_PATHS in what:
    result[constants.NV_OOB_PATHS] = tmp = []
    for path in what[constants.NV_OOB_PATHS]:
      try:
        st = os.stat(path)
      except OSError, err:
        tmp.append("error stating out of band helper: %s" % err)
      else:
        if stat.S_ISREG(st.st_mode):
          if stat.S_IMODE(st.st_mode) & stat.S_IXUSR:
            tmp.append(None)
          else:
            tmp.append("out of band helper %s is not executable" % path)
        else:
          tmp.append("out of band helper %s is not a file" % path)

677
  if constants.NV_LVLIST in what and vm_capable:
678
    try:
679
      val = GetVolumeList(utils.ListVolumeGroups().keys())
680
681
682
    except RPCFail, err:
      val = str(err)
    result[constants.NV_LVLIST] = val
683

684
  if constants.NV_INSTANCELIST in what and vm_capable:
685
686
687
688
689
690
    # GetInstanceList can fail
    try:
      val = GetInstanceList(what[constants.NV_INSTANCELIST])
    except RPCFail, err:
      val = str(err)
    result[constants.NV_INSTANCELIST] = val
691

692
  if constants.NV_VGLIST in what and vm_capable:
693
    result[constants.NV_VGLIST] = utils.ListVolumeGroups()
694

695
  if constants.NV_PVLIST in what and vm_capable:
696
697
698
699
    result[constants.NV_PVLIST] = \
      bdev.LogicalVolume.GetPVInfo(what[constants.NV_PVLIST],
                                   filter_allocatable=False)

700
  if constants.NV_VERSION in what:
701
702
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)
703

704
  if constants.NV_HVINFO in what and vm_capable:
705
706
    hyper = hypervisor.GetHypervisor(what[constants.NV_HVINFO])
    result[constants.NV_HVINFO] = hyper.GetNodeInfo()
707

708
  if constants.NV_DRBDLIST in what and vm_capable:
709
710
    try:
      used_minors = bdev.DRBD8.GetUsedDevs().keys()
711
    except errors.BlockDeviceError, err:
712
      logging.warning("Can't get used minors list", exc_info=True)
713
      used_minors = str(err)
714
715
    result[constants.NV_DRBDLIST] = used_minors

716
  if constants.NV_DRBDHELPER in what and vm_capable:
717
718
719
720
721
722
723
724
725
    status = True
    try:
      payload = bdev.BaseDRBD.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      logging.error("Can't get DRBD usermode helper: %s", str(err))
      status = False
      payload = str(err)
    result[constants.NV_DRBDHELPER] = (status, payload)

726
727
728
729
730
731
732
733
734
735
736
  if constants.NV_NODESETUP in what:
    result[constants.NV_NODESETUP] = tmpr = []
    if not os.path.isdir("/sys/block") or not os.path.isdir("/sys/class/net"):
      tmpr.append("The sysfs filesytem doesn't seem to be mounted"
                  " under /sys, missing required directories /sys/block"
                  " and /sys/class/net")
    if (not os.path.isdir("/proc/sys") or
        not os.path.isfile("/proc/sysrq-trigger")):
      tmpr.append("The procfs filesystem doesn't seem to be mounted"
                  " under /proc, missing required directory /proc/sys and"
                  " the file /proc/sysrq-trigger")
737
738
739
740

  if constants.NV_TIME in what:
    result[constants.NV_TIME] = utils.SplitTime(time.time())

741
  if constants.NV_OSLIST in what and vm_capable:
742
743
    result[constants.NV_OSLIST] = DiagnoseOS()

744
745
746
747
  if constants.NV_BRIDGES in what and vm_capable:
    result[constants.NV_BRIDGES] = [bridge
                                    for bridge in what[constants.NV_BRIDGES]
                                    if not utils.BridgeExists(bridge)]
748
  return result
Iustin Pop's avatar
Iustin Pop committed
749
750


751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
def GetBlockDevSizes(devices):
  """Return the size of the given block devices

  @type devices: list
  @param devices: list of block device nodes to query
  @rtype: dict
  @return:
    dictionary of all block devices under /dev (key). The value is their
    size in MiB.

    {'/dev/disk/by-uuid/123456-12321231-312312-312': 124}

  """
  DEV_PREFIX = "/dev/"
  blockdevs = {}

  for devpath in devices:
768
    if not utils.IsBelowDir(DEV_PREFIX, devpath):
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
      continue

    try:
      st = os.stat(devpath)
    except EnvironmentError, err:
      logging.warning("Error stat()'ing device %s: %s", devpath, str(err))
      continue

    if stat.S_ISBLK(st.st_mode):
      result = utils.RunCmd(["blockdev", "--getsize64", devpath])
      if result.failed:
        # We don't want to fail, just do not list this device as available
        logging.warning("Cannot get size for block device %s", devpath)
        continue

      size = int(result.stdout) / (1024 * 1024)
      blockdevs[devpath] = size
  return blockdevs


789
def GetVolumeList(vg_names):
  """Compute list of logical volumes and their size.

  @type vg_names: list
  @param vg_names: the volume groups whose LVs we should list, or
      empty for all volume groups
  @rtype: dict
  @return:
      dictionary of all partitions (key) with value being a tuple of
      their size (in MiB, as printed by lvs), inactive and online
      status::

        {'xenvg/test1': ('20.06', True, True)}

      Virtual volumes are not listed, and output lines which cannot be
      parsed are logged and skipped.
  @raise RPCFail: if the lvs command fails

  """
  if not vg_names:
    vg_names = []

  lvs_cmd = ["lvs", "--noheadings", "--units=m", "--nosuffix",
             "--separator=|",
             "-ovg_name,lv_name,lv_size,lv_attr"]
  result = utils.RunCmd(lvs_cmd + vg_names)
  if result.failed:
    _Fail("Failed to list logical volumes, lvs output: %s", result.output)

  volumes = {}
  for raw_line in result.stdout.splitlines():
    line = raw_line.strip()
    match = _LVSLINE_REGEX.match(line)
    if not match:
      logging.error("Invalid line returned from lvs output: '%s'", line)
      continue

    (vg_name, lv_name, size, attr) = match.groups()
    if attr[0] == "v":
      # we don't want to report such volumes as existing, since they
      # don't really hold data
      continue

    inactive = attr[4] == "-"
    online = attr[5] == "o"
    volumes[vg_name + "/" + lv_name] = (size, inactive, online)

  return volumes
Iustin Pop's avatar
Iustin Pop committed
833
834
835


def ListVolumeGroups():
  """Return the volume groups of this node together with their size.

  This is a thin wrapper which delegates to L{utils.ListVolumeGroups}.

  @rtype: dict
  @return: dictionary keyed by volume group name, with the size of
      the respective volume group as value

  """
  vg_sizes = utils.ListVolumeGroups()
  return vg_sizes
Iustin Pop's avatar
Iustin Pop committed
844
845


846
847
848
def NodeVolumes():
  """Enumerate the logical volumes present on this node.

  @rtype: list
  @return:
    A list of dictionaries, each having four keys:
      - name: the logical volume name,
      - size: the size of the logical volume
      - dev: the physical device on which the LV lives
      - vg: the volume group to which it belongs

    A failing C{lvs} invocation is reported through L{_Fail}; output
    lines with too few fields are logged and skipped.

    Since a logical volume can be spread over several physical
    volumes, the same logical volume can show up more than once in
    the list (once per device).

  """
  lvs_cmd = ["lvs", "--noheadings", "--units=m", "--nosuffix",
             "--separator=|",
             "--options=lv_name,lv_size,devices,vg_name"]
  result = utils.RunCmd(lvs_cmd)
  if result.failed:
    _Fail("Failed to list logical volumes, lvs output: %s",
          result.output)

  volumes = []
  for line in result.stdout.splitlines():
    if line.count("|") < 3:
      logging.warning("Strange line in the output from lvs: '%s'", line)
      continue

    fields = [field.strip() for field in line.split("|")]
    lv_name, lv_size, devices, vg_name = fields[:4]
    # the devices column is a comma-separated list of "device(...)"
    # entries; only the part before the parenthesis is kept
    for dev_spec in devices.split(","):
      volumes.append({
        "name": lv_name,
        "size": lv_size,
        "dev": dev_spec.split("(")[0],
        "vg": vg_name,
        })

  return volumes
890
891


Iustin Pop's avatar
Iustin Pop committed
892
def BridgesExist(bridges_list):
  """Verify that all the given bridges exist on the current node.

  @type bridges_list: list
  @param bridges_list: names of the bridges to look for
  @rtype: None
  @return: nothing; if at least one bridge is missing, the names of
      all missing bridges are reported through L{_Fail}

  """
  absent = [name for name in bridges_list
            if not utils.BridgeExists(name)]

  if absent:
    _Fail("Missing bridges %s", utils.CommaJoin(absent))
906

Iustin Pop's avatar
Iustin Pop committed
907

908
def GetInstanceList(hypervisor_list):
Alexander Schreiber's avatar
Alexander Schreiber committed
909
  """Provides a list of instances.
Iustin Pop's avatar
Iustin Pop committed
910

911
912
913
914
915
  @type hypervisor_list: list
  @param hypervisor_list: the list of hypervisors to query information

  @rtype: list
  @return: a list of all running instances on the current node
Iustin Pop's avatar
Iustin Pop committed
916
917
    - instance1.example.com
    - instance2.example.com
Iustin Pop's avatar
Iustin Pop committed
918

919
  """
920
921
922
923
924
925
  results = []
  for hname in hypervisor_list:
    try:
      names = hypervisor.GetHypervisor(hname).ListInstances()
      results.extend(names)
    except errors.HypervisorError, err:
926
927
      _Fail("Error enumerating instances (hypervisor %s): %s",
            hname, err, exc=True)
Iustin Pop's avatar
Iustin Pop committed
928

929
  return results
Iustin Pop's avatar
Iustin Pop committed
930
931


932
def GetInstanceInfo(instance, hname):
  """Return runtime information about a single instance.

  @type instance: string
  @param instance: the instance name
  @type hname: string
  @param hname: the hypervisor type of the instance

  @rtype: dict
  @return: an empty dictionary if the hypervisor returns C{None} for
      the instance, otherwise a dictionary with the following keys:
      - memory: memory size of instance (int)
      - state: state of instance as reported by the hypervisor (string)
      - time: cpu time of instance (float)

  """
  hyper = hypervisor.GetHypervisor(hname)
  iinfo = hyper.GetInstanceInfo(instance)
  if iinfo is None:
    return {}

  # only three positions of the hypervisor's answer are exported
  return {
    "memory": iinfo[2],
    "state": iinfo[4],
    "time": iinfo[5],
    }
Iustin Pop's avatar
Iustin Pop committed
956
957


958
959
960
961
962
963
964
965
966
967
968
969
970
def GetInstanceMigratable(instance):
  """Check whether an instance is in a state which allows migration.

  The instance has to be listed by its hypervisor; if it is not, this
  is reported through L{_Fail}. A missing block device symlink for one
  of the instance's disks is only logged as a warning.

  @type instance: L{objects.Instance}
  @param instance: object representing the instance to be checked.

  @rtype: None
  @return: nothing; problems are signalled as described above

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  iname = instance.name
  running_instances = hyper.ListInstances()
  if iname not in running_instances:
    _Fail("Instance %s is not running", iname)

  # only the disk index is needed to compute the symlink path
  for idx, _ in enumerate(instance.disks):
    link_name = _GetBlockDevSymlinkPath(iname, idx)
    if os.path.islink(link_name):
      continue
    logging.warning("Instance %s is missing symlink %s for disk %d",
                    iname, link_name, idx)
980
981


982
def GetAllInstancesInfo(hypervisor_list):
  """Collect runtime data about all instances in one go.

  This does the same job as L{GetInstanceInfo}, but for every instance
  of the given hypervisors at once, which is faster when data about
  more than one instance is needed.

  @type hypervisor_list: list
  @param hypervisor_list: list of hypervisors to query for instance data

  @rtype: dict
  @return: dictionary of instance: data, with data having the following keys:
      - memory: memory size of instance (int)
      - state: state of instance as reported by the hypervisor (string)
      - time: cpu time of instance (float)
      - vcpus: the number of vcpus

      An instance reported more than once with a different memory size
      or vcpu count is signalled through L{_Fail}.

  """
  # state and time can legitimately change between the invocations of
  # the different hypervisors, so only these parameters are compared
  static_keys = ("memory", "vcpus")
  all_info = {}

  for hname in hypervisor_list:
    hyper = hypervisor.GetHypervisor(hname)
    iinfo = hyper.GetAllInstancesInfo()
    if not iinfo:
      continue

    for name, _, memory, vcpus, state, times in iinfo:
      value = {
        "memory": memory,
        "vcpus": vcpus,
        "state": state,
        "time": times,
        }
      if name in all_info:
        known = all_info[name]
        for key in static_keys:
          if value[key] != known[key]:
            _Fail("Instance %s is running twice"
                  " with different parameters", name)
      all_info[name] = value

  return all_info
Iustin Pop's avatar
Iustin Pop committed
1023
1024


1025
def _InstanceLogName(kind, os_name, instance, component):
  """Build the path of the OS log file for an instance operation.

  Both the instance name and the os name are taken as plain strings,
  because not every operation has them available as part of an
  instance object.

  @type kind: string
  @param kind: the operation type (e.g. add, import, etc.)
  @type os_name: string
  @param os_name: the os name
  @type instance: string
  @param instance: the name of the instance being imported/added/etc.
  @type component: string or None
  @param component: the name of the component of the instance being
      transferred; must not contain a slash
  @rtype: string
  @return: path of the log file below L{constants.LOG_OS_DIR}

  """
  # TODO: Use tempfile.mkstemp to create unique filename
  if not component:
    suffix = ""
  else:
    assert "/" not in component
    suffix = "-%s" % component

  timestamp = utils.TimestampForFilename()
  filename = ("%s-%s-%s%s-%s.log" %
              (kind, os_name, instance, suffix, timestamp))
  return utils.PathJoin(constants.LOG_OS_DIR, filename)


1053
def InstanceOsAdd(instance, reinstall, debug):
  """Run the OS create script for an instance.

  The output of the script is written to a log file; if the script
  fails, the error is logged and the last lines of that log file are
  reported through L{_Fail}.

  @type instance: L{objects.Instance}
  @param instance: Instance whose OS is to be installed
  @type reinstall: boolean
  @param reinstall: whether this is an instance reinstall
  @type debug: integer
  @param debug: debug level, passed to the OS scripts
  @rtype: None

  """
  inst_os = OSFromDisk(instance.os)

  create_env = OSEnvironment(instance, inst_os, debug)
  if reinstall:
    create_env["INSTANCE_REINSTALL"] = "1"

  logfile = _InstanceLogName("add", instance.os, instance.name, None)

  create_cmd = [inst_os.create_script]
  result = utils.RunCmd(create_cmd, env=create_env, cwd=inst_os.path,
                        output=logfile, reset_env=True)
  if not result.failed:
    return

  logging.error("os create command '%s' returned error: %s, logfile: %s,"
                " output: %s", result.cmd, result.fail_reason, logfile,
                result.output)
  log_tail = "\n".join([utils.SafeEncode(val)
                        for val in utils.TailFile(logfile, lines=20)])
  # the error was already logged above, hence log=False
  _Fail("OS create script failed (%s), last lines in the"
        " log file:\n%s", result.fail_reason, log_tail, log=False)
1083
1084


1085
def RunRenameInstance(instance, old_name, debug):
  """Run the OS rename script for an instance.

  The output of the script is written to a log file; if the script
  fails, the error is logged and the last lines of that log file are
  reported through L{_Fail}.

  @type instance: L{objects.Instance}
  @param instance: Instance whose OS is to be renamed
  @type old_name: string
  @param old_name: previous instance name
  @type debug: integer
  @param debug: debug level, passed to the OS scripts
  @rtype: None

  """
  inst_os = OSFromDisk(instance.os)

  rename_env = OSEnvironment(instance, inst_os, debug)
  rename_env["OLD_INSTANCE_NAME"] = old_name

  # both names are part of the log file name
  logfile = _InstanceLogName("rename", instance.os,
                             "%s-%s" % (old_name, instance.name), None)

  result = utils.RunCmd([inst_os.rename_script], env=rename_env,
                        cwd=inst_os.path, output=logfile, reset_env=True)

  if result.failed:
    # name the script that actually failed ("rename", not "create") and
    # include the log file, as InstanceOsAdd does
    logging.error("os rename command '%s' returned error: %s, logfile: %s,"
                  " output: %s", result.cmd, result.fail_reason, logfile,
                  result.output)
    lines = [utils.SafeEncode(val)
             for val in utils.TailFile(logfile, lines=20)]
    # the error was already logged above, hence log=False
    _Fail("OS rename script failed (%s), last lines in the"
          " log file:\n%s", result.fail_reason, "\n".join(lines), log=False)
Iustin Pop's avatar
Iustin Pop committed
1116
1117


1118
def _GetBlockDevSymlinkPath(instance_name, idx):
Iustin Pop's avatar