backend.py 78 KB
Newer Older
Iustin Pop's avatar
Iustin Pop committed
1
#
Iustin Pop's avatar
Iustin Pop committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#

# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions used by the node daemon"""


import os
import os.path
import shutil
import time
import stat
import errno
import re
import subprocess
33
import random
34
import logging
35
import tempfile
36
37
import zlib
import base64
Iustin Pop's avatar
Iustin Pop committed
38
39
40
41
42
43
44
45

from ganeti import errors
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti import constants
from ganeti import bdev
from ganeti import objects
46
from ganeti import ssconf
Iustin Pop's avatar
Iustin Pop committed
47
48


Michael Hanselmann's avatar
Michael Hanselmann committed
49
def _GetConfig():
  """Simple wrapper to return a SimpleStore.

  @rtype: L{ssconf.SimpleStore}
  @return: a newly constructed SimpleStore instance

  """
  return ssconf.SimpleStore()
Michael Hanselmann's avatar
Michael Hanselmann committed
57
58


59
def _GetSshRunner(cluster_name):
  """Simple wrapper to return an SshRunner.

  @type cluster_name: str
  @param cluster_name: the cluster name, which is needed
      by the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: an SshRunner instance

  """
  return ssh.SshRunner(cluster_name)
70
71


72
73
74
75
76
77
78
79
80
def _Decompress(data):
  """Unpacks data compressed by the RPC client.

  @type data: list or tuple
  @param data: Data sent by RPC client
  @rtype: str
  @return: Decompressed data

  """
81
  assert isinstance(data, (list, tuple))
82
83
84
85
86
87
88
89
90
91
  assert len(data) == 2
  (encoding, content) = data
  if encoding == constants.RPC_ENCODING_NONE:
    return content
  elif encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    return zlib.decompress(base64.b64decode(content))
  else:
    raise AssertionError("Unknown data encoding")


92
def _CleanDirectory(path, exclude=None):
93
94
  """Removes all regular files in a directory.

Iustin Pop's avatar
Iustin Pop committed
95
96
  @type path: str
  @param path: the directory to clean
97
  @type exclude: list
Iustin Pop's avatar
Iustin Pop committed
98
99
  @param exclude: list of files to be excluded, defaults
      to the empty list
100
101

  """
102
103
  if not os.path.isdir(path):
    return
104
105
106
107
108
  if exclude is None:
    exclude = []
  else:
    # Normalize excluded paths
    exclude = [os.path.normpath(i) for i in exclude]
109

110
  for rel_name in utils.ListVisibleFiles(path):
111
112
113
    full_name = os.path.normpath(os.path.join(path, rel_name))
    if full_name in exclude:
      continue
114
115
116
117
    if os.path.isfile(full_name) and not os.path.islink(full_name):
      utils.RemoveFile(full_name)


118
def JobQueuePurge():
  """Removes job queue files and archived jobs.

  The job queue lock file is kept; everything else that is a regular
  file in the queue directory and in the archive directory is removed.

  @rtype: None

  """
  _CleanDirectory(constants.QUEUE_DIR, exclude=[constants.JOB_QUEUE_LOCK_FILE])
  _CleanDirectory(constants.JOB_QUEUE_ARCHIVE_DIR)


128
129
130
131
132
133
134
def GetMasterInfo():
  """Returns master information.

  This is an utility function to compute master information, either
  for consumption here or from the node daemon.

  @rtype: tuple
  @return: (master_netdev, master_ip, master_name) if we have a good
      configuration, otherwise (None, None, None)

  """
  try:
    cfg = _GetConfig()
    master_netdev = cfg.GetMasterNetdev()
    master_ip = cfg.GetMasterIP()
    master_node = cfg.GetMasterNode()
  except errors.ConfigurationError:
    # logging.exception already records the active exception
    logging.exception("Cluster configuration incomplete")
    return (None, None, None)
  return (master_netdev, master_ip, master_node)
148
149


150
def StartMaster(start_daemons):
  """Activate local node as master node.

  The function will always try activate the IP address of the master
  (unless someone else has it). It will also start the master daemons,
  based on the start_daemons parameter.

  @type start_daemons: boolean
  @param start_daemons: whether to also start the master
      daemons (ganeti-masterd and ganeti-rapi)
  @rtype: boolean
  @return: C{False} if the master information is not available, if
      another host answers on the master IP, if the IP could not be
      added or if a daemon failed to start; C{True} otherwise

  """
  ok = True
  master_netdev, master_ip, _ = GetMasterInfo()
  if not master_netdev:
    return False

  if utils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
    if utils.OwnIpAddress(master_ip):
      # we already have the ip:
      logging.debug("Already started")
    else:
      logging.error("Someone else has the master ip, not activating")
      ok = False
  else:
    result = utils.RunCmd(["ip", "address", "add", "%s/32" % master_ip,
                           "dev", master_netdev, "label",
                           "%s:0" % master_netdev])
    if result.failed:
      logging.error("Can't activate master IP: %s", result.output)
      ok = False

    # announce the address; the exit code of arping is deliberately
    # ignored. The count is passed as its own argv element instead of
    # relying on the option parser to accept "-c 3" as one word.
    utils.RunCmd(["arping", "-q", "-U", "-c", "3", "-I", master_netdev,
                  "-s", master_ip, master_ip])

  # and now start the master and rapi daemons
  if start_daemons:
    for daemon in 'ganeti-masterd', 'ganeti-rapi':
      result = utils.RunCmd([daemon])
      if result.failed:
        logging.error("Can't start daemon %s: %s", daemon, result.output)
        ok = False
  return ok
Iustin Pop's avatar
Iustin Pop committed
195
196


197
def StopMaster(stop_daemons):
  """Deactivate this node as master.

  The function will always try to deactivate the IP address of the
  master. It will also stop the master daemons depending on the
  stop_daemons parameter.

  @type stop_daemons: boolean
  @param stop_daemons: whether to also stop the master daemons
      (ganeti-masterd and ganeti-rapi)
  @rtype: boolean
  @return: C{False} if the master information is not available,
      C{True} otherwise (a failure to remove the IP is only logged)

  """
  master_netdev, master_ip, _ = GetMasterInfo()
  if not master_netdev:
    return False

  result = utils.RunCmd(["ip", "address", "del", "%s/32" % master_ip,
                         "dev", master_netdev])
  if result.failed:
    logging.error("Can't remove the master IP, error: %s", result.output)
    # but otherwise ignore the failure

  if stop_daemons:
    # stop/kill the rapi and the master daemon
    for daemon in constants.RAPI_PID, constants.MASTERD_PID:
      utils.KillProcess(utils.ReadPidFile(utils.DaemonPidFileName(daemon)))

  return True


Iustin Pop's avatar
Iustin Pop committed
228
def AddNode(dsa, dsapub, rsa, rsapub, sshkey, sshpub):
  """Joins this node to the cluster.

  This does the following:
      - updates the hostkeys of the machine (rsa and dsa)
      - adds the ssh private key to the user
      - adds the ssh public key to the users' authorized_keys file
      - restarts the ssh daemon via its init script

  @type dsa: str
  @param dsa: the DSA private key to write
  @type dsapub: str
  @param dsapub: the DSA public key to write
  @type rsa: str
  @param rsa: the RSA private key to write
  @type rsapub: str
  @param rsapub: the RSA public key to write
  @type sshkey: str
  @param sshkey: the SSH private key to write
  @type sshpub: str
  @param sshpub: the SSH public key to write
  @rtype: boolean
  @return: the success of the operation; C{False} if the user's ssh
      files could not be determined (the host keys have already been
      written at that point)

  """
  # owner read/write only (octal 600)
  priv_mode = stat.S_IRUSR | stat.S_IWUSR
  # additionally readable by group and others (octal 644)
  pub_mode = priv_mode | stat.S_IRGRP | stat.S_IROTH

  sshd_keys = [(constants.SSH_HOST_RSA_PRIV, rsa, priv_mode),
               (constants.SSH_HOST_RSA_PUB, rsapub, pub_mode),
               (constants.SSH_HOST_DSA_PRIV, dsa, priv_mode),
               (constants.SSH_HOST_DSA_PUB, dsapub, pub_mode)]
  for name, content, mode in sshd_keys:
    utils.WriteFile(name, data=content, mode=mode)

  try:
    priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS,
                                                    mkdir=True)
  except errors.OpExecError:
    logging.exception("Error while processing user ssh files")
    return False

  # both user key files are written owner-only, as in the host private keys
  for name, content in [(priv_key, sshkey), (pub_key, sshpub)]:
    utils.WriteFile(name, data=content, mode=priv_mode)

  utils.AddAuthorizedKey(auth_keys, sshpub)

  utils.RunCmd([constants.SSH_INITD_SCRIPT, "restart"])

  return True


def LeaveCluster():
  """Cleans up and remove the current node.

  This function cleans up and prepares the current node to be removed
  from the cluster.

  If processing is successful, then it raises an
  L{errors.QuitGanetiException} which is used as a special case to
  shutdown the node daemon.

  If the user's ssh files cannot be determined, the error is logged
  and the function returns normally (without raising), after the data
  directory and the job queue have already been cleaned.

  @raise errors.QuitGanetiException: on successful cleanup

  """
  _CleanDirectory(constants.DATA_DIR)
  JobQueuePurge()

  try:
    priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)
  except errors.OpExecError:
    logging.exception("Error while processing ssh files")
    return

  f = open(pub_key, 'r')
  try:
    # only the first 8192 bytes of the public key file are considered
    utils.RemoveAuthorizedKey(auth_keys, f.read(8192))
  finally:
    f.close()

  utils.RemoveFile(priv_key)
  utils.RemoveFile(pub_key)

  # Return a reassuring string to the caller, and quit
  raise errors.QuitGanetiException(False, 'Shutdown scheduled')

Iustin Pop's avatar
Iustin Pop committed
308

309
def GetNodeInfo(vgname, hypervisor_type):
  """Gives back a hash with different informations about the node.

  @type vgname: C{string}
  @param vgname: the name of the volume group to ask for disk space information
  @type hypervisor_type: C{str}
  @param hypervisor_type: the name of the hypervisor to ask for
      memory information
  @rtype: C{dict}
  @return: dictionary with the following keys:
      - vg_size is the size of the configured volume group in MiB
      - vg_free is the free size of the volume group in MiB
      - memory_dom0 is the memory allocated for domain0 in MiB
      - memory_free is the currently available (free) ram in MiB
      - memory_total is the total number of ram in MiB
      - bootid is the content of the kernel's boot_id file

    The memory_* keys are whatever the hypervisor's GetNodeInfo
    reports; they are absent if it returns None.

  """
  outputarray = {}
  vginfo = _GetVGInfo(vgname)
  outputarray['vg_size'] = vginfo['vg_size']
  outputarray['vg_free'] = vginfo['vg_free']

  hyper = hypervisor.GetHypervisor(hypervisor_type)
  hyp_info = hyper.GetNodeInfo()
  if hyp_info is not None:
    outputarray.update(hyp_info)

  f = open("/proc/sys/kernel/random/boot_id", 'r')
  try:
    outputarray["bootid"] = f.read(128).rstrip("\n")
  finally:
    f.close()

  return outputarray


345
def VerifyNode(what, cluster_name):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).

  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
      - filelist: list of files for which to compute checksums
      - nodelist: list of nodes we should check ssh communication with
      - node-net-test: list of nodes we should check node daemon port
        connectivity with
      - hypervisor: list with hypervisors to run the verify for
  @type cluster_name: str
  @param cluster_name: the cluster name, passed to the ssh runner
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks

  """
  result = {}

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = tmp = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      tmp[hv_name] = hypervisor.GetHypervisor(hv_name).Verify()

  if constants.NV_FILELIST in what:
    result[constants.NV_FILELIST] = utils.FingerprintFiles(
      what[constants.NV_FILELIST])

  if constants.NV_NODELIST in what:
    result[constants.NV_NODELIST] = tmp = {}
    # shuffle a private copy so that the caller's list is not reordered
    nodes = list(what[constants.NV_NODELIST])
    random.shuffle(nodes)
    for node in nodes:
      success, message = _GetSshRunner(cluster_name).VerifyNodeHostname(node)
      if not success:
        # only failures are reported
        tmp[node] = message

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
    my_name = utils.HostInfo().name
    my_pip = my_sip = None
    # find our own addresses, to be used as source of the pings
    for name, pip, sip in what[constants.NV_NODENETTEST]:
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
    else:
      port = utils.GetNodeDaemonPort()
      for name, pip, sip in what[constants.NV_NODENETTEST]:
        fail = []
        if not utils.TcpPing(pip, port, source=my_pip):
          fail.append("primary")
        # the secondary is only tested when it differs from the primary
        if sip != pip:
          if not utils.TcpPing(sip, port, source=my_sip):
            fail.append("secondary")
        if fail:
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

  if constants.NV_LVLIST in what:
    result[constants.NV_LVLIST] = GetVolumeList(what[constants.NV_LVLIST])

  if constants.NV_INSTANCELIST in what:
    result[constants.NV_INSTANCELIST] = GetInstanceList(
      what[constants.NV_INSTANCELIST])

  if constants.NV_VGLIST in what:
    result[constants.NV_VGLIST] = ListVolumeGroups()

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = constants.PROTOCOL_VERSION

  if constants.NV_HVINFO in what:
    hyper = hypervisor.GetHypervisor(what[constants.NV_HVINFO])
    result[constants.NV_HVINFO] = hyper.GetNodeInfo()

  if constants.NV_DRBDLIST in what:
    try:
      used_minors = bdev.DRBD8.GetUsedDevs().keys()
    except errors.BlockDeviceError:
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = []
    result[constants.NV_DRBDLIST] = used_minors

  return result


def GetVolumeList(vg_name):
  """Compute list of logical volumes and their size.

  @type vg_name: str
  @param vg_name: the volume group whose LVs we should list
  @rtype: dict
  @return:
      dictionary of all partitions (key) with value being a tuple of
      their size (in MiB), inactive and online status::

        {'test1': ('20.06', True, True)}

      in case of errors, a string is returned with the error
      details.

  """
  lvs = {}
  sep = '|'
  result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
                         "--separator=%s" % sep,
                         "-olv_name,lv_size,lv_attr", vg_name])
  if result.failed:
    logging.error("Failed to list logical volumes, lvs output: %s",
                  result.output)
    return result.output

  # NOTE: the pattern hard-codes the same '|' separator passed to lvs
  # above; it expects name, numeric size and a six-character attribute
  # field. Raw string, so that the backslashes reach the regex engine
  # unchanged.
  valid_line_re = re.compile(r"^ *([^|]+)\|([0-9.]+)\|([^|]{6})\|?$")
  for line in result.stdout.splitlines():
    line = line.strip()
    match = valid_line_re.match(line)
    if not match:
      logging.error("Invalid line returned from lvs output: '%s'", line)
      continue
    name, size, attr = match.groups()
    # fifth and sixth attribute characters give the two status flags
    inactive = attr[4] == '-'
    online = attr[5] == 'o'
    lvs[name] = (size, inactive, online)

  return lvs
Iustin Pop's avatar
Iustin Pop committed
485
486
487


def ListVolumeGroups():
  """List the volume groups and their size.

  This is a thin wrapper over L{utils.ListVolumeGroups}.

  @rtype: dict
  @return: dictionary with keys volume name and values the
      size of the volume

  """
  return utils.ListVolumeGroups()


498
499
500
def NodeVolumes():
  """List all volumes on this node.

  @rtype: list
  @return:
    A list of dictionaries, each having four keys:
      - name: the logical volume name,
      - size: the size of the logical volume
      - dev: the physical device on which the LV lives
      - vg: the volume group to which it belongs

    In case of errors, we return an empty list and log the
    error.

    Note that since a logical volume can live on multiple physical
    volumes, the resulting list might include a logical volume
    multiple times.

  """
  result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
                         "--separator=|",
                         "--options=lv_name,lv_size,devices,vg_name"])
  if result.failed:
    logging.error("Failed to list logical volumes, lvs output: %s",
                  result.output)
    return []

  def parse_dev(dev):
    # keep only what precedes the first '('; a value without '(' is
    # returned unchanged
    return dev.split('(')[0]

  def map_line(fields):
    return {
      'name': fields[0].strip(),
      'size': fields[1].strip(),
      'dev': parse_dev(fields[2].strip()),
      'vg': fields[3].strip(),
    }

  # lines with fewer than four fields are silently skipped
  return [map_line(line.split('|')) for line in result.stdout.splitlines()
          if line.count('|') >= 3]
541
542


Iustin Pop's avatar
Iustin Pop committed
543
def BridgesExist(bridges_list):
  """Check if a list of bridges exist on the current node.

  @type bridges_list: list
  @param bridges_list: the bridge names to check
  @rtype: boolean
  @return: C{True} if all of them exist (including the case of an
      empty list), C{False} otherwise

  """
  for bridge in bridges_list:
    # stop at the first missing bridge
    if not utils.BridgeExists(bridge):
      return False

  return True


557
def GetInstanceList(hypervisor_list):
  """Provides a list of instances.

  @type hypervisor_list: list
  @param hypervisor_list: the list of hypervisors to query information

  @rtype: list
  @return: a list of all running instances on the current node
    - instance1.example.com
    - instance2.example.com
  @raise errors.HypervisorError: re-raised (after logging) if any of
      the hypervisors fails to list its instances

  """
  results = []
  for hname in hypervisor_list:
    try:
      names = hypervisor.GetHypervisor(hname).ListInstances()
      results.extend(names)
    except errors.HypervisorError:
      logging.exception("Error enumerating instances for hypevisor %s", hname)
      # FIXME: should we somehow not propagate this to the master?
      raise

  return results
Iustin Pop's avatar
Iustin Pop committed
580
581


582
def GetInstanceInfo(instance, hname):
  """Gives back the informations about an instance as a dictionary.

  @type instance: string
  @param instance: the instance name
  @type hname: string
  @param hname: the hypervisor type of the instance

  @rtype: dict
  @return: dictionary with the following keys:
      - memory: memory size of instance (int)
      - state: xen state of instance (string)
      - time: cpu time of instance (float)

    The dictionary is empty if the hypervisor returns no information
    for the instance.

  """
  output = {}

  iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance)
  if iinfo is not None:
    # the hypervisor returns a sequence; only three of its fields are
    # exported
    output['memory'] = iinfo[2]
    output['state'] = iinfo[4]
    output['time'] = iinfo[5]

  return output


608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
def GetInstanceMigratable(instance):
  """Tells whether an instance can be migrated.

  The instance must be listed as running by its hypervisor, and each
  of its disks must have its block device symlink in place.

  @type instance: L{objects.Instance}
  @param instance: object representing the instance to be checked.

  @rtype: tuple
  @return: tuple of (result, description) where:
      - result: whether the instance can be migrated or not
      - description: a description of the issue, if relevant

  """
  running = hypervisor.GetHypervisor(instance.hypervisor).ListInstances()
  if instance.name not in running:
    return (False, 'not running')

  for disk_idx, _ in enumerate(instance.disks):
    symlink = _GetBlockDevSymlinkPath(instance.name, disk_idx)
    if os.path.islink(symlink):
      continue
    return (False, 'not restarted since ganeti 1.2.5')

  return (True, '')


632
def GetAllInstancesInfo(hypervisor_list):
  """Gather data about all instances.

  This is the equivalent of L{GetInstanceInfo}, except that it
  computes data for all instances at once, thus being faster if one
  needs data about more than one instance.

  @type hypervisor_list: list
  @param hypervisor_list: list of hypervisors to query for instance data

  @rtype: dict
  @return: dictionary of instance: data, with data having the following keys:
      - memory: memory size of instance (int)
      - state: xen state of instance (string)
      - time: cpu time of instance (float)
      - vcpus: the number of vcpus
  @raise errors.HypervisorError: if the same instance name is reported
      more than once with different data

  """
  output = {}

  for hname in hypervisor_list:
    iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo()
    if iinfo:
      # the instance id (second field) is not exported
      for name, _, memory, vcpus, state, times in iinfo:
        value = {
          'memory': memory,
          'vcpus': vcpus,
          'state': state,
          'time': times,
          }
        # an identical duplicate is tolerated, a differing one is not
        if name in output and output[name] != value:
          raise errors.HypervisorError("Instance %s running duplicate"
                                       " with different parameters" % name)
        output[name] = value

  return output


670
def AddOSToInstance(instance):
  """Add an OS to an instance.

  The OS create script is run with its output redirected to a log
  file under the OS log directory.

  @type instance: L{objects.Instance}
  @param instance: Instance whose OS is to be installed
  @rtype: tuple
  @return: tuple of (success, message); on failure the message
      contains the failure reason and the last 20 lines of the log file

  """
  inst_os = OSFromDisk(instance.os)

  create_env = OSEnvironment(instance)

  logfile = "%s/add-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
                                     instance.name, int(time.time()))

  result = utils.RunCmd([inst_os.create_script], env=create_env,
                        cwd=inst_os.path, output=logfile)
  if result.failed:
    logging.error("os create command '%s' returned error: %s, logfile: %s,"
                  " output: %s", result.cmd, result.fail_reason, logfile,
                  result.output)
    # escape the log lines so they can be embedded in the message
    lines = [val.encode("string_escape")
             for val in utils.TailFile(logfile, lines=20)]
    return (False, "OS create script failed (%s), last lines in the"
            " log file:\n%s" % (result.fail_reason, "\n".join(lines)))

  return (True, "Successfully installed")
698
699


700
def RunRenameInstance(instance, old_name):
  """Run the OS rename script for an instance.

  The script is run with C{OLD_INSTANCE_NAME} added to the OS
  environment and its output redirected to a log file under the OS
  log directory.

  @type instance: L{objects.Instance}
  @param instance: Instance whose OS rename script is to be run
  @type old_name: string
  @param old_name: previous instance name
  @rtype: tuple
  @return: tuple of (success, message); on failure the message
      contains the failure reason and the last 20 lines of the log file

  """
  inst_os = OSFromDisk(instance.os)

  rename_env = OSEnvironment(instance)
  rename_env['OLD_INSTANCE_NAME'] = old_name

  logfile = "%s/rename-%s-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
                                           old_name,
                                           instance.name, int(time.time()))

  result = utils.RunCmd([inst_os.rename_script], env=rename_env,
                        cwd=inst_os.path, output=logfile)

  if result.failed:
    # this is the rename script, not the create one
    logging.error("os rename command '%s' returned error: %s output: %s",
                  result.cmd, result.fail_reason, result.output)
    # escape the log lines so they can be embedded in the message
    lines = [val.encode("string_escape")
             for val in utils.TailFile(logfile, lines=20)]
    return (False, "OS rename script failed (%s), last lines in the"
            " log file:\n%s" % (result.fail_reason, "\n".join(lines)))

  return (True, "Rename successful")
Iustin Pop's avatar
Iustin Pop committed
732
733
734
735
736


def _GetVGInfo(vg_name):
  """Get informations about the volume group.

  @type vg_name: str
  @param vg_name: the volume group which we query
  @rtype: dict
  @return:
    A dictionary with the following keys:
      - C{vg_size} is the total size of the volume group in MiB
      - C{vg_free} is the free size of the volume group in MiB
      - C{pv_count} are the number of physical disks in that VG

    If an error occurs during gathering of data, we return the same dict
    with keys all set to None.

  """
  retdic = dict.fromkeys(["vg_size", "vg_free", "pv_count"])

  retval = utils.RunCmd(["vgs", "-ovg_size,vg_free,pv_count", "--noheadings",
                         "--nosuffix", "--units=m", "--separator=:", vg_name])

  if retval.failed:
    logging.error("volume group %s not present", vg_name)
    return retdic
  # a trailing separator, if any, is dropped before splitting
  valarr = retval.stdout.strip().rstrip(':').split(':')
  if len(valarr) == 3:
    try:
      # the dict is only replaced if all three fields parse
      retdic = {
        "vg_size": int(round(float(valarr[0]), 0)),
        "vg_free": int(round(float(valarr[1]), 0)),
        "pv_count": int(valarr[2]),
        }
    except ValueError:
      logging.exception("Fail to parse vgs output")
  else:
    logging.error("vgs output has the wrong number of fields (expected"
                  " three): %s", str(valarr))
  return retdic


774
775
776
777
778
779
def _GetBlockDevSymlinkPath(instance_name, idx):
  """Compute the path of the symlink for one disk of an instance.

  @param instance_name: the name of the instance
  @param idx: the disk index (formatted as an integer)
  @return: the path C{<instance_name>:<idx>} under the disk links
      directory

  """
  link_basename = "%s:%d" % (instance_name, idx)
  return os.path.join(constants.DISK_LINKS_DIR, link_basename)


def _SymlinkBlockDev(instance_name, device_path, idx):
780
781
782
783
784
785
  """Set up symlinks to a instance's block device.

  This is an auxiliary function run when an instance is start (on the primary
  node) or when an instance is migrated (on the target node).


786
787
788
789
  @param instance_name: the name of the target instance
  @param device_path: path of the physical block device, on the node
  @param idx: the disk index
  @return: absolute path to the disk's symlink
790
791

  """
792
  link_name = _GetBlockDevSymlinkPath(instance_name, idx)
793
794
  try:
    os.symlink(device_path, link_name)
795
796
  except OSError, err:
    if err.errno == errno.EEXIST:
797
798
799
800
801
802
803
804
805
806
      if (not os.path.islink(link_name) or
          os.readlink(link_name) != device_path):
        os.remove(link_name)
        os.symlink(device_path, link_name)
    else:
      raise

  return link_name


807
def _RemoveBlockDevLinks(instance_name, disks):
  """Delete the per-disk symlinks of the given instance.

  Only the position of each disk in C{disks} is used, to compute the
  symlink path. Paths that are not symlinks are left alone, and a
  failure to remove a link is logged but does not abort the loop.

  @param instance_name: the name of the instance owning the disks
  @param disks: the instance's disks; one symlink per index is removed

  """
  for idx, _unused_disk in enumerate(disks):
    symlink_path = _GetBlockDevSymlinkPath(instance_name, idx)
    if not os.path.islink(symlink_path):
      continue
    try:
      os.remove(symlink_path)
    except OSError:
      logging.exception("Can't remove symlink '%s'", symlink_path)
Iustin Pop's avatar
Iustin Pop committed
818
819


820
def _GatherAndLinkBlockDevs(instance):
Iustin Pop's avatar
Iustin Pop committed
821
822
823
824
825
  """Set up an instance's block device(s).

  This is run on the primary node at instance startup. The block
  devices must be already assembled.

Iustin Pop's avatar
Iustin Pop committed
826
827
  @type instance: L{objects.Instance}
  @param instance: the instance whose disks we shoul assemble
828
829
  @rtype: list
  @return: list of (disk_object, device_path)
Iustin Pop's avatar
Iustin Pop committed
830

Iustin Pop's avatar
Iustin Pop committed
831
832
  """
  block_devices = []
833
  for idx, disk in enumerate(instance.disks):
Iustin Pop's avatar
Iustin Pop committed
834
835
836
837
838
    device = _RecursiveFindBD(disk)
    if device is None:
      raise errors.BlockDeviceError("Block device '%s' is not set up." %
                                    str(disk))
    device.Open()
839
    try:
840
      link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
841
842
843
844
845
846
    except OSError, e:
      raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
                                    e.strerror)

    block_devices.append((disk, link_name))

Iustin Pop's avatar
Iustin Pop committed
847
848
849
850
851
852
  return block_devices


def StartInstance(instance, extra_args):
  """Start an instance.

Iustin Pop's avatar
Iustin Pop committed
853
  @type instance: L{objects.Instance}
854
855
856
  @param instance: the instance object
  @rtype: boolean
  @return: whether the startup was successful or not
Iustin Pop's avatar
Iustin Pop committed
857

858
  """
859
  running_instances = GetInstanceList([instance.hypervisor])
Iustin Pop's avatar
Iustin Pop committed
860
861

  if instance.name in running_instances:
862
    return (True, "Already running")
Iustin Pop's avatar
Iustin Pop committed
863
864

  try:
865
866
    block_devices = _GatherAndLinkBlockDevs(instance)
    hyper = hypervisor.GetHypervisor(instance.hypervisor)
Iustin Pop's avatar
Iustin Pop committed
867
    hyper.StartInstance(instance, block_devices, extra_args)
868
869
  except errors.BlockDeviceError, err:
    logging.exception("Failed to start instance")
870
    return (False, "Block device error: %s" % str(err))
Iustin Pop's avatar
Iustin Pop committed
871
  except errors.HypervisorError, err:
872
    logging.exception("Failed to start instance")
873
    _RemoveBlockDevLinks(instance.name, instance.disks)
874
    return (False, "Hypervisor error: %s" % str(err))
Iustin Pop's avatar
Iustin Pop committed
875

876
  return (True, "Instance started successfully")
Iustin Pop's avatar
Iustin Pop committed
877
878
879
880
881


def ShutdownInstance(instance):
  """Shut an instance down.

Iustin Pop's avatar
Iustin Pop committed
882
883
884
  @note: this functions uses polling with a hardcoded timeout.

  @type instance: L{objects.Instance}
885
886
887
  @param instance: the instance object
  @rtype: boolean
  @return: whether the startup was successful or not
Iustin Pop's avatar
Iustin Pop committed
888

889
  """
890
891
  hv_name = instance.hypervisor
  running_instances = GetInstanceList([hv_name])
Iustin Pop's avatar
Iustin Pop committed
892
893
894
895

  if instance.name not in running_instances:
    return True

896
  hyper = hypervisor.GetHypervisor(hv_name)
Iustin Pop's avatar
Iustin Pop committed
897
898
899
  try:
    hyper.StopInstance(instance)
  except errors.HypervisorError, err:
900
    logging.error("Failed to stop instance: %s" % err)
Iustin Pop's avatar
Iustin Pop committed
901
902
903
904
905
906
    return False

  # test every 10secs for 2min

  time.sleep(1)
  for dummy in range(11):
907
    if instance.name not in GetInstanceList([hv_name]):
Iustin Pop's avatar
Iustin Pop committed
908
909
910
911
      break
    time.sleep(10)
  else:
    # the shutdown did not succeed
912
913
    logging.error("Shutdown of '%s' unsuccessful, using destroy",
                  instance.name)
Iustin Pop's avatar
Iustin Pop committed
914
915
916
917

    try:
      hyper.StopInstance(instance, force=True)
    except errors.HypervisorError, err:
918
      logging.exception("Failed to stop instance: %s" % err)
Iustin Pop's avatar
Iustin Pop committed
919
920
921
      return False

    time.sleep(1)
922
    if instance.name in GetInstanceList([hv_name]):
923
      logging.error("Could not shutdown instance '%s' even by destroy",
924
                    instance.name)
Iustin Pop's avatar
Iustin Pop committed
925
926
      return False

927
  _RemoveBlockDevLinks(instance.name, instance.disks)
Iustin Pop's avatar
Iustin Pop committed
928

Iustin Pop's avatar
Iustin Pop committed
929
930
931
  return True


932
933
934
def RebootInstance(instance, reboot_type, extra_args):
  """Reboot an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance object to reboot
  @type reboot_type: str
  @param reboot_type: the type of reboot, one the following
    constants:
      - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
        instance OS, do not recreate the VM
      - L{constants.INSTANCE_REBOOT_HARD}: tear down and
        restart the VM (at the hypervisor level)
      - any other reboot type is not accepted here
  @param extra_args: passed to L{StartInstance} for a hard reboot
  @rtype: boolean
  @return: the success of the operation; False is also returned when
      the instance is not running
  @raise errors.ParameterError: if the reboot type is not one of the
      two accepted values (only checked for a running instance)

  """
  running_instances = GetInstanceList([instance.hypervisor])

  if instance.name not in running_instances:
    logging.error("Cannot reboot instance that is not running")
    return False

  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  if reboot_type == constants.INSTANCE_REBOOT_SOFT:
    try:
      hyper.RebootInstance(instance)
    except errors.HypervisorError:
      logging.exception("Failed to soft reboot instance")
      return False
  elif reboot_type == constants.INSTANCE_REBOOT_HARD:
    try:
      # ShutdownInstance and StartInstance signal failure through their
      # return value (they handle hypervisor errors themselves), so the
      # results must be checked; otherwise a failed reboot reports success
      if not ShutdownInstance(instance):
        logging.error("Failed to hard reboot instance: shutdown failed")
        return False
      start_result = StartInstance(instance, extra_args)
      if not start_result[0]:
        logging.error("Failed to hard reboot instance: %s", start_result[1])
        return False
    except errors.HypervisorError:
      logging.exception("Failed to hard reboot instance")
      return False
  else:
    raise errors.ParameterError("reboot_type invalid")

  return True


977
978
979
980
981
982
983
def MigrationInfo(instance):
  """Gather information about an instance to be migrated.

  @type instance: L{objects.Instance}
  @param instance: the instance definition

  """
984
985
986
987
988
989
990
991
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    info = hyper.MigrationInfo(instance)
  except errors.HypervisorError, err:
    msg = "Failed to fetch migration information"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
  return (True, info)
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004


def AcceptInstance(instance, info, target):
  """Prepare the node to accept an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type target: string
  @param target: target host (usually ip), on this node

  """
1005
1006
1007
1008
1009
1010
1011
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.AcceptInstance(instance, info, target)
  except errors.HypervisorError, err:
    msg = "Failed to accept instance"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
  return (True, "Accept successfull")


def FinalizeMigration(instance, info, success):
  """Finalize any preparation to accept an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type success: boolean
  @param success: whether the migration was a success or a failure

  """
1026
1027
1028
1029
1030
1031
1032
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.FinalizeMigration(instance, info, success)
  except errors.HypervisorError, err:
    msg = "Failed to finalize migration"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
1033
1034
1035
  return (True, "Migration Finalized")


1036
1037
1038
def MigrateInstance(instance, target, live):
  """Migrates an instance to another node.

Iustin Pop's avatar
Iustin Pop committed
1039
  @type instance: L{objects.Instance}
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
  @param instance: the instance definition
  @type target: string
  @param target: the target node name
  @type live: boolean
  @param live: whether the migration should be done live or not (the
      interpretation of this parameter is left to the hypervisor)
  @rtype: tuple
  @return: a tuple of (success, msg) where:
      - succes is a boolean denoting the success/failure of the operation
      - msg is a string with details in case of failure

1051
  """
1052
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
1053
1054

  try:
1055
    hyper.MigrateInstance(instance.name, target, live)
1056
  except errors.HypervisorError, err:
1057
1058
1059
    msg = "Failed to migrate instance"
    logging.exception(msg)
    return (False, "%s: %s" % (msg, err))
1060
1061
1062
  return (True, "Migration successfull")


1063
def CreateBlockDevice(disk, size, owner, on_primary, info):
Iustin Pop's avatar
Iustin Pop committed
1064
1065
  """Creates a block device for an instance.

Iustin Pop's avatar
Iustin Pop committed
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
  @type disk: L{objects.Disk}
  @param disk: the object describing the disk we should create
  @type size: int
  @param size: the size of the physical underlying device, in MiB
  @type owner: str
  @param owner: the name of the instance for which disk is created,
      used for device cache data
  @type on_primary: boolean
  @param on_primary:  indicates if it is the primary node or not
  @type info: string
  @param info: string that will be sent to the physical device
      creation, used for example to set (LVM) tags on LVs

  @return: the new unique_id of the device (this can sometime be
      computed only after creation), or None. On secondary nodes,
      it's not required to return anything.
Iustin Pop's avatar
Iustin Pop committed
1082
1083
1084
1085
1086

  """
  clist = []
  if disk.children:
    for child in disk.children:
1087
      crdev = _RecursiveAssembleBD(child, owner, on_primary)
Iustin Pop's avatar
Iustin Pop committed
1088
1089
1090
1091
1092
1093
      if on_primary or disk.AssembleOnSecondary():
        # we need the children open in case the device itself has to
        # be assembled
        crdev.Open()
      clist.append(crdev)

1094
1095
1096
1097
  try:
    device = bdev.Create(disk.dev_type, disk.physical_id, clist, size)
  except errors.GenericError, err:
    return False, "Can't create block device: %s" % str(err)
Iustin Pop's avatar
Iustin Pop committed
1098

Iustin Pop's avatar
Iustin Pop committed
1099
  if on_primary or disk.AssembleOnSecondary():
1100
    if not device.Assemble():
1101
      errorstring = "Can't assemble device after creation, very unusual event"
1102
      logging.error(errorstring)
1103
      return False, errorstring
1104
    device.SetSyncSpeed(constants.SYNC_SPEED)
Iustin Pop's avatar
Iustin Pop committed
1105
1106
    if on_primary or disk.OpenOnSecondary():
      device.Open(force=True)
1107
1108
    DevCacheManager.UpdateCache(device.dev_path, owner,
                                on_primary, disk.iv_name)
1109
1110
1111

  device.SetInfo(info)

Iustin Pop's avatar
Iustin Pop committed
1112
  physical_id = device.unique_id
1113
  return True, physical_id
Iustin Pop's avatar
Iustin Pop committed
1114
1115
1116
1117
1118


def RemoveBlockDevice(disk):
  """Remove a block device.

  The device itself is removed first (a device that cannot be found
  counts as already removed), then its children, recursively. Children
  are only processed as long as every previous removal succeeded.

  @note: This is intended to be called recursively.

  @type disk: L{objects.Disk}
  @param disk: the disk object we should remove
  @rtype: boolean
  @return: the success of the operation

  """
  try:
    device = _RecursiveFindBD(disk)
  except errors.BlockDeviceError:
    # probably can't attach
    logging.info("Can't attach to device %s in remove", disk)
    device = None

  if device is None:
    success = True
  else:
    # remember the path before removal, for the cache cleanup
    cached_path = device.dev_path
    success = device.Remove()
    if success:
      DevCacheManager.RemoveCache(cached_path)

  for child in disk.children or []:
    # short-circuit: no further removals after the first failure
    success = success and RemoveBlockDevice(child)
  return success


1146
def _RecursiveAssembleBD(disk, owner, as_primary):
Iustin Pop's avatar
Iustin Pop committed
1147
1148
1149
1150
  """Activate a block device for an instance.

  This is run on the primary and secondary nodes for an instance.

Iustin Pop's avatar
Iustin Pop committed
1151
  @note: this function is called recursively.
Iustin Pop's avatar
Iustin Pop committed
1152

Iustin Pop's avatar
Iustin Pop committed
1153
1154
1155
1156
1157
1158
1159
  @type disk: L{objects.Disk}
  @param disk: the disk we try to assemble
  @type owner: str
  @param owner: the name of the instance which owns the disk
  @type as_primary: boolean
  @param as_primary: if we should make the block device
      read/write
Iustin Pop's avatar
Iustin Pop committed
1160

Iustin Pop's avatar
Iustin Pop committed
1161
1162
1163
1164
1165
  @return: the assembled device or None (in case no device
      was assembled)
  @raise errors.BlockDeviceError: in case there is an error
      during the activation of the children or the device
      itself
Iustin Pop's avatar
Iustin Pop committed
1166
1167
1168
1169

  """
  children = []
  if disk.children:
1170
1171
1172
1173
1174
    mcn = disk.ChildrenNeeded()
    if mcn == -1:
      mcn = 0 # max number of Nones allowed
    else:
      mcn = len(disk.children) - mcn # max number of Nones
Iustin Pop's avatar
Iustin Pop committed
1175
    for chld_disk in disk.children:
1176
1177
1178
      try:
        cdev = _RecursiveAssembleBD(chld_disk, owner, as_primary)
      except errors.BlockDeviceError, err:
1179
        if children.count(None) >= mcn:
1180
1181
          raise
        cdev = None
1182
        logging.debug("Error in child activation: %s", str(err))