backend.py 80.1 KB
Newer Older
Iustin Pop's avatar
Iustin Pop committed
1
#
Iustin Pop's avatar
Iustin Pop committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#

# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions used by the node daemon"""


import os
import os.path
import shutil
import time
import stat
import errno
import re
import subprocess
33
import random
34
import logging
35
import tempfile
36
37
import zlib
import base64
Iustin Pop's avatar
Iustin Pop committed
38
39
40
41
42
43
44
45

from ganeti import errors
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti import constants
from ganeti import bdev
from ganeti import objects
46
from ganeti import ssconf
Iustin Pop's avatar
Iustin Pop committed
47
48


Michael Hanselmann's avatar
Michael Hanselmann committed
49
def _GetConfig():
  """Build a fresh simple-store configuration accessor.

  @rtype: L{ssconf.SimpleStore}
  @return: a newly constructed SimpleStore instance

  """
  store = ssconf.SimpleStore()
  return store
Michael Hanselmann's avatar
Michael Hanselmann committed
57
58


59
def _GetSshRunner(cluster_name):
  """Build an SshRunner bound to the given cluster.

  @type cluster_name: str
  @param cluster_name: the cluster name, passed straight to the
      SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: a newly constructed SshRunner instance

  """
  runner = ssh.SshRunner(cluster_name)
  return runner
70
71


72
73
74
75
76
77
78
79
80
def _Decompress(data):
  """Decode a payload that was packed by the RPC client.

  @type data: list or tuple
  @param data: two-element sequence of (encoding, content)
  @rtype: str
  @return: the content itself when it was sent unencoded, otherwise the
      result of base64-decoding and zlib-decompressing it
  @raise AssertionError: if data is not a two-element list/tuple or
      the encoding is not one of the known ones

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  encoding, payload = data

  if encoding == constants.RPC_ENCODING_NONE:
    return payload

  if encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    compressed = base64.b64decode(payload)
    return zlib.decompress(compressed)

  raise AssertionError("Unknown data encoding")


92
def _CleanDirectory(path, exclude=None):
  """Delete the regular files found directly inside a directory.

  Nothing is done if C{path} is not a directory. Symlinks and anything
  that is not a regular file are left alone.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: full paths of files which must be kept; they are
      normalized before being compared. Defaults to no exclusions.

  """
  if not os.path.isdir(path):
    return

  if exclude is None:
    kept = []
  else:
    # Normalize so the comparison below is not fooled by "a//b" or "a/./b"
    kept = [os.path.normpath(entry) for entry in exclude]

  for rel_name in utils.ListVisibleFiles(path):
    candidate = os.path.normpath(os.path.join(path, rel_name))
    if candidate in kept:
      continue
    if os.path.islink(candidate) or not os.path.isfile(candidate):
      continue
    utils.RemoveFile(candidate)


118
def JobQueuePurge():
  """Delete the job queue files and the archived jobs.

  The queue directory is cleaned except for the queue lock file; the
  archive directory is cleaned without exceptions.

  @rtype: None

  """
  preserved = [constants.JOB_QUEUE_LOCK_FILE]
  _CleanDirectory(constants.QUEUE_DIR, exclude=preserved)
  _CleanDirectory(constants.JOB_QUEUE_ARCHIVE_DIR)


128
129
130
131
132
133
134
def GetMasterInfo():
  """Look up the master's network device, IP and node name.

  This is a utility function to compute master information, either
  for consumption here or from the node daemon.

  @rtype: tuple
  @return: (master_netdev, master_ip, master_node) when the configuration
      can be read, otherwise (None, None, None)

  """
  try:
    cfg = _GetConfig()
    info = (cfg.GetMasterNetdev(), cfg.GetMasterIP(), cfg.GetMasterNode())
  except errors.ConfigurationError:
    logging.exception("Cluster configuration incomplete")
    return (None, None, None)
  return info
148
149


150
def StartMaster(start_daemons):
  """Activate the local node as master node.

  The function always tries to activate the master IP address, unless
  something already answers on it. Depending on C{start_daemons} it
  also starts the master daemons.

  @type start_daemons: boolean
  @param start_daemons: whether to also start the master
      daemons (ganeti-masterd and ganeti-rapi)
  @rtype: boolean
  @return: C{False} if the master network device is not known, if
      another host holds the master IP, or if adding the IP or starting
      a daemon failed; C{True} otherwise

  """
  master_netdev, master_ip, _ = GetMasterInfo()
  if not master_netdev:
    return False

  success = True

  if not utils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
    # nothing answers on the master IP, so bring it up locally
    add_result = utils.RunCmd(["ip", "address", "add", "%s/32" % master_ip,
                               "dev", master_netdev, "label",
                               "%s:0" % master_netdev])
    if add_result.failed:
      logging.error("Can't activate master IP: %s", add_result.output)
      success = False

    # the exit code of arping is deliberately ignored
    utils.RunCmd(["arping", "-q", "-U", "-c 3", "-I", master_netdev,
                  "-s", master_ip, master_ip])
  elif utils.OwnIpAddress(master_ip):
    # we already have the ip:
    logging.debug("Already started")
  else:
    logging.error("Someone else has the master ip, not activating")
    success = False

  # and now start the master and rapi daemons
  if start_daemons:
    for daemon in ('ganeti-masterd', 'ganeti-rapi'):
      daemon_result = utils.RunCmd([daemon])
      if daemon_result.failed:
        logging.error("Can't start daemon %s: %s", daemon,
                      daemon_result.output)
        success = False

  return success
Iustin Pop's avatar
Iustin Pop committed
195
196


197
def StopMaster(stop_daemons):
  """Deactivate this node as master.

  The function always tries to remove the master IP address from the
  master network device. Depending on C{stop_daemons} it also stops the
  master daemons.

  @type stop_daemons: boolean
  @param stop_daemons: whether to also stop the master daemons
      (ganeti-masterd and ganeti-rapi)
  @rtype: boolean
  @return: C{False} if the master network device is not known,
      C{True} otherwise (a failure to remove the IP is only logged)

  """
  master_netdev, master_ip, _ = GetMasterInfo()
  if not master_netdev:
    return False

  del_cmd = ["ip", "address", "del", "%s/32" % master_ip,
             "dev", master_netdev]
  removal = utils.RunCmd(del_cmd)
  if removal.failed:
    # logged, but otherwise the failure is ignored
    logging.error("Can't remove the master IP, error: %s", removal.output)

  if stop_daemons:
    # stop/kill the rapi and the master daemon
    for daemon in (constants.RAPI_PID, constants.MASTERD_PID):
      pid_file = utils.DaemonPidFileName(daemon)
      utils.KillProcess(utils.ReadPidFile(pid_file))

  return True


Iustin Pop's avatar
Iustin Pop committed
228
def AddNode(dsa, dsapub, rsa, rsapub, sshkey, sshpub):
229
  """Joins this node to the cluster.
Iustin Pop's avatar
Iustin Pop committed
230

231
232
233
234
  This does the following:
      - updates the hostkeys of the machine (rsa and dsa)
      - adds the ssh private key to the user
      - adds the ssh public key to the users' authorized_keys file
Iustin Pop's avatar
Iustin Pop committed
235

Iustin Pop's avatar
Iustin Pop committed
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
  @type dsa: str
  @param dsa: the DSA private key to write
  @type dsapub: str
  @param dsapub: the DSA public key to write
  @type rsa: str
  @param rsa: the RSA private key to write
  @type rsapub: str
  @param rsapub: the RSA public key to write
  @type sshkey: str
  @param sshkey: the SSH private key to write
  @type sshpub: str
  @param sshpub: the SSH public key to write
  @rtype: boolean
  @return: the success of the operation

251
  """
252
253
254
255
  sshd_keys =  [(constants.SSH_HOST_RSA_PRIV, rsa, 0600),
                (constants.SSH_HOST_RSA_PUB, rsapub, 0644),
                (constants.SSH_HOST_DSA_PRIV, dsa, 0600),
                (constants.SSH_HOST_DSA_PUB, dsapub, 0644)]
256
  for name, content, mode in sshd_keys:
257
    utils.WriteFile(name, data=content, mode=mode)
Iustin Pop's avatar
Iustin Pop committed
258

259
260
261
262
  try:
    priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS,
                                                    mkdir=True)
  except errors.OpExecError, err:
263
264
265
    msg = "Error while processing user ssh files"
    logging.exception(msg)
    return (False, "%s: %s" % (msg, err))
Iustin Pop's avatar
Iustin Pop committed
266

267
268
  for name, content in [(priv_key, sshkey), (pub_key, sshpub)]:
    utils.WriteFile(name, data=content, mode=0600)
Iustin Pop's avatar
Iustin Pop committed
269

270
  utils.AddAuthorizedKey(auth_keys, sshpub)
Iustin Pop's avatar
Iustin Pop committed
271

272
  utils.RunCmd([constants.SSH_INITD_SCRIPT, "restart"])
Iustin Pop's avatar
Iustin Pop committed
273

274
  return (True, "Node added successfully")
Iustin Pop's avatar
Iustin Pop committed
275
276
277


def LeaveCluster():
  """Clean up the current node and prepare it for removal.

  The data directory and the job queue are purged, the ganeti user's
  public key is dropped from its authorized_keys file and the user's
  key pair is deleted.

  If processing is successful, the function raises an
  L{errors.QuitGanetiException}, which is used as a special case to
  shut down the node daemon. If the user's ssh files cannot be located,
  the error is logged and the function returns without raising.

  """
  _CleanDirectory(constants.DATA_DIR)
  JobQueuePurge()

  try:
    priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS)
  except errors.OpExecError:
    logging.exception("Error while processing ssh files")
    return

  pub_file = open(pub_key, 'r')
  try:
    key_text = pub_file.read(8192)
    utils.RemoveAuthorizedKey(auth_keys, key_text)
  finally:
    pub_file.close()

  for key_file in (priv_key, pub_key):
    utils.RemoveFile(key_file)

  # Return a reassuring string to the caller, and quit
  raise errors.QuitGanetiException(False, 'Shutdown scheduled')

Iustin Pop's avatar
Iustin Pop committed
309

310
def GetNodeInfo(vgname, hypervisor_type):
  """Collect storage, hypervisor and boot information about this node.

  @type vgname: C{string}
  @param vgname: the name of the volume group to ask for disk space
      information
  @type hypervisor_type: C{str}
  @param hypervisor_type: the name of the hypervisor to ask for
      node information
  @rtype: C{dict}
  @return: dictionary with the following keys:
      - vg_size is the size of the configured volume group in MiB
      - vg_free is the free size of the volume group in MiB
      - bootid is the content of the kernel's boot_id file
      - whatever keys the hypervisor's GetNodeInfo() reports, if it
        returns anything (presumably memory_dom0, memory_free and
        memory_total in MiB; depends on the hypervisor)

  """
  vginfo = _GetVGInfo(vgname)
  node_info = {
    'vg_size': vginfo['vg_size'],
    'vg_free': vginfo['vg_free'],
    }

  hyp_info = hypervisor.GetHypervisor(hypervisor_type).GetNodeInfo()
  if hyp_info is not None:
    node_info.update(hyp_info)

  boot_id_file = open("/proc/sys/kernel/random/boot_id", 'r')
  try:
    node_info["bootid"] = boot_id_file.read(128).rstrip("\n")
  finally:
    boot_id_file.close()

  return node_info


346
def VerifyNode(what, cluster_name):
Iustin Pop's avatar
Iustin Pop committed
347
348
  """Verify the status of the local node.

349
350
351
352
353
354
355
356
357
  Based on the input L{what} parameter, various checks are done on the
  local node.

  If the I{filelist} key is present, this list of
  files is checksummed and the file/checksum pairs are returned.

  If the I{nodelist} key is present, we check that we have
  connectivity via ssh with the target nodes (and check the hostname
  report).
Iustin Pop's avatar
Iustin Pop committed
358

359
360
361
362
363
364
365
366
367
368
369
  If the I{node-net-test} key is present, we check that we have
  connectivity to the given nodes via both primary IP and, if
  applicable, secondary IPs.

  @type what: C{dict}
  @param what: a dictionary of things to check:
      - filelist: list of files for which to compute checksums
      - nodelist: list of nodes we should check ssh communication with
      - node-net-test: list of nodes we should check node daemon port
        connectivity with
      - hypervisor: list with hypervisors to run the verify for
Iustin Pop's avatar
Iustin Pop committed
370
371
372
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks
Iustin Pop's avatar
Iustin Pop committed
373
374
375
376

  """
  result = {}

377
378
379
380
381
382
383
384
385
386
387
388
389
  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = tmp = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      tmp[hv_name] = hypervisor.GetHypervisor(hv_name).Verify()

  if constants.NV_FILELIST in what:
    result[constants.NV_FILELIST] = utils.FingerprintFiles(
      what[constants.NV_FILELIST])

  if constants.NV_NODELIST in what:
    result[constants.NV_NODELIST] = tmp = {}
    random.shuffle(what[constants.NV_NODELIST])
    for node in what[constants.NV_NODELIST]:
390
      success, message = _GetSshRunner(cluster_name).VerifyNodeHostname(node)
Iustin Pop's avatar
Iustin Pop committed
391
      if not success:
392
393
394
395
        tmp[node] = message

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
396
397
    my_name = utils.HostInfo().name
    my_pip = my_sip = None
398
    for name, pip, sip in what[constants.NV_NODENETTEST]:
399
400
401
402
403
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
404
405
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
406
    else:
Michael Hanselmann's avatar
Michael Hanselmann committed
407
      port = utils.GetNodeDaemonPort()
408
      for name, pip, sip in what[constants.NV_NODENETTEST]:
409
410
411
412
413
414
415
        fail = []
        if not utils.TcpPing(pip, port, source=my_pip):
          fail.append("primary")
        if sip != pip:
          if not utils.TcpPing(sip, port, source=my_sip):
            fail.append("secondary")
        if fail:
416
417
418
419
420
421
422
423
424
425
426
427
428
429
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

  if constants.NV_LVLIST in what:
    result[constants.NV_LVLIST] = GetVolumeList(what[constants.NV_LVLIST])

  if constants.NV_INSTANCELIST in what:
    result[constants.NV_INSTANCELIST] = GetInstanceList(
      what[constants.NV_INSTANCELIST])

  if constants.NV_VGLIST in what:
    result[constants.NV_VGLIST] = ListVolumeGroups()

  if constants.NV_VERSION in what:
430
431
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)
432
433
434
435

  if constants.NV_HVINFO in what:
    hyper = hypervisor.GetHypervisor(what[constants.NV_HVINFO])
    result[constants.NV_HVINFO] = hyper.GetNodeInfo()
436

437
438
439
  if constants.NV_DRBDLIST in what:
    try:
      used_minors = bdev.DRBD8.GetUsedDevs().keys()
440
    except errors.BlockDeviceError, err:
441
      logging.warning("Can't get used minors list", exc_info=True)
442
      used_minors = str(err)
443
444
    result[constants.NV_DRBDLIST] = used_minors

Iustin Pop's avatar
Iustin Pop committed
445
446
447
448
449
450
  return result


def GetVolumeList(vg_name):
  """Compute list of logical volumes and their size.

  @type vg_name: str
  @param vg_name: the volume group whose LVs we should list
  @rtype: dict
  @return:
      dictionary of all partitions (key) with value being a tuple of
      their size (in MiB, as the string printed by lvs), inactive and
      online status::

        {'test1': ('20.06', True, True)}

      in case of errors, a string is returned with the error
      details.

  """
  lvs = {}
  result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
                         "--separator=|",
                         "-olv_name,lv_size,lv_attr", vg_name])
  if result.failed:
    logging.error("Failed to list logical volumes, lvs output: %s",
                  result.output)
    return result.output

  # lv_attr is six characters long on older LVM releases and longer on
  # newer ones; accept both, since only the first six are inspected below
  valid_line_re = re.compile(r"^ *([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")
  for line in result.stdout.splitlines():
    line = line.strip()
    match = valid_line_re.match(line)
    if not match:
      logging.error("Invalid line returned from lvs output: '%s'", line)
      continue
    name, size, attr = match.groups()
    # fifth attribute character is the state, '-' meaning not active
    inactive = attr[4] == '-'
    # sixth attribute character is 'o' when the device is open
    online = attr[5] == 'o'
    lvs[name] = (size, inactive, online)

  return lvs
Iustin Pop's avatar
Iustin Pop committed
487
488
489


def ListVolumeGroups():
  """List the volume groups and their size.

  This simply forwards to L{utils.ListVolumeGroups}.

  @rtype: dict
  @return: dictionary with keys volume name and values the
      size of the volume

  """
  volume_groups = utils.ListVolumeGroups()
  return volume_groups


500
501
502
def NodeVolumes():
  """List all volumes on this node.

  @rtype: list
  @return:
    A list of dictionaries, each having four keys:
      - name: the logical volume name,
      - size: the size of the logical volume
      - dev: the physical device on which the LV lives
      - vg: the volume group to which it belongs

    In case of errors, we return an empty list and log the
    error.

    Note that since a logical volume can live on multiple physical
    volumes, the resulting list might include a logical volume
    multiple times.

  """
  result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
                         "--separator=|",
                         "--options=lv_name,lv_size,devices,vg_name"])
  if result.failed:
    logging.error("Failed to list logical volumes, lvs output: %s",
                  result.output)
    return []

  volumes = []
  for line in result.stdout.splitlines():
    # lines without at least four fields are silently skipped
    if line.count('|') < 3:
      continue
    fields = line.split('|')
    # the devices field looks like "device(extent)"; keep the device only
    device = fields[2].strip().split('(', 1)[0]
    volumes.append({
      'name': fields[0].strip(),
      'size': fields[1].strip(),
      'dev': device,
      'vg': fields[3].strip(),
      })

  return volumes
543
544


Iustin Pop's avatar
Iustin Pop committed
545
def BridgesExist(bridges_list):
  """Check if a list of bridges exist on the current node.

  The check stops at the first missing bridge.

  @type bridges_list: list
  @param bridges_list: the names of the bridges to check
  @rtype: boolean
  @return: C{True} if all of them exist, C{False} otherwise

  """
  for bridge in bridges_list:
    if utils.BridgeExists(bridge):
      continue
    return False

  return True


559
def GetInstanceList(hypervisor_list):
  """Provides a list of instances.

  @type hypervisor_list: list
  @param hypervisor_list: the list of hypervisors to query information

  @rtype: list
  @return: the instance names reported by all the given hypervisors on
      the current node, e.g.:
    - instance1.example.com
    - instance2.example.com
  @raise errors.HypervisorError: re-raised, after logging, if a
      hypervisor fails to enumerate its instances

  """
  instances = []
  for hname in hypervisor_list:
    try:
      instances.extend(hypervisor.GetHypervisor(hname).ListInstances())
    except errors.HypervisorError:
      logging.exception("Error enumerating instances for hypevisor %s", hname)
      raise

  return instances
Iustin Pop's avatar
Iustin Pop committed
581
582


583
def GetInstanceInfo(instance, hname):
  """Gives back the information about an instance as a dictionary.

  @type instance: string
  @param instance: the instance name
  @type hname: string
  @param hname: the hypervisor type of the instance

  @rtype: dict
  @return: an empty dictionary if the hypervisor has no data for the
      instance, otherwise a dictionary with the following keys:
      - memory: memory size of instance (int)
      - state: xen state of instance (string)
      - time: cpu time of instance (float)

  """
  iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance)
  if iinfo is None:
    return {}

  # only three fields of the hypervisor's answer are exposed
  return {
    'memory': iinfo[2],
    'state': iinfo[4],
    'time': iinfo[5],
    }


609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
def GetInstanceMigratable(instance):
  """Gives whether an instance can be migrated.

  An instance is migratable when its hypervisor lists it and every one
  of its disks has its block device symlink in place.

  @type instance: L{objects.Instance}
  @param instance: object representing the instance to be checked.

  @rtype: tuple
  @return: tuple of (result, description) where:
      - result: whether the instance can be migrated or not
      - description: a description of the issue, if relevant

  """
  running = hypervisor.GetHypervisor(instance.hypervisor).ListInstances()
  if instance.name not in running:
    return (False, 'not running')

  for idx, _ in enumerate(instance.disks):
    if not os.path.islink(_GetBlockDevSymlinkPath(instance.name, idx)):
      return (False, 'not restarted since ganeti 1.2.5')

  return (True, '')


633
def GetAllInstancesInfo(hypervisor_list):
  """Gather data about all instances.

  This is the equivalent of L{GetInstanceInfo}, except that it
  computes data for all instances at once, thus being faster if one
  needs data about more than one instance.

  @type hypervisor_list: list
  @param hypervisor_list: list of hypervisors to query for instance data

  @rtype: dict
  @return: dictionary of instance: data, with data having the following keys:
      - memory: memory size of instance (int)
      - state: xen state of instance (string)
      - time: cpu time of instance (float)
      - vcpus: the number of vcpus
  @raise errors.HypervisorError: if the same instance name is reported
      more than once with differing data

  """
  output = {}

  for hname in hypervisor_list:
    iinfo = hypervisor.GetHypervisor(hname).GetAllInstancesInfo()
    if not iinfo:
      continue
    for name, _, memory, vcpus, state, times in iinfo:
      value = {
        'memory': memory,
        'vcpus': vcpus,
        'state': state,
        'time': times,
        }
      # an identical duplicate is tolerated, a conflicting one is not
      if name in output and output[name] != value:
        raise errors.HypervisorError("Instance %s running duplicate"
                                     " with different parameters" % name)
      output[name] = value

  return output


671
def InstanceOsAdd(instance):
Alexander Schreiber's avatar
Alexander Schreiber committed
672
  """Add an OS to an instance.
Iustin Pop's avatar
Iustin Pop committed
673

674
675
  @type instance: L{objects.Instance}
  @param instance: Instance whose OS is to be installed
Iustin Pop's avatar
Iustin Pop committed
676
677
  @rtype: boolean
  @return: the success of the operation
Iustin Pop's avatar
Iustin Pop committed
678
679

  """
680
681
682
683
684
685
686
687
688
  try:
    inst_os = OSFromDisk(instance.os)
  except errors.InvalidOS, err:
    os_name, os_dir, os_err = err.args
    if os_dir is None:
      return (False, "Can't find OS '%s': %s" % (os_name, os_err))
    else:
      return (False, "Error parsing OS '%s' in directory %s: %s" %
              (os_name, os_dir, os_err))
Iustin Pop's avatar
Iustin Pop committed
689

690
  create_env = OSEnvironment(instance)
Iustin Pop's avatar
Iustin Pop committed
691
692
693

  logfile = "%s/add-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
                                     instance.name, int(time.time()))
694

695
696
  result = utils.RunCmd([inst_os.create_script], env=create_env,
                        cwd=inst_os.path, output=logfile,)
697
  if result.failed:
698
    logging.error("os create command '%s' returned error: %s, logfile: %s,"
699
                  " output: %s", result.cmd, result.fail_reason, logfile,
700
                  result.output)
701
    lines = [utils.SafeEncode(val)
702
703
704
             for val in utils.TailFile(logfile, lines=20)]
    return (False, "OS create script failed (%s), last lines in the"
            " log file:\n%s" % (result.fail_reason, "\n".join(lines)))
705

706
  return (True, "Successfully installed")
707
708


709
def RunRenameInstance(instance, old_name):
  """Run the OS rename script for an instance.

  The script is run with the instance's OS environment plus
  C{OLD_INSTANCE_NAME}, and its output is redirected to a log file
  under the OS log directory.

  @type instance: L{objects.Instance}
  @param instance: Instance which is being renamed
  @type old_name: string
  @param old_name: previous instance name
  @rtype: tuple
  @return: (success, message); on a script failure the message
      contains the last lines of the log file

  """
  inst_os = OSFromDisk(instance.os)

  rename_env = OSEnvironment(instance)
  rename_env['OLD_INSTANCE_NAME'] = old_name

  logfile = "%s/rename-%s-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
                                           old_name,
                                           instance.name, int(time.time()))

  result = utils.RunCmd([inst_os.rename_script], env=rename_env,
                        cwd=inst_os.path, output=logfile)

  if result.failed:
    # this is the rename script, so do not report it as "create"
    logging.error("os rename command '%s' returned error: %s output: %s",
                  result.cmd, result.fail_reason, result.output)
    lines = [utils.SafeEncode(val)
             for val in utils.TailFile(logfile, lines=20)]
    return (False, "OS rename script failed (%s), last lines in the"
            " log file:\n%s" % (result.fail_reason, "\n".join(lines)))

  return (True, "Rename successful")
Iustin Pop's avatar
Iustin Pop committed
741
742
743
744
745


def _GetVGInfo(vg_name):
  """Get information about the volume group.

  @type vg_name: str
  @param vg_name: the volume group which we query
  @rtype: dict
  @return:
    A dictionary with the following keys:
      - C{vg_size} is the total size of the volume group in MiB
      - C{vg_free} is the free size of the volume group in MiB
      - C{pv_count} are the number of physical disks in that VG

    If an error occurs during gathering of data, we return the same dict
    with keys all set to None.

  """
  # answer used on every failure path: all keys present, all values None
  empty = dict.fromkeys(["vg_size", "vg_free", "pv_count"])

  retval = utils.RunCmd(["vgs", "-ovg_size,vg_free,pv_count", "--noheadings",
                         "--nosuffix", "--units=m", "--separator=:", vg_name])
  if retval.failed:
    logging.error("volume group %s not present", vg_name)
    return empty

  valarr = retval.stdout.strip().rstrip(':').split(':')
  if len(valarr) != 3:
    logging.error("vgs output has the wrong number of fields (expected"
                  " three): %s", str(valarr))
    return empty

  try:
    # sizes are printed as decimals and rounded to whole MiB
    vg_size = int(round(float(valarr[0]), 0))
    vg_free = int(round(float(valarr[1]), 0))
    pv_count = int(valarr[2])
  except ValueError:
    logging.exception("Fail to parse vgs output")
    return empty

  return {
    "vg_size": vg_size,
    "vg_free": vg_free,
    "pv_count": pv_count,
    }


783
784
785
786
787
788
def _GetBlockDevSymlinkPath(instance_name, idx):
  """Compute the path of the symlink for one disk of an instance.

  @param instance_name: the name of the instance
  @param idx: the disk index (formatted as an integer)
  @return: the path C{<instance_name>:<idx>} inside the disk links
      directory

  """
  link_basename = "%s:%d" % (instance_name, idx)
  return os.path.join(constants.DISK_LINKS_DIR, link_basename)


def _SymlinkBlockDev(instance_name, device_path, idx):
  """Set up the symlink to one of an instance's block devices.

  This is an auxiliary function run when an instance is started (on the
  primary node) or when an instance is migrated (on the target node).

  If something already exists at the link path, it is kept when it is a
  symlink that already points to C{device_path}; otherwise it is removed
  and the symlink is created again.

  @param instance_name: the name of the target instance
  @param device_path: path of the physical block device, on the node
  @param idx: the disk index
  @return: the path of the disk's symlink
  @raise OSError: if the symlink cannot be created for a reason other
      than the path already existing, or if replacing an existing entry
      fails

  """
  link_name = _GetBlockDevSymlinkPath(instance_name, idx)
  try:
    os.symlink(device_path, link_name)
  except OSError as err:
    # Only an already existing path is handled here
    if err.errno != errno.EEXIST:
      raise
    points_to_device = (os.path.islink(link_name) and
                        os.readlink(link_name) == device_path)
    if not points_to_device:
      # Stale or foreign entry: replace it
      os.remove(link_name)
      os.symlink(device_path, link_name)

  return link_name


816
def _RemoveBlockDevLinks(instance_name, disks):
  """Remove the block device symlinks belonging to the given instance.

  Removal is best-effort: a failure to remove a link is logged and the
  remaining links are still processed.

  @param instance_name: the name of the instance owning the links
  @param disks: the instance's disks; only their position in the
      sequence is used, to compute each link's path

  """
  for idx, _ in enumerate(disks):
    link_name = _GetBlockDevSymlinkPath(instance_name, idx)
    if not os.path.islink(link_name):
      # Missing, or not a symlink: leave it alone
      continue
    try:
      os.remove(link_name)
    except OSError:
      logging.exception("Can't remove symlink '%s'", link_name)
Iustin Pop's avatar
Iustin Pop committed
827
828


829
def _GatherAndLinkBlockDevs(instance):
  """Set up an instance's block device(s).

  This is run on the primary node at instance startup. The block
  devices must be already assembled.

  Each disk's device is looked up, opened, and a symlink to it is
  created via L{_SymlinkBlockDev}.

  @type instance: L{objects.Instance}
  @param instance: the instance whose disks we should set up
  @rtype: list
  @return: list of (disk_object, link_name) pairs, where link_name is
      the path of the symlink created for that disk
  @raise errors.BlockDeviceError: if a disk's device is not found or
      its symlink cannot be created

  """
  linked_disks = []
  for idx, disk in enumerate(instance.disks):
    found_dev = _RecursiveFindBD(disk)
    if found_dev is None:
      raise errors.BlockDeviceError("Block device '%s' is not set up." %
                                    str(disk))
    found_dev.Open()

    try:
      link_path = _SymlinkBlockDev(instance.name, found_dev.dev_path, idx)
    except OSError as err:
      # Report symlink problems as block device errors to the caller
      raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
                                    err.strerror)

    linked_disks.append((disk, link_path))

  return linked_disks


859
def StartInstance(instance):
  """Start an instance.

  If the instance is already listed as running under its hypervisor,
  nothing is done and success is reported.

  @type instance: L{objects.Instance}
  @param instance: the instance object
  @rtype: tuple
  @return: a tuple of (success, message), where success is a boolean
      and message is a string describing the outcome

  """
  hv_name = instance.hypervisor

  if instance.name in GetInstanceList([hv_name]):
    return (True, "Already running")

  try:
    disk_links = _GatherAndLinkBlockDevs(instance)
    hyper = hypervisor.GetHypervisor(instance.hypervisor)
    hyper.StartInstance(instance, disk_links)
  except errors.BlockDeviceError as err:
    logging.exception("Failed to start instance")
    return (False, "Block device error: %s" % str(err))
  except errors.HypervisorError as err:
    logging.exception("Failed to start instance")
    # The instance did not start, so drop the disk symlinks again
    _RemoveBlockDevLinks(instance.name, instance.disks)
    return (False, "Hypervisor error: %s" % str(err))
  else:
    return (True, "Instance started successfully")
Iustin Pop's avatar
Iustin Pop committed
886
887


888
def InstanceShutdown(instance):
  """Shut an instance down.

  A normal stop is requested first; if the instance is still listed as
  running after the polling period, a forced stop is attempted. On
  success the instance's block device symlinks are removed.

  @note: this function uses polling with a hardcoded timeout.

  @type instance: L{objects.Instance}
  @param instance: the instance object
  @rtype: tuple
  @return: a tuple of (success, message), where success is a boolean
      telling whether the instance is stopped and message is a string
      describing the outcome

  """
  hv_name = instance.hypervisor

  if instance.name not in GetInstanceList([hv_name]):
    return (True, "Instance already stopped")

  hyper = hypervisor.GetHypervisor(hv_name)
  try:
    hyper.StopInstance(instance)
  except errors.HypervisorError as err:
    msg = "Failed to stop instance %s: %s" % (instance.name, err)
    logging.error(msg)
    return (False, msg)

  # Give the hypervisor a moment, then check up to 11 times, sleeping
  # 10 seconds after each check that still sees the instance
  time.sleep(1)
  stopped = False
  for _ in range(11):
    if instance.name not in GetInstanceList([hv_name]):
      stopped = True
      break
    time.sleep(10)

  if not stopped:
    # the shutdown did not succeed
    logging.error("Shutdown of '%s' unsuccessful, using destroy",
                  instance.name)

    try:
      hyper.StopInstance(instance, force=True)
    except errors.HypervisorError as err:
      msg = "Failed to force stop instance %s: %s" % (instance.name, err)
      logging.error(msg)
      return (False, msg)

    time.sleep(1)
    if instance.name in GetInstanceList([hv_name]):
      msg = ("Could not shutdown instance %s even by destroy" %
             instance.name)
      logging.error(msg)
      return (False, msg)

  _RemoveBlockDevLinks(instance.name, instance.disks)

  return (True, "Instance has been shutdown successfully")
Iustin Pop's avatar
Iustin Pop committed
942
943


944
def InstanceReboot(instance, reboot_type):
  """Reboot an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance object to reboot
  @type reboot_type: str
  @param reboot_type: the type of reboot, one of the following
    constants:
      - L{constants.INSTANCE_REBOOT_SOFT}: the reboot is delegated to
        the hypervisor's C{RebootInstance}
      - L{constants.INSTANCE_REBOOT_HARD}: the instance is shut down
        via L{InstanceShutdown} and started again via
        L{StartInstance}
      - any other value is not accepted here and yields a failure
        result
  @rtype: tuple
  @return: a tuple of (success, message), where success is a boolean
      and message is a string describing the outcome

  """
  if instance.name not in GetInstanceList([instance.hypervisor]):
    msg = "Cannot reboot instance %s that is not running" % instance.name
    logging.error(msg)
    return (False, msg)

  hyper = hypervisor.GetHypervisor(instance.hypervisor)

  if reboot_type == constants.INSTANCE_REBOOT_SOFT:
    try:
      hyper.RebootInstance(instance)
    except errors.HypervisorError as err:
      msg = "Failed to soft reboot instance %s: %s" % (instance.name, err)
      logging.error(msg)
      return (False, msg)
    return (True, "Reboot successful")

  if reboot_type == constants.INSTANCE_REBOOT_HARD:
    try:
      shutdown_result = InstanceShutdown(instance)
      if not shutdown_result[0]:
        # Pass the shutdown failure on unchanged
        return shutdown_result
      return StartInstance(instance)
    except errors.HypervisorError as err:
      msg = "Failed to hard reboot instance %s: %s" % (instance.name, err)
      logging.error(msg)
      return (False, msg)

  return (False, "Invalid reboot_type received: %s" % (reboot_type,))
992
993


994
995
996
997
998
999
1000
def MigrationInfo(instance):
  """Gather information about an instance to be migrated.

  The information is obtained from the instance's hypervisor.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @rtype: tuple
  @return: C{(True, info)} with the data returned by the hypervisor, or
      C{(False, message)} if the hypervisor raised an error

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    migration_data = hyper.MigrationInfo(instance)
  except errors.HypervisorError as err:
    msg = "Failed to fetch migration information"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
  else:
    return (True, migration_data)
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021


def AcceptInstance(instance, info, target):
  """Prepare the node to accept an instance.

  The work is delegated to the instance's hypervisor.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type target: string
  @param target: target host (usually ip), on this node
  @rtype: tuple
  @return: a tuple of (success, message), where success is a boolean
      and message is a string describing the outcome

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.AcceptInstance(instance, info, target)
  except errors.HypervisorError as err:
    msg = "Failed to accept instance"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
  else:
    return (True, "Accept successfull")


def FinalizeMigration(instance, info, success):
  """Finalize any preparation to accept an instance.

  The work is delegated to the instance's hypervisor.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type success: boolean
  @param success: whether the migration was a success or a failure
  @rtype: tuple
  @return: a tuple of (status, message), where status is a boolean
      telling whether the finalization itself worked and message is a
      string describing the outcome

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.FinalizeMigration(instance, info, success)
  except errors.HypervisorError as err:
    msg = "Failed to finalize migration"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
  else:
    return (True, "Migration Finalized")


1053
1054
1055
def MigrateInstance(instance, target, live):
  """Migrates an instance to another node.

  The hypervisor is given the instance's name (not the instance
  object), together with the target and the live flag.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type target: string
  @param target: the target node name
  @type live: boolean
  @param live: whether the migration should be done live or not (the
      interpretation of this parameter is left to the hypervisor)
  @rtype: tuple
  @return: a tuple of (success, msg) where:
      - success is a boolean denoting the success/failure of the operation
      - msg is a string with details in case of failure

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)

  try:
    hyper.MigrateInstance(instance.name, target, live)
  except errors.HypervisorError as err:
    msg = "Failed to migrate instance"
    logging.exception(msg)
    return (False, "%s: %s" % (msg, err))
  else:
    return (True, "Migration successfull")


1080
def BlockdevCreate(disk, size, owner, on_primary, info):
  """Creates a block device for an instance.

  The children of the disk, if any, are assembled first (and opened
  when the device itself is going to be assembled on this node); then
  the device is created, optionally assembled and opened, and finally
  tagged with the given info.

  @type disk: L{objects.Disk}
  @param disk: the object describing the disk we should create
  @type size: int
  @param size: the size of the physical underlying device, in MiB
  @type owner: str
  @param owner: the name of the instance for which disk is created,
      used for device cache data
  @type on_primary: boolean
  @param on_primary:  indicates if it is the primary node or not
  @type info: string
  @param info: string that will be sent to the physical device
      creation, used for example to set (LVM) tags on LVs

  @rtype: tuple
  @return: a tuple of (status, payload): on success status is True and
      payload is the C{unique_id} of the new device (this can
      sometimes be computed only after creation); on failure status is
      False and payload is an error message

  """
  child_devices = []
  for child in disk.children or []:
    try:
      child_dev = _RecursiveAssembleBD(child, owner, on_primary)
    except errors.BlockDeviceError as err:
      msg = "Can't assemble device %s: %s" % (child, err)
      logging.error(msg)
      return False, msg

    if on_primary or disk.AssembleOnSecondary():
      # we need the children open in case the device itself has to
      # be assembled
      try:
        child_dev.Open()
      except errors.BlockDeviceError as err:
        msg = "Can't make child '%s' read-write: %s" % (child, err)
        logging.error(msg)
        return False, msg

    child_devices.append(child_dev)

  try:
    new_dev = bdev.Create(disk.dev_type, disk.physical_id, child_devices,
                          size)
  except errors.BlockDeviceError as err:
    return False, "Can't create block device: %s" % str(err)

  if on_primary or disk.AssembleOnSecondary():
    try:
      new_dev.Assemble()
    except errors.BlockDeviceError as err:
      msg = ("Can't assemble device after creation, very"
             " unusual event: %s" % str(err))
      logging.error(msg)
      return False, msg

    new_dev.SetSyncSpeed(constants.SYNC_SPEED)

    if on_primary or disk.OpenOnSecondary():
      try:
        new_dev.Open(force=True)
      except errors.BlockDeviceError as err:
        msg = ("Can't make device r/w after creation, very"
               " unusual event: %s" % str(err))
        logging.error(msg)
        return False, msg

    # Record the assembled device in the device cache
    DevCacheManager.UpdateCache(new_dev.dev_path, owner,
                                on_primary, disk.iv_name)

  new_dev.SetInfo(info)

  return True, new_dev.unique_id
Iustin Pop's avatar
Iustin Pop committed
1150
1151


1152
def BlockdevRemove(disk):
Iustin Pop's avatar
Iustin Pop committed
1153
1154
  """Remove a block device.