backend.py 80.8 KB
Newer Older
Iustin Pop's avatar
Iustin Pop committed
1
#
Iustin Pop's avatar
Iustin Pop committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#

# Copyright (C) 2006, 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


"""Functions used by the node daemon"""


import os
import os.path
import shutil
import time
import stat
import errno
import re
import subprocess
33
import random
34
import logging
35
import tempfile
36
37
import zlib
import base64
Iustin Pop's avatar
Iustin Pop committed
38
39
40
41
42
43
44
45

from ganeti import errors
from ganeti import utils
from ganeti import ssh
from ganeti import hypervisor
from ganeti import constants
from ganeti import bdev
from ganeti import objects
46
from ganeti import ssconf
Iustin Pop's avatar
Iustin Pop committed
47
48


Michael Hanselmann's avatar
Michael Hanselmann committed
49
def _GetConfig():
  """Build a fresh ssconf store object.

  @rtype: L{ssconf.SimpleStore}
  @return: a newly created SimpleStore instance

  """
  store = ssconf.SimpleStore()
  return store
Michael Hanselmann's avatar
Michael Hanselmann committed
57
58


59
def _GetSshRunner(cluster_name):
  """Build an SshRunner for the given cluster.

  @type cluster_name: str
  @param cluster_name: name of the cluster; it is handed unchanged to
      the SshRunner constructor
  @rtype: L{ssh.SshRunner}
  @return: a newly created SshRunner instance

  """
  runner = ssh.SshRunner(cluster_name)
  return runner
70
71


72
73
74
75
76
77
78
79
80
def _Decompress(data):
  """Decode a payload that the RPC client may have compressed.

  @type data: list or tuple
  @param data: two-element sequence of (encoding, content) as sent by
      the RPC client
  @rtype: str
  @return: the content itself when the encoding is
      C{constants.RPC_ENCODING_NONE}; the base64-decoded and then
      zlib-decompressed content when it is
      C{constants.RPC_ENCODING_ZLIB_BASE64}
  @raise AssertionError: if C{data} is not a two-element list/tuple or
      the encoding is not one of the two values above

  """
  assert isinstance(data, (list, tuple))
  assert len(data) == 2
  encoding = data[0]
  content = data[1]

  if encoding == constants.RPC_ENCODING_NONE:
    # payload was sent verbatim
    return content

  if encoding == constants.RPC_ENCODING_ZLIB_BASE64:
    packed = base64.b64decode(content)
    return zlib.decompress(packed)

  raise AssertionError("Unknown data encoding")


92
def _CleanDirectory(path, exclude=None):
  """Delete the regular files found directly inside a directory.

  Only the entries reported by C{utils.ListVisibleFiles} are examined;
  symlinks and anything that is not a regular file are left alone. If
  C{path} is not a directory the function does nothing.

  @type path: str
  @param path: the directory to clean
  @type exclude: list
  @param exclude: paths that must be kept; they are normalised and
      compared against the normalised full path of every entry.
      C{None} (the default) excludes nothing.

  """
  if not os.path.isdir(path):
    return

  # both sides of the comparison below use normalised paths
  protected = []
  if exclude is not None:
    for entry in exclude:
      protected.append(os.path.normpath(entry))

  for entry in utils.ListVisibleFiles(path):
    candidate = os.path.normpath(os.path.join(path, entry))
    if candidate in protected:
      continue
    if os.path.isfile(candidate) and not os.path.islink(candidate):
      utils.RemoveFile(candidate)


118
def JobQueuePurge():
  """Wipe the job queue directory and the job archive directory.

  The queue lock file is excluded from the cleanup of the queue
  directory; the archive directory is cleaned without exclusions.

  @rtype: None

  """
  kept_files = [constants.JOB_QUEUE_LOCK_FILE]
  _CleanDirectory(constants.QUEUE_DIR, exclude=kept_files)
  _CleanDirectory(constants.JOB_QUEUE_ARCHIVE_DIR)


128
129
130
131
132
133
134
def GetMasterInfo():
  """Collect the master-related settings from the configuration.

  This is a utility function to compute master information, either
  for consumption here or from the node daemon.

  @rtype: tuple
  @return: (master_netdev, master_ip, master_name) as read from the
      configuration store; (None, None, None) if reading it raises
      L{errors.ConfigurationError} (the error is logged)

  """
  try:
    store = _GetConfig()
    info = (store.GetMasterNetdev(),
            store.GetMasterIP(),
            store.GetMasterNode())
  except errors.ConfigurationError:
    logging.exception("Cluster configuration incomplete")
    info = (None, None, None)
  return info
148
149


150
def StartMaster(start_daemons):
  """Activate local node as master node.

  The master IP is first probed on the default node daemon port. If it
  answers and belongs to this node nothing is done; if it answers and
  belongs to someone else the activation is refused; if it does not
  answer the address is added to the master network device and
  advertised with arping. Independently of that, the master daemons
  are started when requested.

  @type start_daemons: boolean
  @param start_daemons: whether to also start the master
      daemons (ganeti-masterd and ganeti-rapi)
  @rtype: boolean
  @return: C{False} if the master network device is not known or if any
      of the steps failed, C{True} otherwise

  """
  master_netdev, master_ip, _ = GetMasterInfo()
  if not master_netdev:
    return False

  success = True

  if not utils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT):
    # nobody answers on the master IP, so bring it up locally
    add_cmd = ["ip", "address", "add", "%s/32" % master_ip,
               "dev", master_netdev, "label",
               "%s:0" % master_netdev]
    result = utils.RunCmd(add_cmd)
    if result.failed:
      logging.error("Can't activate master IP: %s", result.output)
      success = False

    # the exit code of arping is deliberately ignored
    utils.RunCmd(["arping", "-q", "-U", "-c 3", "-I", master_netdev,
                  "-s", master_ip, master_ip])
  elif utils.OwnIpAddress(master_ip):
    # we already have the ip:
    logging.debug("Already started")
  else:
    logging.error("Someone else has the master ip, not activating")
    success = False

  # and now start the master and rapi daemons
  if start_daemons:
    for daemon in ('ganeti-masterd', 'ganeti-rapi'):
      result = utils.RunCmd([daemon])
      if result.failed:
        logging.error("Can't start daemon %s: %s", daemon, result.output)
        success = False

  return success
Iustin Pop's avatar
Iustin Pop committed
195
196


197
def StopMaster(stop_daemons):
  """Deactivate this node as master.

  The master IP address is always removed from the master network
  device; a failure to do so is logged but otherwise ignored. The
  master daemons are additionally killed, using their pid files, when
  C{stop_daemons} is set.

  @type stop_daemons: boolean
  @param stop_daemons: whether to also stop the master daemons
      (ganeti-masterd and ganeti-rapi)
  @rtype: boolean
  @return: C{False} if the master network device is not known,
      C{True} otherwise

  """
  master_netdev, master_ip, _ = GetMasterInfo()
  if not master_netdev:
    return False

  del_cmd = ["ip", "address", "del", "%s/32" % master_ip,
             "dev", master_netdev]
  result = utils.RunCmd(del_cmd)
  if result.failed:
    # log it, but otherwise ignore the failure
    logging.error("Can't remove the master IP, error: %s", result.output)

  if stop_daemons:
    # stop/kill the rapi and the master daemon
    for daemon in (constants.RAPI_PID, constants.MASTERD_PID):
      pid_file = utils.DaemonPidFileName(daemon)
      pid = utils.ReadPidFile(pid_file)
      utils.KillProcess(pid)

  return True


Iustin Pop's avatar
Iustin Pop committed
228
def AddNode(dsa, dsapub, rsa, rsapub, sshkey, sshpub):
  """Joins this node to the cluster.

  This does the following:
      - updates the hostkeys of the machine (rsa and dsa)
      - adds the ssh private key to the user
      - adds the ssh public key to the users' authorized_keys file
      - restarts the ssh daemon through its init script

  @type dsa: str
  @param dsa: the DSA private key to write
  @type dsapub: str
  @param dsapub: the DSA public key to write
  @type rsa: str
  @param rsa: the RSA private key to write
  @type rsapub: str
  @param rsapub: the RSA public key to write
  @type sshkey: str
  @param sshkey: the SSH private key to write
  @type sshpub: str
  @param sshpub: the SSH public key to write
  @rtype: tuple
  @return: (success, message); success is C{False} only when the ssh
      files of the user could not be determined, in which case the
      message carries the error

  """
  # permission bits spelled with stat constants (0600 and 0644)
  owner_rw = stat.S_IRUSR | stat.S_IWUSR
  world_readable = owner_rw | stat.S_IRGRP | stat.S_IROTH

  sshd_keys = [(constants.SSH_HOST_RSA_PRIV, rsa, owner_rw),
               (constants.SSH_HOST_RSA_PUB, rsapub, world_readable),
               (constants.SSH_HOST_DSA_PRIV, dsa, owner_rw),
               (constants.SSH_HOST_DSA_PUB, dsapub, world_readable)]
  for name, content, mode in sshd_keys:
    utils.WriteFile(name, data=content, mode=mode)

  try:
    priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.GANETI_RUNAS,
                                                    mkdir=True)
  except errors.OpExecError as err:
    msg = "Error while processing user ssh files"
    logging.exception(msg)
    return (False, "%s: %s" % (msg, err))

  # both user key files are written owner-only, as before
  for name, content in [(priv_key, sshkey), (pub_key, sshpub)]:
    utils.WriteFile(name, data=content, mode=owner_rw)

  utils.AddAuthorizedKey(auth_keys, sshpub)

  result = utils.RunCmd([constants.SSH_INITD_SCRIPT, "restart"])
  if result.failed:
    # the restart stays best-effort (the return value is unchanged), but
    # a failure is no longer dropped silently
    logging.error("Can't restart the ssh daemon: %s", result.output)

  return (True, "Node added successfully")
Iustin Pop's avatar
Iustin Pop committed
275
276
277


def LeaveCluster():
  """Cleans up and remove the current node.

  The data directory and the job queue are emptied, the public key of
  the user is removed from the authorized keys file and both user key
  files are deleted.

  If the ssh files of the user cannot be determined the error is
  logged and the function returns without touching the keys.
  Otherwise it ends by raising L{errors.QuitGanetiException}, which is
  used as a special case to shutdown the node daemon.

  """
  _CleanDirectory(constants.DATA_DIR)
  JobQueuePurge()

  try:
    user_files = ssh.GetUserFiles(constants.GANETI_RUNAS)
  except errors.OpExecError:
    logging.exception("Error while processing ssh files")
    return
  priv_key, pub_key, auth_keys = user_files

  key_file = open(pub_key, 'r')
  try:
    # at most 8 KiB of the public key file is considered
    key_data = key_file.read(8192)
  finally:
    key_file.close()
  utils.RemoveAuthorizedKey(auth_keys, key_data)

  for key_path in (priv_key, pub_key):
    utils.RemoveFile(key_path)

  # Return a reassuring string to the caller, and quit
  raise errors.QuitGanetiException(False, 'Shutdown scheduled')

Iustin Pop's avatar
Iustin Pop committed
309

310
def GetNodeInfo(vgname, hypervisor_type):
  """Gives back a hash with different information about the node.

  @type vgname: C{string}
  @param vgname: the name of the volume group to ask for disk space
      information
  @type hypervisor_type: C{str}
  @param hypervisor_type: the name of the hypervisor to ask for
      memory information
  @rtype: C{dict}
  @return: dictionary with the following keys:
      - vg_size is the size of the configured volume group in MiB
      - vg_free is the free size of the volume group in MiB
      - bootid is the content of /proc/sys/kernel/random/boot_id
        without trailing newlines
      - whatever keys the hypervisor's GetNodeInfo() reports, if it
        returns anything (presumably memory_dom0, memory_free and
        memory_total in MiB; verify against the hypervisor module)

  """
  vginfo = _GetVGInfo(vgname)
  node_info = {
    'vg_size': vginfo['vg_size'],
    'vg_free': vginfo['vg_free'],
    }

  hyp_info = hypervisor.GetHypervisor(hypervisor_type).GetNodeInfo()
  if hyp_info is not None:
    node_info.update(hyp_info)

  boot_id_file = open("/proc/sys/kernel/random/boot_id", 'r')
  try:
    boot_id = boot_id_file.read(128)
  finally:
    boot_id_file.close()
  node_info["bootid"] = boot_id.rstrip("\n")

  return node_info


346
def VerifyNode(what, cluster_name):
  """Verify the status of the local node.

  Based on the input L{what} parameter, various checks are done on the
  local node. Each check is run only if its key is present in C{what}.

  @type what: C{dict}
  @param what: a dictionary of things to check, keyed by the
      C{constants.NV_*} values:
      - NV_HYPERVISOR: list of hypervisors to run Verify() for
      - NV_FILELIST: list of files for which to compute checksums
      - NV_NODELIST: list of nodes we should check ssh communication
        (and the reported hostname) with; the list is visited in random
        order and is not modified
      - NV_NODENETTEST: list of (name, primary_ip, secondary_ip) tuples
        of nodes we should check node daemon port connectivity with
      - NV_LVLIST: volume group whose logical volumes are listed
      - NV_INSTANCELIST: list of hypervisors whose instances are listed
      - NV_VGLIST: request for the list of volume groups
      - NV_VERSION: request for the protocol and release versions
      - NV_HVINFO: hypervisor whose node information is returned
      - NV_DRBDLIST: request for the list of used DRBD minors
  @type cluster_name: str
  @param cluster_name: the cluster name, needed for the ssh checks
  @rtype: dict
  @return: a dictionary with the same keys as the input dict, and
      values representing the result of the checks; for NV_NODELIST
      and NV_NODENETTEST only the failures are recorded

  """
  result = {}

  if constants.NV_HYPERVISOR in what:
    result[constants.NV_HYPERVISOR] = tmp = {}
    for hv_name in what[constants.NV_HYPERVISOR]:
      tmp[hv_name] = hypervisor.GetHypervisor(hv_name).Verify()

  if constants.NV_FILELIST in what:
    result[constants.NV_FILELIST] = utils.FingerprintFiles(
      what[constants.NV_FILELIST])

  if constants.NV_NODELIST in what:
    result[constants.NV_NODELIST] = tmp = {}
    # shuffle a private copy: random.shuffle works in place, and the
    # caller's list must not be reordered behind its back (this also
    # makes a tuple acceptable)
    nodes = list(what[constants.NV_NODELIST])
    random.shuffle(nodes)
    for node in nodes:
      success, message = _GetSshRunner(cluster_name).VerifyNodeHostname(node)
      if not success:
        tmp[node] = message

  if constants.NV_NODENETTEST in what:
    result[constants.NV_NODENETTEST] = tmp = {}
    my_name = utils.HostInfo().name
    my_pip = my_sip = None
    # find our own addresses, they are used as source of the pings
    for name, pip, sip in what[constants.NV_NODENETTEST]:
      if name == my_name:
        my_pip = pip
        my_sip = sip
        break
    if not my_pip:
      tmp[my_name] = ("Can't find my own primary/secondary IP"
                      " in the node list")
    else:
      port = utils.GetNodeDaemonPort()
      for name, pip, sip in what[constants.NV_NODENETTEST]:
        fail = []
        if not utils.TcpPing(pip, port, source=my_pip):
          fail.append("primary")
        # the secondary is only tested when it differs from the primary
        if sip != pip:
          if not utils.TcpPing(sip, port, source=my_sip):
            fail.append("secondary")
        if fail:
          tmp[name] = ("failure using the %s interface(s)" %
                       " and ".join(fail))

  if constants.NV_LVLIST in what:
    result[constants.NV_LVLIST] = GetVolumeList(what[constants.NV_LVLIST])

  if constants.NV_INSTANCELIST in what:
    result[constants.NV_INSTANCELIST] = GetInstanceList(
      what[constants.NV_INSTANCELIST])

  if constants.NV_VGLIST in what:
    result[constants.NV_VGLIST] = ListVolumeGroups()

  if constants.NV_VERSION in what:
    result[constants.NV_VERSION] = (constants.PROTOCOL_VERSION,
                                    constants.RELEASE_VERSION)

  if constants.NV_HVINFO in what:
    hyper = hypervisor.GetHypervisor(what[constants.NV_HVINFO])
    result[constants.NV_HVINFO] = hyper.GetNodeInfo()

  if constants.NV_DRBDLIST in what:
    try:
      used_minors = bdev.DRBD8.GetUsedDevs().keys()
    except errors.BlockDeviceError as err:
      # on failure the error text is reported in place of the list
      logging.warning("Can't get used minors list", exc_info=True)
      used_minors = str(err)
    result[constants.NV_DRBDLIST] = used_minors

  return result


def GetVolumeList(vg_name):
  """Compute list of logical volumes and their size.

  @type vg_name: str
  @param vg_name: the volume group whose LVs we should list
  @rtype: dict
  @return:
      dictionary of all partitions (key) with value being a tuple of
      their size (in MiB, as the string printed by lvs), inactive and
      online status::

        {'test1': ('20.06', True, True)}

      in case of errors, a string is returned with the error
      details.

  """
  lvs = {}
  result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
                         "--separator=|",
                         "-olv_name,lv_size,lv_attr", vg_name])
  if result.failed:
    logging.error("Failed to list logical volumes, lvs output: %s",
                  result.output)
    return result.output

  # Raw string so that "\|" is a regex escape and not a (deprecated)
  # string escape. lv_attr is matched as six *or more* characters: newer
  # LVM releases append further attribute flags, while the two
  # positions inspected below keep their meaning.
  valid_line_re = re.compile(r"^ *([^|]+)\|([0-9.]+)\|([^|]{6,})\|?$")
  for line in result.stdout.splitlines():
    line = line.strip()
    match = valid_line_re.match(line)
    if not match:
      logging.error("Invalid line returned from lvs output: '%s'", line)
      continue
    name, size, attr = match.groups()
    # fifth attribute character: state; sixth: device open
    inactive = attr[4] == '-'
    online = attr[5] == 'o'
    lvs[name] = (size, inactive, online)

  return lvs
Iustin Pop's avatar
Iustin Pop committed
487
488
489


def ListVolumeGroups():
  """Report the volume groups of this node and their size.

  The work is delegated entirely to C{utils.ListVolumeGroups}.

  @rtype: dict
  @return: dictionary with keys volume name and values the
      size of the volume

  """
  vg_sizes = utils.ListVolumeGroups()
  return vg_sizes


500
501
502
def NodeVolumes():
  """List all volumes on this node.

  @rtype: list
  @return:
    A list of dictionaries, each having four keys:
      - name: the logical volume name,
      - size: the size of the logical volume
      - dev: the physical device on which the LV lives (the text of
        the devices column up to the first opening parenthesis)
      - vg: the volume group to which it belongs

    In case of errors, we return an empty list and log the
    error.

    Note that since a logical volume can live on multiple physical
    volumes, the resulting list might include a logical volume
    multiple times.

  """
  result = utils.RunCmd(["lvs", "--noheadings", "--units=m", "--nosuffix",
                         "--separator=|",
                         "--options=lv_name,lv_size,devices,vg_name"])
  if result.failed:
    logging.error("Failed to list logical volumes, lvs output: %s",
                  result.output)
    return []

  volumes = []
  for line in result.stdout.splitlines():
    # lines without at least four fields are silently skipped
    if line.count('|') < 3:
      continue
    fields = line.split('|')

    device = fields[2].strip()
    if '(' in device:
      device = device.split('(')[0]

    volumes.append({
      'name': fields[0].strip(),
      'size': fields[1].strip(),
      'dev': device,
      'vg': fields[3].strip(),
      })

  return volumes
543
544


Iustin Pop's avatar
Iustin Pop committed
545
def BridgesExist(bridges_list):
  """Check if a list of bridges exist on the current node.

  The bridges are checked in order with C{utils.BridgeExists} and the
  check stops at the first one that is missing.

  @type bridges_list: list
  @param bridges_list: the names of the bridges to check
  @rtype: boolean
  @return: C{True} if all of them exist (including the case of an
      empty list), C{False} otherwise

  """
  all_present = True
  for name in bridges_list:
    if not utils.BridgeExists(name):
      all_present = False
      break

  return all_present


559
def GetInstanceList(hypervisor_list):
  """Provides a list of instances.

  @type hypervisor_list: list
  @param hypervisor_list: the list of hypervisors to query information

  @rtype: list
  @return: the concatenation of what each hypervisor's ListInstances()
      reports, in the order of C{hypervisor_list}, e.g.:
    - instance1.example.com
    - instance2.example.com
  @raise errors.HypervisorError: re-raised (after being logged) if a
      hypervisor fails to enumerate its instances

  """
  instances = []
  for hname in hypervisor_list:
    try:
      instances += hypervisor.GetHypervisor(hname).ListInstances()
    except errors.HypervisorError:
      logging.exception("Error enumerating instances for hypevisor %s", hname)
      raise

  return instances
Iustin Pop's avatar
Iustin Pop committed
581
582


583
def GetInstanceInfo(instance, hname):
  """Gives back the information about an instance as a dictionary.

  @type instance: string
  @param instance: the instance name
  @type hname: string
  @param hname: the hypervisor type of the instance

  @rtype: dict
  @return: an empty dictionary if the hypervisor has no information
      about the instance, otherwise a dictionary with the following
      keys (taken from positions 2, 4 and 5 of the hypervisor answer):
      - memory: memory size of instance (int)
      - state: xen state of instance (string)
      - time: cpu time of instance (float)

  """
  iinfo = hypervisor.GetHypervisor(hname).GetInstanceInfo(instance)
  if iinfo is None:
    return {}

  return {
    'memory': iinfo[2],
    'state': iinfo[4],
    'time': iinfo[5],
    }


609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
def GetInstanceMigratable(instance):
  """Gives whether an instance can be migrated.

  An instance is considered migratable when its hypervisor lists it
  and a symlink exists for each of its disks.

  @type instance: L{objects.Instance}
  @param instance: object representing the instance to be checked.

  @rtype: tuple
  @return: tuple of (result, description) where:
      - result: whether the instance can be migrated or not
      - description: a description of the issue, if relevant

  """
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  running = hyper.ListInstances()
  if instance.name not in running:
    return (False, 'not running')

  for idx, _ in enumerate(instance.disks):
    link_name = _GetBlockDevSymlinkPath(instance.name, idx)
    if os.path.islink(link_name):
      continue
    # a missing disk symlink is reported with the message below
    return (False, 'not restarted since ganeti 1.2.5')

  return (True, '')


633
def GetAllInstancesInfo(hypervisor_list):
  """Gather data about all instances.

  This is the equivalent of L{GetInstanceInfo}, except that it
  computes data for all instances at once, thus being faster if one
  needs data about more than one instance.

  @type hypervisor_list: list
  @param hypervisor_list: list of hypervisors to query for instance data

  @rtype: dict
  @return: dictionary of instance: data, with data having the following keys:
      - memory: memory size of instance (int)
      - state: xen state of instance (string)
      - time: cpu time of instance (float)
      - vcpus: the number of vcpus
  @raise errors.HypervisorError: if the same instance name is reported
      more than once with different memory or vcpus values

  """
  all_info = {}

  for hname in hypervisor_list:
    hv_data = hypervisor.GetHypervisor(hname).GetAllInstancesInfo()
    if not hv_data:
      continue

    for name, _, memory, vcpus, state, times in hv_data:
      if name in all_info:
        # we only check static parameters, like memory and vcpus,
        # and not state and time which can change between the
        # invocations of the different hypervisors
        known = all_info[name]
        if memory != known['memory'] or vcpus != known['vcpus']:
          raise errors.HypervisorError("Instance %s is running twice"
                                       " with different parameters" % name)
      all_info[name] = {
        'memory': memory,
        'vcpus': vcpus,
        'state': state,
        'time': times,
        }

  return all_info


676
def InstanceOsAdd(instance):
  """Add an OS to an instance.

  The create script of the instance's OS is run from the OS directory,
  with its output sent to a per-run log file under
  C{constants.LOG_OS_DIR}.

  @type instance: L{objects.Instance}
  @param instance: Instance whose OS is to be installed
  @rtype: tuple
  @return: (success, message); on failure the message describes the
      OS lookup error, or carries the last 20 lines of the log file of
      the create script

  """
  try:
    inst_os = OSFromDisk(instance.os)
  except errors.InvalidOS as err:
    os_name, os_dir, os_err = err.args
    if os_dir is not None:
      return (False, "Error parsing OS '%s' in directory %s: %s" %
              (os_name, os_dir, os_err))
    return (False, "Can't find OS '%s': %s" % (os_name, os_err))

  create_env = OSEnvironment(instance)

  timestamp = int(time.time())
  logfile = "%s/add-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
                                     instance.name, timestamp)

  result = utils.RunCmd([inst_os.create_script], env=create_env,
                        cwd=inst_os.path, output=logfile)
  if not result.failed:
    return (True, "Successfully installed")

  logging.error("os create command '%s' returned error: %s, logfile: %s,"
                " output: %s", result.cmd, result.fail_reason, logfile,
                result.output)
  tail = []
  for val in utils.TailFile(logfile, lines=20):
    tail.append(utils.SafeEncode(val))
  return (False, "OS create script failed (%s), last lines in the"
          " log file:\n%s" % (result.fail_reason, "\n".join(tail)))
712
713


714
def RunRenameInstance(instance, old_name):
  """Run the OS rename script for an instance.

  The rename script of the instance's OS is run from the OS directory
  with C{OLD_INSTANCE_NAME} added to its environment; its output goes
  to a per-run log file under C{constants.LOG_OS_DIR}.

  Note that, unlike in L{InstanceOsAdd}, an exception raised by
  C{OSFromDisk} is not caught here and propagates to the caller.

  @type instance: L{objects.Instance}
  @param instance: Instance whose OS rename script is to be run
  @type old_name: string
  @param old_name: previous instance name
  @rtype: tuple
  @return: (success, message); on failure the message carries the
      last 20 lines of the log file of the rename script

  """
  inst_os = OSFromDisk(instance.os)

  rename_env = OSEnvironment(instance)
  rename_env['OLD_INSTANCE_NAME'] = old_name

  logfile = "%s/rename-%s-%s-%s-%d.log" % (constants.LOG_OS_DIR, instance.os,
                                           old_name,
                                           instance.name, int(time.time()))

  result = utils.RunCmd([inst_os.rename_script], env=rename_env,
                        cwd=inst_os.path, output=logfile)

  if result.failed:
    # the message used to say "os create command", copied from the
    # install path; name the right script and point at the log file
    logging.error("os rename command '%s' returned error: %s, logfile: %s,"
                  " output: %s", result.cmd, result.fail_reason, logfile,
                  result.output)
    lines = [utils.SafeEncode(val)
             for val in utils.TailFile(logfile, lines=20)]
    return (False, "OS rename script failed (%s), last lines in the"
            " log file:\n%s" % (result.fail_reason, "\n".join(lines)))

  return (True, "Rename successful")
Iustin Pop's avatar
Iustin Pop committed
746
747
748
749
750


def _GetVGInfo(vg_name):
  """Get information about the volume group.

  @type vg_name: str
  @param vg_name: the volume group which we query
  @rtype: dict
  @return:
    A dictionary with the following keys:
      - C{vg_size} is the total size of the volume group in MiB
      - C{vg_free} is the free size of the volume group in MiB
      - C{pv_count} are the number of physical disks in that VG

    If an error occurs during gathering of data (the vgs command
    fails, or its output cannot be parsed), we return the same dict
    with keys all set to None.

  """
  no_data = {"vg_size": None, "vg_free": None, "pv_count": None}

  cmd = ["vgs", "-ovg_size,vg_free,pv_count", "--noheadings",
         "--nosuffix", "--units=m", "--separator=:", vg_name]
  retval = utils.RunCmd(cmd)
  if retval.failed:
    logging.error("volume group %s not present", vg_name)
    return no_data

  # drop surrounding whitespace and any trailing separator first
  fields = retval.stdout.strip().rstrip(':').split(':')
  if len(fields) != 3:
    logging.error("vgs output has the wrong number of fields (expected"
                  " three): %s", str(fields))
    return no_data

  try:
    # sizes are printed as decimals, round them to whole MiB
    vg_size = int(round(float(fields[0]), 0))
    vg_free = int(round(float(fields[1]), 0))
    pv_count = int(fields[2])
  except ValueError:
    logging.exception("Fail to parse vgs output")
    return no_data

  return {
    "vg_size": vg_size,
    "vg_free": vg_free,
    "pv_count": pv_count,
    }


788
789
790
791
792
793
def _GetBlockDevSymlinkPath(instance_name, idx):
  """Compute the path of the symlink for one disk of an instance.

  @param instance_name: the name of the instance
  @param idx: the disk index (an integer, formatted with C{%d})
  @return: the path C{<instance_name>:<idx>} inside
      C{constants.DISK_LINKS_DIR}

  """
  link_basename = "%s:%d" % (instance_name, idx)
  return os.path.join(constants.DISK_LINKS_DIR, link_basename)


def _SymlinkBlockDev(instance_name, device_path, idx):
794
795
796
797
798
799
  """Set up symlinks to a instance's block device.

  This is an auxiliary function run when an instance is start (on the primary
  node) or when an instance is migrated (on the target node).


800
801
802
803
  @param instance_name: the name of the target instance
  @param device_path: path of the physical block device, on the node
  @param idx: the disk index
  @return: absolute path to the disk's symlink
804
805

  """
806
  link_name = _GetBlockDevSymlinkPath(instance_name, idx)
807
808
  try:
    os.symlink(device_path, link_name)
809
810
  except OSError, err:
    if err.errno == errno.EEXIST:
811
812
813
814
815
816
817
818
819
820
      if (not os.path.islink(link_name) or
          os.readlink(link_name) != device_path):
        os.remove(link_name)
        os.symlink(device_path, link_name)
    else:
      raise

  return link_name


821
def _RemoveBlockDevLinks(instance_name, disks):
  """Delete the block device symlinks of the given instance.

  Only the position of each disk in C{disks} is used, to compute the
  symlink path. Paths that are not symlinks are left alone, and
  removal errors are logged instead of being raised.

  @param instance_name: the name of the instance owning the symlinks
  @param disks: the instance's disks

  """
  for idx, dummy in enumerate(disks):
    symlink_path = _GetBlockDevSymlinkPath(instance_name, idx)
    if not os.path.islink(symlink_path):
      continue
    try:
      os.remove(symlink_path)
    except OSError:
      logging.exception("Can't remove symlink '%s'", symlink_path)
Iustin Pop's avatar
Iustin Pop committed
832
833


834
def _GatherAndLinkBlockDevs(instance):
Iustin Pop's avatar
Iustin Pop committed
835
836
837
838
839
  """Set up an instance's block device(s).

  This is run on the primary node at instance startup. The block
  devices must be already assembled.

Iustin Pop's avatar
Iustin Pop committed
840
841
  @type instance: L{objects.Instance}
  @param instance: the instance whose disks we shoul assemble
842
843
  @rtype: list
  @return: list of (disk_object, device_path)
Iustin Pop's avatar
Iustin Pop committed
844

Iustin Pop's avatar
Iustin Pop committed
845
846
  """
  block_devices = []
847
  for idx, disk in enumerate(instance.disks):
Iustin Pop's avatar
Iustin Pop committed
848
849
850
851
852
    device = _RecursiveFindBD(disk)
    if device is None:
      raise errors.BlockDeviceError("Block device '%s' is not set up." %
                                    str(disk))
    device.Open()
853
    try:
854
      link_name = _SymlinkBlockDev(instance.name, device.dev_path, idx)
855
856
857
858
859
860
    except OSError, e:
      raise errors.BlockDeviceError("Cannot create block device symlink: %s" %
                                    e.strerror)

    block_devices.append((disk, link_name))

Iustin Pop's avatar
Iustin Pop committed
861
862
863
  return block_devices


864
def StartInstance(instance):
Iustin Pop's avatar
Iustin Pop committed
865
866
  """Start an instance.

Iustin Pop's avatar
Iustin Pop committed
867
  @type instance: L{objects.Instance}
868
869
870
  @param instance: the instance object
  @rtype: boolean
  @return: whether the startup was successful or not
Iustin Pop's avatar
Iustin Pop committed
871

872
  """
873
  running_instances = GetInstanceList([instance.hypervisor])
Iustin Pop's avatar
Iustin Pop committed
874
875

  if instance.name in running_instances:
876
    return (True, "Already running")
Iustin Pop's avatar
Iustin Pop committed
877
878

  try:
879
880
    block_devices = _GatherAndLinkBlockDevs(instance)
    hyper = hypervisor.GetHypervisor(instance.hypervisor)
881
    hyper.StartInstance(instance, block_devices)
882
883
  except errors.BlockDeviceError, err:
    logging.exception("Failed to start instance")
884
    return (False, "Block device error: %s" % str(err))
Iustin Pop's avatar
Iustin Pop committed
885
  except errors.HypervisorError, err:
886
    logging.exception("Failed to start instance")
887
    _RemoveBlockDevLinks(instance.name, instance.disks)
888
    return (False, "Hypervisor error: %s" % str(err))
Iustin Pop's avatar
Iustin Pop committed
889

890
  return (True, "Instance started successfully")
Iustin Pop's avatar
Iustin Pop committed
891
892


893
def InstanceShutdown(instance):
Iustin Pop's avatar
Iustin Pop committed
894
895
  """Shut an instance down.

Iustin Pop's avatar
Iustin Pop committed
896
897
898
  @note: this functions uses polling with a hardcoded timeout.

  @type instance: L{objects.Instance}
899
900
901
  @param instance: the instance object
  @rtype: boolean
  @return: whether the startup was successful or not
Iustin Pop's avatar
Iustin Pop committed
902

903
  """
904
905
  hv_name = instance.hypervisor
  running_instances = GetInstanceList([hv_name])
Iustin Pop's avatar
Iustin Pop committed
906
907

  if instance.name not in running_instances:
908
    return (True, "Instance already stopped")
Iustin Pop's avatar
Iustin Pop committed
909

910
  hyper = hypervisor.GetHypervisor(hv_name)
Iustin Pop's avatar
Iustin Pop committed
911
912
913
  try:
    hyper.StopInstance(instance)
  except errors.HypervisorError, err:
914
915
916
    msg = "Failed to stop instance %s: %s" % (instance.name, err)
    logging.error(msg)
    return (False, msg)
Iustin Pop's avatar
Iustin Pop committed
917
918
919
920
921

  # test every 10secs for 2min

  time.sleep(1)
  for dummy in range(11):
922
    if instance.name not in GetInstanceList([hv_name]):
Iustin Pop's avatar
Iustin Pop committed
923
924
925
926
      break
    time.sleep(10)
  else:
    # the shutdown did not succeed
927
928
    logging.error("Shutdown of '%s' unsuccessful, using destroy",
                  instance.name)
Iustin Pop's avatar
Iustin Pop committed
929
930
931
932

    try:
      hyper.StopInstance(instance, force=True)
    except errors.HypervisorError, err:
933
934
935
      msg = "Failed to force stop instance %s: %s" % (instance.name, err)
      logging.error(msg)
      return (False, msg)
Iustin Pop's avatar
Iustin Pop committed
936
937

    time.sleep(1)
938
    if instance.name in GetInstanceList([hv_name]):
939
940
941
942
      msg = ("Could not shutdown instance %s even by destroy" %
             instance.name)
      logging.error(msg)
      return (False, msg)
Iustin Pop's avatar
Iustin Pop committed
943

944
  _RemoveBlockDevLinks(instance.name, instance.disks)
Iustin Pop's avatar
Iustin Pop committed
945

946
  return (True, "Instance has been shutdown successfully")
Iustin Pop's avatar
Iustin Pop committed
947
948


949
def InstanceReboot(instance, reboot_type):
950
951
  """Reboot an instance.

Iustin Pop's avatar
Iustin Pop committed
952
953
954
955
956
957
958
959
960
  @type instance: L{objects.Instance}
  @param instance: the instance object to reboot
  @type reboot_type: str
  @param reboot_type: the type of reboot, one the following
    constants:
      - L{constants.INSTANCE_REBOOT_SOFT}: only reboot the
        instance OS, do not recreate the VM
      - L{constants.INSTANCE_REBOOT_HARD}: tear down and
        restart the VM (at the hypervisor level)
961
962
963
964
      - the other reboot type (L{constants.INSTANCE_REBOOT_FULL}) is
        not accepted here, since that mode is handled differently, in
        cmdlib, and translates into full stop and start of the
        instance (instead of a call_instance_reboot RPC)
Iustin Pop's avatar
Iustin Pop committed
965
966
  @rtype: boolean
  @return: the success of the operation
967
968

  """
969
  running_instances = GetInstanceList([instance.hypervisor])
970
971

  if instance.name not in running_instances:
972
973
974
    msg = "Cannot reboot instance %s that is not running" % instance.name
    logging.error(msg)
    return (False, msg)
975

976
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
977
978
979
980
  if reboot_type == constants.INSTANCE_REBOOT_SOFT:
    try:
      hyper.RebootInstance(instance)
    except errors.HypervisorError, err:
981
982
983
      msg = "Failed to soft reboot instance %s: %s" % (instance.name, err)
      logging.error(msg)
      return (False, msg)
984
985
  elif reboot_type == constants.INSTANCE_REBOOT_HARD:
    try:
Iustin Pop's avatar
Iustin Pop committed
986
987
988
      stop_result = InstanceShutdown(instance)
      if not stop_result[0]:
        return stop_result
989
      return StartInstance(instance)
990
    except errors.HypervisorError, err:
991
992
993
      msg = "Failed to hard reboot instance %s: %s" % (instance.name, err)
      logging.error(msg)
      return (False, msg)
994
  else:
995
    return (False, "Invalid reboot_type received: %s" % (reboot_type,))
996

997
  return (True, "Reboot successful")
998
999


1000
1001
1002
1003
1004
1005
1006
def MigrationInfo(instance):
  """Gather information about an instance to be migrated.

  @type instance: L{objects.Instance}
  @param instance: the instance definition

  """
1007
1008
1009
1010
1011
1012
1013
1014
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    info = hyper.MigrationInfo(instance)
  except errors.HypervisorError, err:
    msg = "Failed to fetch migration information"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
  return (True, info)
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027


def AcceptInstance(instance, info, target):
  """Prepare the node to accept an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type target: string
  @param target: target host (usually ip), on this node

  """
1028
1029
1030
1031
1032
1033
1034
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.AcceptInstance(instance, info, target)
  except errors.HypervisorError, err:
    msg = "Failed to accept instance"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
  return (True, "Accept successfull")


def FinalizeMigration(instance, info, success):
  """Finalize any preparation to accept an instance.

  @type instance: L{objects.Instance}
  @param instance: the instance definition
  @type info: string/data (opaque)
  @param info: migration information, from the source node
  @type success: boolean
  @param success: whether the migration was a success or a failure

  """
1049
1050
1051
1052
1053
1054
1055
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
  try:
    hyper.FinalizeMigration(instance, info, success)
  except errors.HypervisorError, err:
    msg = "Failed to finalize migration"
    logging.exception(msg)
    return (False, '%s: %s' % (msg, err))
1056
1057
1058
  return (True, "Migration Finalized")


1059
1060
1061
def MigrateInstance(instance, target, live):
  """Migrates an instance to another node.

Iustin Pop's avatar
Iustin Pop committed
1062
  @type instance: L{objects.Instance}
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
  @param instance: the instance definition
  @type target: string
  @param target: the target node name
  @type live: boolean
  @param live: whether the migration should be done live or not (the
      interpretation of this parameter is left to the hypervisor)
  @rtype: tuple
  @return: a tuple of (success, msg) where:
      - succes is a boolean denoting the success/failure of the operation
      - msg is a string with details in case of failure

1074
  """
1075
  hyper = hypervisor.GetHypervisor(instance.hypervisor)
1076
1077

  try:
1078
    hyper.MigrateInstance(instance.name, target, live)
1079
  except errors.HypervisorError, err:
1080
1081
1082
    msg = "Failed to migrate instance"
    logging.exception(msg)
    return (False, "%s: %s" % (msg, err))
1083
1084
1085
  return (True, "Migration successfull")


1086
def BlockdevCreate(disk, size, owner, on_primary, info):
Iustin Pop's avatar
Iustin Pop committed
1087
1088
  """Creates a block device for an instance.

Iustin Pop's avatar
Iustin Pop committed
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
  @type disk: L{objects.Disk}
  @param disk: the object describing the disk we should create
  @type size: int
  @param size: the size of the physical underlying device, in MiB
  @type owner: str
  @param owner: the name of the instance for which disk is created,
      used for device cache data
  @type on_primary: boolean
  @param on_primary:  indicates if it is the primary node or not
  @type info: string
  @param info: string that will be sent to the physical device
      creation, used for example to set (LVM) tags on LVs

  @return: the new unique_id of the device (this can sometime be
      computed only after creation), or None. On secondary nodes,
      it's not required to return anything.
Iustin Pop's avatar
Iustin Pop committed
1105
1106
1107
1108
1109

  """
  clist = []
  if disk.children:
    for child in disk.children:
1110
1111
1112
1113
1114
1115
      try:
        crdev = _RecursiveAssembleBD(child, owner, on_primary)
      except errors.BlockDeviceError, err:
        errmsg = "Can't assemble device %s: %s" % (child, err)
        logging.error(errmsg)
        return False, errmsg
Iustin Pop's avatar
Iustin Pop committed
1116
1117
1118
      if on_primary or disk.AssembleOnSecondary():
        # we need the children open in case the device itself has to
        # be assembled
1119
1120
1121
        try:
          crdev.Open()
        except errors.BlockDeviceError, err:
Iustin Pop's avatar
Iustin Pop committed
1122
          errmsg = "Can't make child '%s' read-write: %s" % (child, err)
1123
1124
          logging.error(errmsg)
          return False, errmsg
Iustin Pop's avatar
Iustin Pop committed
1125
1126
      clist.append(crdev)

1127
  try:
Iustin Pop's avatar
Iustin Pop committed
1128
    device = bdev.Create(disk.dev_type, disk.physical_id, clist, disk.size)
1129
  except errors.BlockDeviceError, err:
1130
    return False, "Can't create block device: %s" % str(err)
Iustin Pop's avatar
Iustin Pop committed
1131

Iustin Pop's avatar
Iustin Pop committed
1132
  if on_primary or disk.AssembleOnSecondary():
1133
1134
1135
1136
1137
1138
1139
    try:
      device.Assemble()
    except errors.BlockDeviceError, err:
      errmsg = ("Can't assemble device after creation, very"
                " unusual event: %s" % str(err))
      logging.error(errmsg)
      return False, errmsg
1140
    device.SetSyncSpeed(constants.SYNC_SPEED)
Iustin Pop's avatar
Iustin Pop committed
1141
    if on_primary or disk.OpenOnSecondary():
1142
1143
1144
1145
1146
1147
1148
      try:
        device.Open(force=True)
      except errors.BlockDeviceError, err:
        errmsg = ("Can't make device r/w after creation, very"
                  " unusual event: %s" % str(err))
        logging.error(errmsg)
        return False, errmsg
1149
1150
    DevCacheManager.UpdateCache(device.dev_path, owner,
                                on_primary, disk.iv_name)
1151
1152
1153

  device.SetInfo(info)

Iustin Pop's avatar
Iustin Pop committed