summaryrefslogtreecommitdiffstats
path: root/nova/compute
diff options
context:
space:
mode:
authorEd Leafe <ed@leafe.com>2011-08-02 19:30:03 +0000
committerEd Leafe <ed@leafe.com>2011-08-02 19:30:03 +0000
commit200b6b980aada1d38014e620e025ee61c38915da (patch)
tree216b4ba5675e94c10451911c6b51ec4af234583a /nova/compute
parent0bd6bf4a791e03e2c1ad1715aeae3e4413705414 (diff)
parent65ba8bda43aa79080f6fec9c396f412c294718b8 (diff)
Merged trunk
Diffstat (limited to 'nova/compute')
-rw-r--r--nova/compute/monitor.py435
1 files changed, 0 insertions, 435 deletions
diff --git a/nova/compute/monitor.py b/nova/compute/monitor.py
deleted file mode 100644
index 9d8e2a25d..000000000
--- a/nova/compute/monitor.py
+++ /dev/null
@@ -1,435 +0,0 @@
-# vim: tabstop=4 shiftwidth=4 softtabstop=4
-
-# Copyright 2010 United States Government as represented by the
-# Administrator of the National Aeronautics and Space Administration.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-"""
-Instance Monitoring:
-
- Optionally may be run on each compute node. Provides RRD
- based statistics and graphs and makes them internally available
- in the object store.
-"""
-
-import datetime
-import os
-import time
-
-import boto
-import boto.s3
-import rrdtool
-from twisted.internet import task
-from twisted.application import service
-
-from nova import flags
-from nova import log as logging
-from nova import utils
-from nova.virt import connection as virt_connection
-
-
-FLAGS = flags.FLAGS
-flags.DEFINE_integer('monitoring_instances_delay', 5,
- 'Sleep time between updates')
-flags.DEFINE_integer('monitoring_instances_step', 300,
- 'Interval of RRD updates')
-flags.DEFINE_string('monitoring_rrd_path', '$state_path/monitor/instances',
- 'Location of RRD files')
-
-
-RRD_VALUES = {
- 'cpu': [
- 'DS:cpu:GAUGE:600:0:100',
- 'RRA:AVERAGE:0.5:1:800',
- 'RRA:AVERAGE:0.5:6:800',
- 'RRA:AVERAGE:0.5:24:800',
- 'RRA:AVERAGE:0.5:288:800',
- 'RRA:MAX:0.5:1:800',
- 'RRA:MAX:0.5:6:800',
- 'RRA:MAX:0.5:24:800',
- 'RRA:MAX:0.5:288:800',
- ],
- 'net': [
- 'DS:rx:COUNTER:600:0:1250000',
- 'DS:tx:COUNTER:600:0:1250000',
- 'RRA:AVERAGE:0.5:1:800',
- 'RRA:AVERAGE:0.5:6:800',
- 'RRA:AVERAGE:0.5:24:800',
- 'RRA:AVERAGE:0.5:288:800',
- 'RRA:MAX:0.5:1:800',
- 'RRA:MAX:0.5:6:800',
- 'RRA:MAX:0.5:24:800',
- 'RRA:MAX:0.5:288:800',
- ],
- 'disk': [
- 'DS:rd:COUNTER:600:U:U',
- 'DS:wr:COUNTER:600:U:U',
- 'RRA:AVERAGE:0.5:1:800',
- 'RRA:AVERAGE:0.5:6:800',
- 'RRA:AVERAGE:0.5:24:800',
- 'RRA:AVERAGE:0.5:288:800',
- 'RRA:MAX:0.5:1:800',
- 'RRA:MAX:0.5:6:800',
- 'RRA:MAX:0.5:24:800',
- 'RRA:MAX:0.5:444:800',
- ]}
-
-
-utcnow = utils.utcnow
-
-
-LOG = logging.getLogger('nova.compute.monitor')
-
-
-def update_rrd(instance, name, data):
- """
- Updates the specified RRD file.
- """
- filename = os.path.join(instance.get_rrd_path(), '%s.rrd' % name)
-
- if not os.path.exists(filename):
- init_rrd(instance, name)
-
- timestamp = int(time.mktime(utcnow().timetuple()))
- rrdtool.update(filename, '%d:%s' % (timestamp, data))
-
-
-def init_rrd(instance, name):
- """
- Initializes the specified RRD file.
- """
- path = os.path.join(FLAGS.monitoring_rrd_path, instance.instance_id)
-
- if not os.path.exists(path):
- os.makedirs(path)
-
- filename = os.path.join(path, '%s.rrd' % name)
-
- if not os.path.exists(filename):
- rrdtool.create(
- filename,
- '--step', '%d' % FLAGS.monitoring_instances_step,
- '--start', '0',
- *RRD_VALUES[name])
-
-
-def graph_cpu(instance, duration):
- """
- Creates a graph of cpu usage for the specified instance and duration.
- """
- path = instance.get_rrd_path()
- filename = os.path.join(path, 'cpu-%s.png' % duration)
-
- rrdtool.graph(
- filename,
- '--disable-rrdtool-tag',
- '--imgformat', 'PNG',
- '--width', '400',
- '--height', '120',
- '--start', 'now-%s' % duration,
- '--vertical-label', '% cpu used',
- '-l', '0',
- '-u', '100',
- 'DEF:cpu=%s:cpu:AVERAGE' % os.path.join(path, 'cpu.rrd'),
- 'AREA:cpu#eacc00:% CPU',)
-
- store_graph(instance.instance_id, filename)
-
-
-def graph_net(instance, duration):
- """
- Creates a graph of network usage for the specified instance and duration.
- """
- path = instance.get_rrd_path()
- filename = os.path.join(path, 'net-%s.png' % duration)
-
- rrdtool.graph(
- filename,
- '--disable-rrdtool-tag',
- '--imgformat', 'PNG',
- '--width', '400',
- '--height', '120',
- '--start', 'now-%s' % duration,
- '--vertical-label', 'bytes/s',
- '--logarithmic',
- '--units', 'si',
- '--lower-limit', '1000',
- '--rigid',
- 'DEF:rx=%s:rx:AVERAGE' % os.path.join(path, 'net.rrd'),
- 'DEF:tx=%s:tx:AVERAGE' % os.path.join(path, 'net.rrd'),
- 'AREA:rx#00FF00:In traffic',
- 'LINE1:tx#0000FF:Out traffic',)
-
- store_graph(instance.instance_id, filename)
-
-
-def graph_disk(instance, duration):
- """
- Creates a graph of disk usage for the specified duration.
- """
- path = instance.get_rrd_path()
- filename = os.path.join(path, 'disk-%s.png' % duration)
-
- rrdtool.graph(
- filename,
- '--disable-rrdtool-tag',
- '--imgformat', 'PNG',
- '--width', '400',
- '--height', '120',
- '--start', 'now-%s' % duration,
- '--vertical-label', 'bytes/s',
- '--logarithmic',
- '--units', 'si',
- '--lower-limit', '1000',
- '--rigid',
- 'DEF:rd=%s:rd:AVERAGE' % os.path.join(path, 'disk.rrd'),
- 'DEF:wr=%s:wr:AVERAGE' % os.path.join(path, 'disk.rrd'),
- 'AREA:rd#00FF00:Read',
- 'LINE1:wr#0000FF:Write',)
-
- store_graph(instance.instance_id, filename)
-
-
-def store_graph(instance_id, filename):
- """
- Transmits the specified graph file to internal object store on cloud
- controller.
- """
- # TODO(devcamcar): Need to use an asynchronous method to make this
- # connection. If boto has some separate method that generates
- # the request it would like to make and another method to parse
- # the response we can make our own client that does the actual
- # request and hands it off to the response parser.
- s3 = boto.s3.connection.S3Connection(
- aws_access_key_id=FLAGS.aws_access_key_id,
- aws_secret_access_key=FLAGS.aws_secret_access_key,
- is_secure=False,
- calling_format=boto.s3.connection.OrdinaryCallingFormat(),
- port=FLAGS.s3_port,
- host=FLAGS.s3_host)
- bucket_name = '_%s.monitor' % instance_id
-
- # Object store isn't creating the bucket like it should currently
- # when it is first requested, so have to catch and create manually.
- try:
- bucket = s3.get_bucket(bucket_name)
- except Exception:
- bucket = s3.create_bucket(bucket_name)
-
- key = boto.s3.Key(bucket)
- key.key = os.path.basename(filename)
- key.set_contents_from_filename(filename)
-
-
-class Instance(object):
- def __init__(self, conn, instance_id):
- self.conn = conn
- self.instance_id = instance_id
- self.last_updated = datetime.datetime.min
- self.cputime = 0
- self.cputime_last_updated = None
-
- init_rrd(self, 'cpu')
- init_rrd(self, 'net')
- init_rrd(self, 'disk')
-
- def needs_update(self):
- """
- Indicates whether this instance is due to have its statistics updated.
- """
- delta = utcnow() - self.last_updated
- return delta.seconds >= FLAGS.monitoring_instances_step
-
- def update(self):
- """
- Updates the instances statistics and stores the resulting graphs
- in the internal object store on the cloud controller.
- """
- LOG.debug(_('updating %s...'), self.instance_id)
-
- try:
- data = self.fetch_cpu_stats()
- if data is not None:
- LOG.debug('CPU: %s', data)
- update_rrd(self, 'cpu', data)
-
- data = self.fetch_net_stats()
- LOG.debug('NET: %s', data)
- update_rrd(self, 'net', data)
-
- data = self.fetch_disk_stats()
- LOG.debug('DISK: %s', data)
- update_rrd(self, 'disk', data)
-
- # TODO(devcamcar): Turn these into pool.ProcessPool.execute() calls
- # and make the methods @defer.inlineCallbacks.
- graph_cpu(self, '1d')
- graph_cpu(self, '1w')
- graph_cpu(self, '1m')
-
- graph_net(self, '1d')
- graph_net(self, '1w')
- graph_net(self, '1m')
-
- graph_disk(self, '1d')
- graph_disk(self, '1w')
- graph_disk(self, '1m')
- except Exception:
- LOG.exception(_('unexpected error during update'))
-
- self.last_updated = utcnow()
-
- def get_rrd_path(self):
- """
- Returns the path to where RRD files are stored.
- """
- return os.path.join(FLAGS.monitoring_rrd_path, self.instance_id)
-
- def fetch_cpu_stats(self):
- """
- Returns cpu usage statistics for this instance.
- """
- info = self.conn.get_info(self.instance_id)
-
- # Get the previous values.
- cputime_last = self.cputime
- cputime_last_updated = self.cputime_last_updated
-
- # Get the raw CPU time used in nanoseconds.
- self.cputime = float(info['cpu_time'])
- self.cputime_last_updated = utcnow()
-
- LOG.debug('CPU: %d', self.cputime)
-
- # Skip calculation on first pass. Need delta to get a meaningful value.
- if cputime_last_updated is None:
- return None
-
- # Calculate the number of seconds between samples.
- d = self.cputime_last_updated - cputime_last_updated
- t = d.days * 86400 + d.seconds
-
- LOG.debug('t = %d', t)
-
- # Calculate change over time in number of nanoseconds of CPU time used.
- cputime_delta = self.cputime - cputime_last
-
- LOG.debug('cputime_delta = %s', cputime_delta)
-
- # Get the number of virtual cpus in this domain.
- vcpus = int(info['num_cpu'])
-
- LOG.debug('vcpus = %d', vcpus)
-
- # Calculate CPU % used and cap at 100.
- return min(cputime_delta / (t * vcpus * 1.0e9) * 100, 100)
-
- def fetch_disk_stats(self):
- """
- Returns disk usage statistics for this instance.
- """
- rd = 0
- wr = 0
-
- disks = self.conn.get_disks(self.instance_id)
-
- # Aggregate the read and write totals.
- for disk in disks:
- try:
- rd_req, rd_bytes, wr_req, wr_bytes, errs = \
- self.conn.block_stats(self.instance_id, disk)
- rd += rd_bytes
- wr += wr_bytes
- except TypeError:
- iid = self.instance_id
- LOG.error(_('Cannot get blockstats for "%(disk)s"'
- ' on "%(iid)s"') % locals())
- raise
-
- return '%d:%d' % (rd, wr)
-
- def fetch_net_stats(self):
- """
- Returns network usage statistics for this instance.
- """
- rx = 0
- tx = 0
-
- interfaces = self.conn.get_interfaces(self.instance_id)
-
- # Aggregate the in and out totals.
- for interface in interfaces:
- try:
- stats = self.conn.interface_stats(self.instance_id, interface)
- rx += stats[0]
- tx += stats[4]
- except TypeError:
- iid = self.instance_id
- LOG.error(_('Cannot get ifstats for "%(interface)s"'
- ' on "%(iid)s"') % locals())
- raise
-
- return '%d:%d' % (rx, tx)
-
-
-class InstanceMonitor(object, service.Service):
- """
- Monitors the running instances of the current machine.
- """
-
- def __init__(self):
- """
- Initialize the monitoring loop.
- """
- self._instances = {}
- self._loop = task.LoopingCall(self.updateInstances)
-
- def startService(self):
- self._instances = {}
- self._loop.start(interval=FLAGS.monitoring_instances_delay)
- service.Service.startService(self)
-
- def stopService(self):
- self._loop.stop()
- service.Service.stopService(self)
-
- def updateInstances(self):
- """
- Update resource usage for all running instances.
- """
- try:
- conn = virt_connection.get_connection(read_only=True)
- except Exception, exn:
- LOG.exception(_('unexpected exception getting connection'))
- time.sleep(FLAGS.monitoring_instances_delay)
- return
-
- domain_ids = conn.list_instances()
- try:
- self.updateInstances_(conn, domain_ids)
- except Exception, exn:
- LOG.exception('updateInstances_')
-
- def updateInstances_(self, conn, domain_ids):
- for domain_id in domain_ids:
- if not domain_id in self._instances:
- instance = Instance(conn, domain_id)
- self._instances[domain_id] = instance
- LOG.debug(_('Found instance: %s'), domain_id)
-
- for key in self._instances.keys():
- instance = self._instances[key]
- if instance.needs_update():
- instance.update()