diff options
| author | Ed Leafe <ed@leafe.com> | 2011-08-02 19:30:03 +0000 |
|---|---|---|
| committer | Ed Leafe <ed@leafe.com> | 2011-08-02 19:30:03 +0000 |
| commit | 200b6b980aada1d38014e620e025ee61c38915da (patch) | |
| tree | 216b4ba5675e94c10451911c6b51ec4af234583a /nova/compute | |
| parent | 0bd6bf4a791e03e2c1ad1715aeae3e4413705414 (diff) | |
| parent | 65ba8bda43aa79080f6fec9c396f412c294718b8 (diff) | |
Merged trunk
Diffstat (limited to 'nova/compute')
| -rw-r--r-- | nova/compute/monitor.py | 435 |
1 files changed, 0 insertions, 435 deletions
# nova/compute/monitor.py -- reconstructed from the deletion diff shown for
# this commit (deleted file mode 100644, index 9d8e2a25d..000000000).

# vim: tabstop=4 shiftwidth=4 softtabstop=4

# Copyright 2010 United States Government as represented by the
# Administrator of the National Aeronautics and Space Administration.
# All Rights Reserved.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

"""
Instance Monitoring:

    Optionally may be run on each compute node. Provides RRD
    based statistics and graphs and makes them internally available
    in the object store.
"""

import calendar
import datetime
import os
import time

import boto
import boto.s3
import rrdtool
from twisted.internet import task
from twisted.application import service

from nova import flags
from nova import log as logging
from nova import utils
from nova.virt import connection as virt_connection


FLAGS = flags.FLAGS
flags.DEFINE_integer('monitoring_instances_delay', 5,
                     'Sleep time between updates')
flags.DEFINE_integer('monitoring_instances_step', 300,
                     'Interval of RRD updates')
flags.DEFINE_string('monitoring_rrd_path', '$state_path/monitor/instances',
                    'Location of RRD files')


# rrdtool.create() arguments for each kind of RRD file, keyed by the name
# passed to init_rrd()/update_rrd(): the data source (DS) definitions
# followed by the archive (RRA) definitions.
RRD_VALUES = {
    'cpu': [
        'DS:cpu:GAUGE:600:0:100',
        'RRA:AVERAGE:0.5:1:800',
        'RRA:AVERAGE:0.5:6:800',
        'RRA:AVERAGE:0.5:24:800',
        'RRA:AVERAGE:0.5:288:800',
        'RRA:MAX:0.5:1:800',
        'RRA:MAX:0.5:6:800',
        'RRA:MAX:0.5:24:800',
        'RRA:MAX:0.5:288:800',
        ],
    'net': [
        'DS:rx:COUNTER:600:0:1250000',
        'DS:tx:COUNTER:600:0:1250000',
        'RRA:AVERAGE:0.5:1:800',
        'RRA:AVERAGE:0.5:6:800',
        'RRA:AVERAGE:0.5:24:800',
        'RRA:AVERAGE:0.5:288:800',
        'RRA:MAX:0.5:1:800',
        'RRA:MAX:0.5:6:800',
        'RRA:MAX:0.5:24:800',
        'RRA:MAX:0.5:288:800',
        ],
    'disk': [
        'DS:rd:COUNTER:600:U:U',
        'DS:wr:COUNTER:600:U:U',
        'RRA:AVERAGE:0.5:1:800',
        'RRA:AVERAGE:0.5:6:800',
        'RRA:AVERAGE:0.5:24:800',
        'RRA:AVERAGE:0.5:288:800',
        'RRA:MAX:0.5:1:800',
        'RRA:MAX:0.5:6:800',
        'RRA:MAX:0.5:24:800',
        # NOTE(review): every other final archive in this table uses 288
        # steps; 444 may be a typo. Left as-is because changing it alters
        # the layout of newly created RRD files -- confirm before editing.
        'RRA:MAX:0.5:444:800',
        ]}


# Durations for which each graph is rendered on every update.
_GRAPH_DURATIONS = ('1d', '1w', '1m')


utcnow = utils.utcnow


LOG = logging.getLogger('nova.compute.monitor')


def update_rrd(instance, name, data):
    """
    Updates the specified RRD file.

    The file is created first (via init_rrd) when it does not exist yet.

    :param instance: object providing get_rrd_path() (and instance_id,
                     which init_rrd relies on).
    :param name: RRD name; a key of RRD_VALUES ('cpu', 'net' or 'disk').
    :param data: sample value(s), appended verbatim after the timestamp as
                 '<timestamp>:<data>'.
    """
    filename = os.path.join(instance.get_rrd_path(), '%s.rrd' % name)

    if not os.path.exists(filename):
        init_rrd(instance, name)

    # utcnow() is presumably a naive UTC datetime, so its time tuple must be
    # converted with timegm(); time.mktime() would interpret the tuple as
    # local time and skew every sample by the host's UTC offset.
    timestamp = calendar.timegm(utcnow().timetuple())
    rrdtool.update(filename, '%d:%s' % (timestamp, data))


def init_rrd(instance, name):
    """
    Initializes the specified RRD file.

    Creates the instance's RRD directory and the '<name>.rrd' file inside
    it; anything that already exists is left untouched.

    :param instance: object providing get_rrd_path().
    :param name: RRD name; a key of RRD_VALUES.
    """
    path = instance.get_rrd_path()

    if not os.path.exists(path):
        os.makedirs(path)

    filename = os.path.join(path, '%s.rrd' % name)

    if not os.path.exists(filename):
        rrdtool.create(
            filename,
            '--step', '%d' % FLAGS.monitoring_instances_step,
            '--start', '0',
            *RRD_VALUES[name])


def _common_graph_args(duration):
    """
    Returns the rrdtool.graph options shared by all graphs.
    """
    return [
        '--disable-rrdtool-tag',
        '--imgformat', 'PNG',
        '--width', '400',
        '--height', '120',
        '--start', 'now-%s' % duration,
        ]


def _graph_counters(instance, duration, name, area, line):
    """
    Renders and stores a two-series, log-scale bytes/s graph.

    :param name: RRD name ('net' or 'disk'); also the PNG file name prefix.
    :param area: (data source, legend) drawn as a filled area.
    :param line: (data source, legend) drawn as a line.
    """
    path = instance.get_rrd_path()
    filename = os.path.join(path, '%s-%s.png' % (name, duration))
    rrd_file = os.path.join(path, '%s.rrd' % name)
    area_ds, area_legend = area
    line_ds, line_legend = line

    args = _common_graph_args(duration) + [
        '--vertical-label', 'bytes/s',
        '--logarithmic',
        '--units', 'si',
        '--lower-limit', '1000',
        '--rigid',
        'DEF:%s=%s:%s:AVERAGE' % (area_ds, rrd_file, area_ds),
        'DEF:%s=%s:%s:AVERAGE' % (line_ds, rrd_file, line_ds),
        'AREA:%s#00FF00:%s' % (area_ds, area_legend),
        'LINE1:%s#0000FF:%s' % (line_ds, line_legend),
        ]
    rrdtool.graph(filename, *args)

    store_graph(instance.instance_id, filename)


def graph_cpu(instance, duration):
    """
    Creates a graph of cpu usage for the specified instance and duration.

    The PNG is written next to the RRD files as 'cpu-<duration>.png' and
    then handed to store_graph().
    """
    path = instance.get_rrd_path()
    filename = os.path.join(path, 'cpu-%s.png' % duration)

    args = _common_graph_args(duration) + [
        '--vertical-label', '% cpu used',
        '-l', '0',
        '-u', '100',
        'DEF:cpu=%s:cpu:AVERAGE' % os.path.join(path, 'cpu.rrd'),
        'AREA:cpu#eacc00:% CPU',
        ]
    rrdtool.graph(filename, *args)

    store_graph(instance.instance_id, filename)


def graph_net(instance, duration):
    """
    Creates a graph of network usage for the specified instance and duration.
    """
    _graph_counters(instance, duration, 'net',
                    ('rx', 'In traffic'), ('tx', 'Out traffic'))


def graph_disk(instance, duration):
    """
    Creates a graph of disk usage for the specified instance and duration.
    """
    _graph_counters(instance, duration, 'disk',
                    ('rd', 'Read'), ('wr', 'Write'))


def store_graph(instance_id, filename):
    """
    Transmits the specified graph file to internal object store on cloud
    controller.

    The file is stored under its base name in the bucket
    '_<instance_id>.monitor'.
    """
    # TODO(devcamcar): Need to use an asynchronous method to make this
    #       connection. If boto has some separate method that generates
    #       the request it would like to make and another method to parse
    #       the response we can make our own client that does the actual
    #       request and hands it off to the response parser.
    s3 = boto.s3.connection.S3Connection(
        aws_access_key_id=FLAGS.aws_access_key_id,
        aws_secret_access_key=FLAGS.aws_secret_access_key,
        is_secure=False,
        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
        port=FLAGS.s3_port,
        host=FLAGS.s3_host)
    bucket_name = '_%s.monitor' % instance_id

    # Object store isn't creating the bucket like it should currently
    # when it is first requested, so have to catch and create manually.
    try:
        bucket = s3.get_bucket(bucket_name)
    except Exception:
        bucket = s3.create_bucket(bucket_name)

    key = boto.s3.Key(bucket)
    key.key = os.path.basename(filename)
    key.set_contents_from_filename(filename)


class Instance(object):
    """
    Tracks the statistics of one instance and feeds them into its RRD files.
    """

    def __init__(self, conn, instance_id):
        self.conn = conn
        self.instance_id = instance_id
        # datetime.min makes the instance due for an update immediately.
        self.last_updated = datetime.datetime.min
        self.cputime = 0
        self.cputime_last_updated = None

        init_rrd(self, 'cpu')
        init_rrd(self, 'net')
        init_rrd(self, 'disk')

    def needs_update(self):
        """
        Indicates whether this instance is due to have its statistics updated.
        """
        delta = utcnow() - self.last_updated
        # timedelta.seconds alone is only the sub-day remainder; include the
        # days so that long gaps (and the initial datetime.min) count fully.
        elapsed = delta.days * 86400 + delta.seconds
        return elapsed >= FLAGS.monitoring_instances_step

    def update(self):
        """
        Updates the instances statistics and stores the resulting graphs
        in the internal object store on the cloud controller.

        Any exception is logged and swallowed; last_updated is refreshed
        either way, so a failing instance is retried one step later.
        """
        LOG.debug(_('updating %s...'), self.instance_id)

        try:
            data = self.fetch_cpu_stats()
            if data is not None:
                LOG.debug('CPU: %s', data)
                update_rrd(self, 'cpu', data)

            data = self.fetch_net_stats()
            LOG.debug('NET: %s', data)
            update_rrd(self, 'net', data)

            data = self.fetch_disk_stats()
            LOG.debug('DISK: %s', data)
            update_rrd(self, 'disk', data)

            # TODO(devcamcar): Turn these into pool.ProcessPool.execute() calls
            # and make the methods @defer.inlineCallbacks.
            for grapher in (graph_cpu, graph_net, graph_disk):
                for duration in _GRAPH_DURATIONS:
                    grapher(self, duration)
        except Exception:
            LOG.exception(_('unexpected error during update'))

        self.last_updated = utcnow()

    def get_rrd_path(self):
        """
        Returns the path to where RRD files are stored.
        """
        return os.path.join(FLAGS.monitoring_rrd_path, self.instance_id)

    def fetch_cpu_stats(self):
        """
        Returns cpu usage statistics for this instance.

        :returns: percentage of CPU used since the previous call, capped at
                  100, or None when no meaningful value can be computed
                  (first call, no measurable time elapsed, no vcpus, or the
                  CPU time counter went backwards).
        """
        info = self.conn.get_info(self.instance_id)

        # Get the previous values.
        cputime_last = self.cputime
        cputime_last_updated = self.cputime_last_updated

        # Get the raw CPU time used in nanoseconds.
        self.cputime = float(info['cpu_time'])
        self.cputime_last_updated = utcnow()

        LOG.debug('CPU: %d', self.cputime)

        # Skip calculation on first pass. Need delta to get a meaningful value.
        if cputime_last_updated is None:
            return None

        # Calculate the number of seconds between samples.
        d = self.cputime_last_updated - cputime_last_updated
        t = d.days * 86400 + d.seconds

        LOG.debug('t = %d', t)

        # Calculate change over time in number of nanoseconds of CPU time used.
        cputime_delta = self.cputime - cputime_last

        LOG.debug('cputime_delta = %s', cputime_delta)

        # Get the number of virtual cpus in this domain.
        vcpus = int(info['num_cpu'])

        LOG.debug('vcpus = %d', vcpus)

        # Without elapsed time or vcpus the ratio below would divide by
        # zero; a negative delta would yield a negative percentage. Skip the
        # sample in those cases instead of failing the whole update.
        if t <= 0 or vcpus <= 0 or cputime_delta < 0:
            return None

        # Calculate CPU % used and cap at 100.
        return min(cputime_delta / (t * vcpus * 1.0e9) * 100, 100)

    def fetch_disk_stats(self):
        """
        Returns disk usage statistics for this instance.

        :returns: '<read bytes>:<written bytes>' summed over all disks.
        :raises TypeError: re-raised after logging when the block stats of a
                           disk cannot be unpacked or summed.
        """
        rd = 0
        wr = 0

        disks = self.conn.get_disks(self.instance_id)

        # Aggregate the read and write totals.
        for disk in disks:
            try:
                rd_req, rd_bytes, wr_req, wr_bytes, errs = \
                        self.conn.block_stats(self.instance_id, disk)
                rd += rd_bytes
                wr += wr_bytes
            except TypeError:
                LOG.error(_('Cannot get blockstats for "%(disk)s"'
                            ' on "%(iid)s"') %
                          {'disk': disk, 'iid': self.instance_id})
                raise

        return '%d:%d' % (rd, wr)

    def fetch_net_stats(self):
        """
        Returns network usage statistics for this instance.

        :returns: '<rx>:<tx>' summed over all interfaces, taken from
                  elements 0 and 4 of each interface's stats.
        :raises TypeError: re-raised after logging when the stats of an
                           interface cannot be indexed or summed.
        """
        rx = 0
        tx = 0

        interfaces = self.conn.get_interfaces(self.instance_id)

        # Aggregate the in and out totals.
        for interface in interfaces:
            try:
                stats = self.conn.interface_stats(self.instance_id, interface)
                rx += stats[0]
                tx += stats[4]
            except TypeError:
                LOG.error(_('Cannot get ifstats for "%(interface)s"'
                            ' on "%(iid)s"') %
                          {'interface': interface, 'iid': self.instance_id})
                raise

        return '%d:%d' % (rx, tx)


class InstanceMonitor(object, service.Service):
    """
    Monitors the running instances of the current machine.
    """

    def __init__(self):
        """
        Initialize the monitoring loop.
        """
        self._instances = {}
        self._loop = task.LoopingCall(self.updateInstances)

    def startService(self):
        self._instances = {}
        self._loop.start(interval=FLAGS.monitoring_instances_delay)
        service.Service.startService(self)

    def stopService(self):
        self._loop.stop()
        service.Service.stopService(self)

    def updateInstances(self):
        """
        Update resource usage for all running instances.

        Never raises: an exception escaping from here would stop the
        LoopingCall and with it all further monitoring.
        """
        try:
            conn = virt_connection.get_connection(read_only=True)
        except Exception:
            LOG.exception(_('unexpected exception getting connection'))
            # NOTE(review): time.sleep() blocks the calling thread; if this
            # runs in the reactor thread it stalls the reactor for the whole
            # delay. Kept to preserve the existing back-off behaviour.
            time.sleep(FLAGS.monitoring_instances_delay)
            return

        try:
            domain_ids = conn.list_instances()
            self.updateInstances_(conn, domain_ids)
        except Exception:
            LOG.exception('updateInstances_')

    def updateInstances_(self, conn, domain_ids):
        """
        Syncs the tracked instances with domain_ids and updates those due.

        :param conn: connection handed to newly discovered instances.
        :param domain_ids: ids of the instances currently reported by conn.
        """
        current_ids = set(domain_ids)

        for domain_id in domain_ids:
            if domain_id not in self._instances:
                instance = Instance(conn, domain_id)
                self._instances[domain_id] = instance
                LOG.debug(_('Found instance: %s'), domain_id)

        # Forget instances that are no longer reported; otherwise they would
        # be polled (and fail) on every step for the life of the service.
        for key in list(self._instances):
            if key not in current_ids:
                del self._instances[key]

        for instance in list(self._instances.values()):
            if instance.needs_update():
                instance.update()
