diff options
Diffstat (limited to 'scripts/kprobes_test/monitor_system.py')
-rwxr-xr-x | scripts/kprobes_test/monitor_system.py | 82 |
1 files changed, 82 insertions, 0 deletions
#!/usr/bin/python

# Copyright (C) 2008 Red Hat Inc.
#
# This file is part of systemtap, and is free software.  You can
# redistribute it and/or modify it under the terms of the GNU General
# Public License (GPL); either version 2, or (at your option) any
# later version.

# This script monitors a remote system that is running the kprobes
# test.  If several consecutive 'ping's fail, the system is rebooted.
#
# This script takes as an argument a config filename, whose contents
# should look like the following:
#
#    config_opts['system_name'] = "SYSTEM_NAME"
#    config_opts['restart_cmds'] = [
#       'CMD1',
#       'CMD2',
#       ]
#
# As an example, here is a config file used when monitoring a kvm
# instance:
#
#    config_opts['system_name'] = "dhcp-148"
#    config_opts['restart_cmds'] = [
#       'sudo virsh destroy kvm-rawhide-64-1',
#       'sudo virsh start kvm-rawhide-64-1',
#       ]

import os
import subprocess
import sys
import time

# Seconds to wait after a successful ping before probing again.
PING_OK_DELAY = 60
# Seconds to wait after a failed ping before retrying.
PING_RETRY_DELAY = 30
# Number of consecutive failed pings after which the system is restarted.
MAX_PING_FAILURES = 3
# Seconds to wait after issuing the restart commands so the system can boot.
REBOOT_DELAY = 5 * 60

REQUIRED_OPTS = ('system_name', 'restart_cmds')


class ConfigError(Exception):
    """Raised when the config file is missing or lacks a required option."""


def log(msg):
    """Write one diagnostic line to stderr."""
    sys.stderr.write("%s\n" % msg)


def load_config(cfg):
    """Execute the config file `cfg` and return its `config_opts` dict.

    The config file is Python source that fills in a pre-defined
    `config_opts` dictionary.  It is executed as code, so it must come
    from a trusted source.

    Raises ConfigError if the file does not exist or does not define
    every option listed in REQUIRED_OPTS.
    """
    if not os.path.exists(cfg):
        raise ConfigError("Could not find required config file: %s" % cfg)

    print("Reading config file %s..." % cfg)
    with open(cfg) as cfg_file:
        source = cfg_file.read()

    # exec(compile(...)) replaces the Python 2-only execfile().  The
    # config runs in its own namespace that provides `config_opts`; read
    # the name back afterwards in case the file rebinds it instead of
    # mutating it.
    namespace = {'config_opts': dict()}
    exec(compile(source, cfg, 'exec'), namespace)
    config_opts = namespace['config_opts']

    for opt in REQUIRED_OPTS:
        if opt not in config_opts:
            raise ConfigError("Missing required config opt '%s'" % opt)
    return config_opts


def system_is_up(system_name):
    """Return True if a single ping to `system_name` succeeds."""
    # Pass an argv list rather than a shell string so the host name is
    # never interpreted by a shell.
    try:
        rc = subprocess.call(["ping", "-c", "1", system_name])
    except OSError as err:
        log("Could not run ping: %s" % err)
        return False
    # subprocess.call() returns a negative value when the child was
    # killed by a signal, so only a clean exit with status 0 counts.
    return rc == 0


def run_restart_cmd(cmd):
    """Run one restart command through the shell and return its status.

    A non-zero status is reported on stderr but is not fatal: later
    commands may still succeed (e.g. 'virsh destroy' fails when the
    guest is already down, yet 'virsh start' must still run).
    """
    rc = subprocess.call(cmd, shell=True)
    if rc != 0:
        log("Command '%s' failed with status %d" % (cmd, rc))
    return rc


def check_once(config_opts, errors, ping=system_is_up,
               run_cmd=run_restart_cmd, sleep=time.sleep):
    """Run one monitoring cycle and return the new failure count.

    `errors` is the number of consecutive failed pings seen so far.
    `ping`, `run_cmd` and `sleep` are injectable so the cycle can be
    exercised without touching the network or actually waiting.
    """
    system_name = config_opts['system_name']

    # If ping worked, system is still up and running.  Wait a minute
    # and try again.
    if ping(system_name):
        sleep(PING_OK_DELAY)
        return 0

    # If the ping failed, increase the error count.  If we've got 3
    # consecutive errors, assume the machine has crashed and restart
    # it.
    errors += 1
    if errors < MAX_PING_FAILURES:
        sleep(PING_RETRY_DELAY)
        return errors

    log("Restarting %s..." % system_name)
    for cmd in config_opts['restart_cmds']:
        log("Running '%s'..." % cmd)
        run_cmd(cmd)

    # Sleep for 5 minutes to give the system a chance to boot
    log("Sleeping for 5 minutes...")
    sleep(REBOOT_DELAY)
    return 0


def monitor(config_opts):
    """Monitor the configured system forever, restarting it when needed."""
    errors = 0
    while True:
        errors = check_once(config_opts, errors)


def main(argv=None):
    """Command-line entry point; returns the exit status on error."""
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        log("Usage: %s config_file" % argv[0])
        return 1

    try:
        config_opts = load_config(argv[1])
    except ConfigError as err:
        log(err)
        return 1

    monitor(config_opts)
    return 0


if __name__ == "__main__":
    sys.exit(main())