1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
#!/usr/bin/python
# Copyright (C) 2008 Red Hat Inc.
#
# This file is part of systemtap, and is free software. You can
# redistribute it and/or modify it under the terms of the GNU General
# Public License (GPL); either version 2, or (at your option) any
# later version.
#
# This script monitors a remote system that is running the kprobes
# test. If 3 consecutive 'ping's fail, the system is restarted by
# running the configured 'restart_cmds'.
#
# This script takes as an argument a config filename, whose contents
# should look like the following:
#
# config_opts['system_name'] = "SYSTEM_NAME"
# config_opts['restart_cmds'] = [
# 'CMD1',
# 'CMD2',
# ]
#
# As an example, here is a config file used when monitoring a kvm
# instance:
#
# config_opts['system_name'] = "dhcp-148"
# config_opts['restart_cmds'] = [
# 'sudo virsh destroy kvm-rawhide-64-1',
# 'sudo virsh start kvm-rawhide-64-1',
# ]
import sys
import os
import time
# Number of consecutive failed pings after which the system is assumed
# to have crashed and is restarted.
MAX_PING_FAILURES = 3
# Seconds to wait after a successful ping before checking again.
PING_OK_DELAY = 60
# Seconds to wait after a failed ping before retrying.
PING_RETRY_DELAY = 30
# Seconds to wait after running the restart commands, to give the
# system a chance to boot.
BOOT_DELAY = 5 * 60


def _complain(message):
    """Write a one-line diagnostic to stderr."""
    sys.stderr.write(message + "\n")


def _load_config(path):
    """Execute the config file at 'path' and return its config_opts dict.

    NOTE: the config file is run as Python code, so it must come from
    a trusted source.  It is executed in its own namespace so it
    cannot clobber this script's state; that namespace exposes the
    names the config could previously see (config_opts, cfg, os, sys,
    time).
    """
    namespace = {
        'config_opts': dict(),
        'cfg': path,
        'os': os,
        'sys': sys,
        'time': time,
    }
    with open(path) as handle:
        source = handle.read()
    # exec(compile(...), namespace) works on both Python 2 and 3,
    # unlike execfile().
    exec(compile(source, path, 'exec'), namespace)
    # Read the name back out of the namespace in case the config file
    # rebound config_opts instead of filling in the provided dict.
    return namespace['config_opts']


def _check_config(config_opts):
    """Return an error message if config_opts is unusable, else None."""
    for key in ('system_name', 'restart_cmds'):
        if key not in config_opts:
            return "Missing required config opt '%s'" % key
    # Iterating over a bare string would run every single character
    # as a separate shell command, so refuse it up front.
    if isinstance(config_opts['restart_cmds'], str):
        return ("Config opt 'restart_cmds' must be a list of commands,"
                " not a single string")
    return None


def _system_is_up(system_name):
    """Send one ping to 'system_name'; return True only if it succeeded."""
    status = os.system("ping -c 1 %s" % system_name)
    # os.system() returns the raw wait status.  Compare the whole
    # status with 0: looking only at the exit-code byte would treat a
    # ping that was killed by a signal as a success.
    return status == 0


def _restart_system(config_opts):
    """Run every restart command in order, then wait for the boot."""
    _complain("Restarting %s..." % config_opts['system_name'])
    for cmd in config_opts['restart_cmds']:
        _complain("Running '%s'..." % cmd)
        status = os.system(cmd)
        if status != 0:
            # Keep going: later commands may still bring the system
            # up, but make the failure visible.
            _complain("Command '%s' returned status %d" % (cmd, status))
    _complain("Sleeping for %d minutes..." % (BOOT_DELAY // 60))
    time.sleep(BOOT_DELAY)


def _monitor(config_opts):
    """Ping the system forever, restarting it after repeated failures."""
    failures = 0
    while True:
        if _system_is_up(config_opts['system_name']):
            # System is still up and running.  Wait and try again.
            time.sleep(PING_OK_DELAY)
            failures = 0
            continue
        failures += 1
        if failures < MAX_PING_FAILURES:
            time.sleep(PING_RETRY_DELAY)
            continue
        # Too many consecutive failures: assume the machine has
        # crashed and restart it.
        _restart_system(config_opts)
        failures = 0


def main(argv=None):
    """Script entry point.

    'argv' is the full argument vector (program name followed by the
    config filename) and defaults to sys.argv.  Returns 1 on a usage
    or config error; otherwise it does not return, because the
    monitoring loop runs until the process is interrupted.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        prog = argv[0] if argv else "monitor"
        _complain("Usage: %s config_file" % prog)
        return 1
    cfg = argv[1]

    # Read in the config file
    if not os.path.exists(cfg):
        _complain("Could not find required config file: %s" % cfg)
        return 1
    print("Reading config file %s..." % cfg)
    config_opts = _load_config(cfg)

    problem = _check_config(config_opts)
    if problem:
        _complain(problem)
        return 1

    _monitor(config_opts)


if __name__ == "__main__":
    sys.exit(main())
|