roles/epylog/files/modules/common_unparsed_mod.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

#!/usr/bin/python -tt
"""
This module should run after all other modules.

Takes the remaining logs, saves them out to a tmpfile.
Uses difflib.SequenceMatcher() to return logs which occur most often and are
at least a set percentage similar. This lets you catch logs which are
from multiple machines and more or less the same or similar logs (with memory
or process id offsets) from the same machine.

"""

import sys
import re
import difflib
from operator import itemgetter
import tempfile
import os
import shutil

##
# This is for testing purposes, so you can invoke this from the
# modules directory. See also the testing notes at the end of the
# file.
#
sys.path.insert(0, '../py/')
from epylog import Result, InternalModule

class common_unparsed_mod(InternalModule):
    def __init__(self, opts, logger):
        InternalModule.__init__(self)
        self.logger = logger
        rc = re.compile

        self.match_percentage = int(opts.get('match_percentage', '95'))
        self.debug_dump = int(opts.get('debug_dump', '1'))
        self.regex_map = {rc('^.+$'):self.expand_out_line}
        self.tmpdir = tempfile.mkdtemp(prefix='epylog-common-unparsed', dir='/var/tmp')
        self.matchfile = self.tmpdir + '/match_limited'
        self.matchfile_f = open(self.matchfile, 'w')
        self.complete = self.tmpdir + '/complete'
        self.complete_f = open(self.complete, 'w')
        

    ##
    # Line-matching routines
    #
    def expand_out_line(self, linemap):
        sys, msg, mult = self.get_smm(linemap)
        matchout = '%s\n' % (msg)
        com_out = '%s' % (linemap['line'])
        for i in range(0, int(mult)):
            self.complete_f.write(com_out)
        self.matchfile_f.write(matchout)

        #dirty like zebra
        if not os.path.exists(self.tmpdir + '/returned'):
            open(self.tmpdir + '/returned', 'w').close()
            return {'match':0}
        else:
            return None


    def finalize(self, rs):
        #FIXME - enable a debug mode where it writes
        # out to a file all of the first line and things which match it
        # and percentages of match for later investigation
        self.complete_f.close()
        self.matchfile_f.close()
        matches = {}
        full_matches = {}
        lines_matched = set()
        where = 0
        rl = open(self.matchfile, 'r')
        for l in rl:
            where += 1
            sub_where = 0
            for ol in open(self.matchfile, 'r'):
                sub_where += 1
                if sub_where < where: # if we're before it in the file we've already matched it
                    continue
                if sub_where in lines_matched:
                    continue

                c = difflib.SequenceMatcher(isjunk=lambda x: x in ('1','2','3','4','5','6','7','8','9','0'),a=l, b=ol)
                rq_ratio = int(c.real_quick_ratio()*100) # do the fast upper bound - find out if we should even glance at it.
                if rq_ratio < 60:
                    continue
                ratio = int(c.ratio()*100)
                if ratio >= self.match_percentage:
                    lines_matched.add(sub_where)
                    if l not in matches:
                        matches[l] = 0
                        full_matches[l] = []
                    matches[l] += 1
                    full_matches[l].append((ol,ratio))
        if self.debug_dump:
            fm_db = open(self.tmpdir + '/debug-match', 'w')
            for k in full_matches:
                fm_db.write(k)
                for (v,r) in full_matches[k]:
                    fm_db.write('  %s %s' % (r, v))
            fm_db.close()
            
        res = "<table border=0><tr><th>Count</th><th align=left>Log</th></tr>\n\n"
        for (k,v) in sorted(matches.items(), key=itemgetter(1), reverse=True)[:20]: # take the top 20 most common provided there are more than 1
            if v > 1:
                res += "<tr>\n<td bgcolor='#DDDDDD'>%s</td><td>%s</td>\n</tr>" % (v, k)
        res += "<tr>\n<td colspan=2>\n<h2>Complete messages</h2>\n<pre>\n"
        res += ''.join(sorted(open(self.complete, 'r').readlines()))
        res += "</pre>\n</td>\n</tr>\n</table>"
        return res


if __name__ == '__main__':
    from epylog.helpers import ModuleTest
    ModuleTest(common_unparsed_mod, sys.argv)