diff options
Diffstat (limited to 'src/plugins/abrt-dump-oops.c')
-rw-r--r-- | src/plugins/abrt-dump-oops.c | 712 |
1 files changed, 712 insertions, 0 deletions
diff --git a/src/plugins/abrt-dump-oops.c b/src/plugins/abrt-dump-oops.c new file mode 100644 index 00000000..ad222edd --- /dev/null +++ b/src/plugins/abrt-dump-oops.c @@ -0,0 +1,712 @@ +/* + Copyright (C) 2011 ABRT team + Copyright (C) 2011 RedHat Inc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Authors: + Anton Arapov <anton@redhat.com> + Arjan van de Ven <arjan@linux.intel.com> + */ +#include <syslog.h> +#include <asm/unistd.h> /* __NR_syslog */ +#include <sys/inotify.h> +#include <sys/ioctl.h> /* ioctl(FIONREAD) */ +#include "abrtlib.h" +#include "parse_options.h" + +#define PROGNAME "abrt-dump-oops" + +static void queue_oops(GList **vec, const char *data, const char *version) +{ + char *ver_data = xasprintf("%s\n%s", version, data); + *vec = g_list_append(*vec, ver_data); +} + +/* + * extract_version tries to find the kernel version in given data + */ +static char *extract_version(const char *linepointer) +{ + if (strstr(linepointer, "Pid") + || strstr(linepointer, "comm") + || strstr(linepointer, "CPU") + || strstr(linepointer, "REGS") + || strstr(linepointer, "EFLAGS") + ) { + char* start; + char* end; + + start = strstr((char*)linepointer, "2.6."); + if (start) + { + end = strchr(start, ')'); + if (!end) + end = strchrnul(start, ' '); + return xstrndup(start, end-start); + } + } + + return NULL; +} + +/* + * extract_oops tries to find oops signatures in a log + */ +struct line_info { + char *ptr; + char level; +}; + +static int record_oops(GList **oopses, struct line_info* lines_info, int oopsstart, int oopsend) +{ + int q; + int len; + char *oops; + char *version; + + len = 2; + for (q = oopsstart; q <= oopsend; q++) + len += strlen(lines_info[q].ptr) + 1; + + oops = (char*)xzalloc(len); + + version = NULL; + for (q = oopsstart; q <= oopsend; q++) + { + if (!version) + version = extract_version(lines_info[q].ptr); + + if (lines_info[q].ptr[0]) + { + strcat(oops, lines_info[q].ptr); + strcat(oops, "\n"); + } + } + int rv = 1; + /* too short oopses are invalid */ + if (strlen(oops) > 100) + queue_oops(oopses, oops, version ? version : "undefined"); + else + { + VERB3 log("Dropped oops: too short"); + rv = 0; + } + free(oops); + free(version); + return rv; +} + +#define REALLOC_CHUNK 1000 +static int extract_oopses(GList **oopses, char *buffer, size_t buflen) +{ + char *c; + int linecount = 0; + int lines_info_alloc = 0; + struct line_info *lines_info = NULL; + + /* Split buffer into lines */ + + if (buflen != 0) + buffer[buflen - 1] = '\n'; /* the buffer usually ends with \n, but let's make sure */ + c = buffer; + while (c < buffer + buflen) + { + char linelevel; + char *c9; + char *colon; + + c9 = (char*)memchr(c, '\n', buffer + buflen - c); /* a \n will always be found */ + assert(c9); + *c9 = '\0'; /* turn the \n into a string termination */ + if (c9 == c) + goto next_line; + + /* Is it a syslog file (/var/log/messages or similar)? + * Even though _usually_ it looks like "Nov 19 12:34:38 localhost kernel: xxx", + * some users run syslog in non-C locale: + * "2010-02-22T09:24:08.156534-08:00 gnu-4 gnome-session[2048]: blah blah" + * ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ !!! + * We detect it by checking for N:NN:NN pattern in first 15 chars + * (and this still is not good enough... false positive: "pci 0000:15:00.0: PME# disabled") + */ + colon = strchr(c, ':'); + if (colon && colon > c && colon < c + 15 + && isdigit(colon[-1]) /* N:... */ + && isdigit(colon[1]) /* ...N:NN:... */ + && isdigit(colon[2]) + && colon[3] == ':' + && isdigit(colon[4]) /* ...N:NN:NN... */ + && isdigit(colon[5]) + ) { + /* It's syslog file, not a bare dmesg */ + + /* Skip non-kernel lines */ + char *kernel_str = strstr(c, "kernel: "); + if (kernel_str == NULL) + { + /* if we see our own marker: + * "hostname abrt: Kerneloops: Reported 1 kernel oopses to Abrt" + * we know we submitted everything upto here already */ + if (strstr(c, "abrt:") && strstr(c, "Abrt")) + { + VERB3 log("Found our marker at line %d, restarting line count from 0", linecount); + linecount = 0; + lines_info_alloc = 0; + free(lines_info); + lines_info = NULL; + } + goto next_line; + } + c = kernel_str + sizeof("kernel: ")-1; + } + + linelevel = 0; + /* store and remove kernel log level */ + if (*c == '<' && c[1] && c[2] == '>') + { + linelevel = c[1]; + c += 3; + } + /* remove jiffies time stamp counter if present */ + if (*c == '[') + { + char *c2 = strchr(c, '.'); + char *c3 = strchr(c, ']'); + if (c2 && c3 && (c2 < c3) && (c3-c) < 14 && (c2-c) < 8) + { + c = c3 + 1; + if (*c == ' ') + c++; + } + } + if (linecount >= lines_info_alloc) + { + lines_info_alloc += REALLOC_CHUNK; + lines_info = (struct line_info*)xrealloc(lines_info, + lines_info_alloc * sizeof(lines_info[0])); + } + lines_info[linecount].ptr = c; + lines_info[linecount].level = linelevel; + linecount++; +next_line: + c = c9 + 1; + } + + /* Analyze lines */ + + int i; + char prevlevel = 0; + int oopsstart = -1; + int inbacktrace = 0; + int oopsesfound = 0; + + i = 0; + while (i < linecount) + { + char *curline = lines_info[i].ptr; + + if (curline == NULL) + { + i++; + continue; + } + while (*curline == ' ') + curline++; + + if (oopsstart < 0) + { + /* Find start-of-oops markers */ + /* In some comparisons, we skip 1st letter, to avoid dealing with + * changes in capitalization in kernel. For example, I see that + * current kernel git (at 2011-01-01) has both "kernel BUG at ..." + * and "Kernel BUG at ..." messages, and I don't want to change + * the code below whenever kernel is changed to use "K" (or "k") + * uniformly. + */ + if (strstr(curline, /*g*/ "eneral protection fault:")) + oopsstart = i; + else if (strstr(curline, "BUG:")) + oopsstart = i; + else if (strstr(curline, /*k*/ "ernel BUG at")) + oopsstart = i; + else if (strstr(curline, "do_IRQ: stack overflow:")) + oopsstart = i; + else if (strstr(curline, "RTNL: assertion failed")) + oopsstart = i; + else if (strstr(curline, /*e*/ "eek! page_mapcount(page) went negative!")) + oopsstart = i; + else if (strstr(curline, /*n*/ "ear stack overflow (cur:")) + oopsstart = i; + else if (strstr(curline, /*d*/ "ouble fault:")) + oopsstart = i; + else if (strstr(curline, /*b*/ "adness at")) + oopsstart = i; + else if (strstr(curline, "NETDEV WATCHDOG")) + oopsstart = i; + else if (strstr(curline, "WARNING: at ")) /* WARN_ON() generated message */ + oopsstart = i; + else if (strstr(curline, /*u*/ "nable to handle kernel")) + oopsstart = i; + else if (strstr(curline, /*s*/ "ysctl table check failed")) + oopsstart = i; + else if (strstr(curline, "INFO: possible recursive locking detected")) + oopsstart = i; + // Not needed: "--[ cut here ]--" is always followed + // by "Badness at", "kernel BUG at", or "WARNING: at" string + //else if (strstr(curline, "------------[ cut here ]------------")) + // oopsstart = i; + else if (strstr(curline, "list_del corruption")) + oopsstart = i; + else if (strstr(curline, "list_add corruption")) + oopsstart = i; + + if (i >= 3 && strstr(curline, "Oops:")) + oopsstart = i-3; + + if (oopsstart >= 0) + { + /* debug information */ + VERB3 { + log("Found oops at line %d: '%s'", oopsstart, lines_info[oopsstart].ptr); + if (oopsstart != i) + log("Trigger line is %d: '%s'", i, c); + } + /* try to find the end marker */ + int i2 = i + 1; + while (i2 < linecount && i2 < (i+50)) + { + if (strstr(lines_info[i2].ptr, "---[ end trace")) + { + inbacktrace = 1; + i = i2; + break; + } + i2++; + } + } + } + + /* Are we entering a call trace part? */ + /* a call trace starts with "Call Trace:" or with the " [<.......>] function+0xFF/0xAA" pattern */ + if (oopsstart >= 0 && !inbacktrace) + { + if (strstr(curline, "Call Trace:")) + inbacktrace = 1; + else + if (strnlen(curline, 9) > 8 + && curline[0] == '[' && curline[1] == '<' + && strstr(curline, ">]") + && strstr(curline, "+0x") + && strstr(curline, "/0x") + ) { + inbacktrace = 1; + } + } + + /* Are we at the end of an oops? */ + else if (oopsstart >= 0 && inbacktrace) + { + int oopsend = INT_MAX; + + /* line needs to start with " [" or have "] [" if it is still a call trace */ + /* example: "[<ffffffffa006c156>] radeon_get_ring_head+0x16/0x41 [radeon]" */ + if (curline[0] != '[' + && !strstr(curline, "] [") + && !strstr(curline, "--- Exception") + && !strstr(curline, "LR =") + && !strstr(curline, "<#DF>") + && !strstr(curline, "<IRQ>") + && !strstr(curline, "<EOI>") + && !strstr(curline, "<<EOE>>") + && strncmp(curline, "Code: ", 6) != 0 + && strncmp(curline, "RIP ", 4) != 0 + && strncmp(curline, "RSP ", 4) != 0 + ) { + oopsend = i-1; /* not a call trace line */ + } + /* oops lines are always more than 8 chars long */ + else if (strnlen(curline, 8) < 8) + oopsend = i-1; + /* single oopses are of the same loglevel */ + else if (lines_info[i].level != prevlevel) + oopsend = i-1; + else if (strstr(curline, "Instruction dump:")) + oopsend = i; + /* if a new oops starts, this one has ended */ + else if (strstr(curline, "WARNING: at ") && oopsstart != i) /* WARN_ON() generated message */ + oopsend = i-1; + else if (strstr(curline, "Unable to handle") && oopsstart != i) + oopsend = i-1; + /* kernel end-of-oops marker (not including marker itself) */ + else if (strstr(curline, "---[ end trace")) + oopsend = i-1; + + if (oopsend <= i) + { + VERB3 log("End of oops at line %d (%d): '%s'", oopsend, i, lines_info[oopsend].ptr); + if (record_oops(oopses, lines_info, oopsstart, oopsend)) + oopsesfound++; + oopsstart = -1; + inbacktrace = 0; + } + } + + prevlevel = lines_info[i].level; + i++; + + if (oopsstart >= 0) + { + /* Do we have a suspiciously long oops? Cancel it */ + if (i-oopsstart > 60) + { + inbacktrace = 0; + oopsstart = -1; + VERB3 log("Dropped oops, too long"); + continue; + } + if (!inbacktrace && i-oopsstart > 40) + { + /*inbacktrace = 0; - already is */ + oopsstart = -1; + VERB3 log("Dropped oops, too long"); + continue; + } + } + } /* while (i < linecount) */ + + /* process last oops if we have one */ + if (oopsstart >= 0 && inbacktrace) + { + int oopsend = i-1; + VERB3 log("End of oops at line %d (end of file): '%s'", oopsend, lines_info[oopsend].ptr); + if (record_oops(oopses, lines_info, oopsstart, oopsend)) + oopsesfound++; + } + + free(lines_info); + return oopsesfound; +} + +#define MAX_SCAN_BLOCK (4*1024*1024) +#define READ_AHEAD (10*1024) + +static void scan_dmesg(GList **oops_list) +{ + VERB1 log("Scanning dmesg"); + + /* syslog(3) - read the last len bytes from the log buffer + * (non-destructively), but dont read more than was written + * into the buffer since the last "clear ring buffer" cmd. + * Returns the number of bytes read. + */ + char *buffer = xzalloc(16*1024); + syscall(__NR_syslog, 3, buffer, 16*1024 - 1); /* always NUL terminated */ + extract_oopses(oops_list, buffer, strlen(buffer)); + free(buffer); +} + +static int scan_syslog_file(GList **oops_list, int fd, struct stat *statbuf, int partial_line_len) +{ + /* fstat(fd, &statbuf) was just done by caller */ + + off_t cur_pos = lseek(fd, 0, SEEK_CUR); + if (statbuf->st_size <= cur_pos) + return partial_line_len; /* we are at EOF, nothing to do */ + + VERB3 log("File grew by %llu bytes, from %llu to %llu", + (long long)(statbuf->st_size - cur_pos), + (long long)(cur_pos), + (long long)(statbuf->st_size)); + + /* Do not try to allocate an absurd amount of memory. */ + int sz = MAX_SCAN_BLOCK - READ_AHEAD; + if (sz > statbuf->st_size - cur_pos) + sz = statbuf->st_size - cur_pos; + + /* Rewind to the beginning of the current line */ + if (partial_line_len > 0 && lseek(fd, -partial_line_len, SEEK_CUR) != (off_t)-1) + { + VERB3 log("Went back %u bytes", partial_line_len); + cur_pos -= partial_line_len; + sz += partial_line_len; + } + + /* + * In theory we have a race here, since someone can spew + * to /var/log/messages before we read it in... + * We try to deal with it by reading READ_AHEAD extra. + */ + sz += READ_AHEAD; + char *buffer = xzalloc(sz); + + partial_line_len = 0; + do { + int r = full_read(fd, buffer, sz-1); + if (r <= 0) + break; + VERB3 log("Read %u bytes", r); + + /* For future scans, try to find where last (incomplete) line starts */ + partial_line_len = 0; + char *last_newline = memrchr(buffer, '\n', r) ? : buffer-1; + partial_line_len = buffer+r - (last_newline+1); + if (partial_line_len > 500) /* cap it */ + partial_line_len = 500; + + extract_oopses(oops_list, buffer, r); + cur_pos += r; + } while (cur_pos < statbuf->st_size); + + free(buffer); + + return partial_line_len; +} + +/* returns number of errors */ +static int save_oops_to_dump_dir(GList *oops_list, unsigned oops_cnt) +{ + unsigned countdown = 16; /* do not report hundreds of oopses */ + unsigned idx = oops_cnt; + time_t t = time(NULL); + pid_t my_pid = getpid(); + + VERB1 log("Saving %u oopses as crash dump dirs", idx >= countdown ? countdown-1 : idx); + + char *tainted_str = NULL; + /* once tainted flag is set to 1, only restart can reset the flag to 0 */ + FILE *tainted_fp = fopen("/proc/sys/kernel/tainted", "r"); + if (tainted_fp) + { + tainted_str = xmalloc_fgetline(tainted_fp); + fclose(tainted_fp); + } + else + error_msg("/proc/sys/kernel/tainted does not exist"); + + int errors = 0; + + while (idx != 0 && --countdown != 0) + { + char path[sizeof(DEBUG_DUMPS_DIR"/kerneloops-%lu-%lu-%lu") + 3 * sizeof(long)*3]; + sprintf(path, DEBUG_DUMPS_DIR"/kerneloops-%lu-%lu-%lu", (long)t, (long)my_pid, (long)idx); + + char *first_line = (char*)g_list_nth_data(oops_list, --idx); + char *second_line = (char*)strchr(first_line, '\n'); /* never NULL */ + *second_line++ = '\0'; + + struct dump_dir *dd = dd_create(path, /*uid:*/ 0); + if (dd) + { + dd_save_text(dd, FILENAME_ANALYZER, "Kerneloops"); + dd_save_text(dd, FILENAME_EXECUTABLE, "kernel"); + dd_save_text(dd, FILENAME_KERNEL, first_line); + dd_save_text(dd, FILENAME_CMDLINE, "not_applicable"); + dd_save_text(dd, FILENAME_BACKTRACE, second_line); + /* Optional, makes generated bz more informative */ + strchrnul(second_line, '\n')[0] = '\0'; + dd_save_text(dd, FILENAME_REASON, second_line); + + if (tainted_str && tainted_str[0] != '0') + dd_save_text(dd, FILENAME_TAINTED, tainted_str); + + free(tainted_str); + dd_close(dd); + } + else + errors++; + } + + return errors; +} + +int main(int argc, char **argv) +{ + char *env_verbose = getenv("ABRT_VERBOSE"); + if (env_verbose) + g_verbose = atoi(env_verbose); + + const char *filename = "/var/log/messages"; + + const char *program_usage = _( + PROGNAME" [-vsrdow] FILE\n" + "\n" + "Extract oops from syslog/dmesg file" + ); + enum { + OPT_v = 1 << 0, + OPT_s = 1 << 1, + OPT_r = 1 << 2, + OPT_d = 1 << 3, + OPT_o = 1 << 4, + OPT_w = 1 << 5, + }; + /* Keep enum above and order of options below in sync! */ + struct options program_options[] = { + OPT__VERBOSE(&g_verbose), + OPT_BOOL('s', NULL, NULL, _("Log to syslog")), + OPT_BOOL('r', NULL, NULL, _("Parse kernel's message buffer before parsing FILE")), + OPT_BOOL('d', NULL, NULL, _("Create ABRT dump for every oops found")), + OPT_BOOL('o', NULL, NULL, _("Print found oopses on standard output")), + OPT_BOOL('w', NULL, NULL, _("Do not exit, watch the file for new oopses")), + OPT_END() + }; + + unsigned opts = parse_opts(argc, argv, program_options, program_usage); + argv += optind; + if (!argv[0]) + show_usage_and_die(program_usage, program_options); + filename = argv[0]; + + putenv(xasprintf("ABRT_VERBOSE=%u", g_verbose)); + msg_prefix = PROGNAME; + if ((opts & OPT_s) + || getenv("ABRT_SYSLOG") + ) { + openlog(msg_prefix, 0, LOG_DAEMON); + logmode = LOGMODE_SYSLOG; + } + + int inotify_fd = -1; + if (opts & OPT_w) + { + inotify_fd = inotify_init(); + if (inotify_fd == -1) + perror_msg_and_die("inotify_init failed"); + //close_on_exec_on(inotify_fd); + } + + GList *oops_list = NULL; + if (opts & OPT_r) + /* Scan dmesg (only once even with -w) */ + scan_dmesg(&oops_list); + + int partial_line_len = 0; + struct stat statbuf; + int file_fd = -1; + int wd = -1; + + while (1) /* loops only if -w */ + { + /* If file is already opened, parse oopses from current pos */ + if (file_fd >= 0) + { + memset(&statbuf, 0, sizeof(statbuf)); + fstat(file_fd, &statbuf); + partial_line_len = scan_syslog_file(&oops_list, file_fd, &statbuf, partial_line_len); + + /* Was file deleted or replaced? */ + ino_t fd_ino = statbuf.st_ino; + if (stat(filename, &statbuf) != 0 || statbuf.st_ino != fd_ino) /* yes */ + { + VERB2 log("Can't stat '%s', closing fd", filename); + close(file_fd); + if (wd >= 0) + inotify_rm_watch(inotify_fd, wd); + file_fd = -1; + wd = -1; + partial_line_len = 0; + } + } + + /* If file isn't opened, try to open it and parse oopses */ + if (file_fd < 0) + { + file_fd = open(filename, O_RDONLY); + if (file_fd < 0) + { + if (!(opts & OPT_w)) + perror_msg_and_die("Can't open '%s'", filename); + /* with -w, we ignore open errors */ + } + else + { + VERB2 log("Opened '%s'", filename); + /* For -w case, if we don't have inotify watch yet, open one */ + if ((opts & OPT_w) && wd < 0) + { + wd = inotify_add_watch(inotify_fd, filename, IN_MODIFY | IN_MOVE_SELF | IN_DELETE_SELF); + if (wd < 0) + perror_msg("inotify_add_watch failed on '%s'", filename); + else + VERB2 log("Added inotify watch for '%s'", filename); + } + if (fstat(file_fd, &statbuf) == 0) + { + /* If file is large, skip the beginning. + * IOW: ignore old log messages because they are unlikely + * to have sufficiently recent data to be useful. + */ + if (statbuf.st_size > (MAX_SCAN_BLOCK - READ_AHEAD)) + lseek(file_fd, statbuf.st_size - (MAX_SCAN_BLOCK - READ_AHEAD), SEEK_SET); + /* Note that statbuf is filled by fstat by now, + * scan_syslog_file needs that + */ + partial_line_len = scan_syslog_file(&oops_list, file_fd, &statbuf, partial_line_len); + } + } + } + + /* Print and/or save oopses */ + int oops_cnt = g_list_length(oops_list); + if (!(opts & OPT_w) || oops_cnt != 0) + log("Found oopses: %d", oops_cnt); + if (oops_cnt != 0) + { + if (opts & OPT_o) + { + int i = 0; + while (i < oops_cnt) + printf("\nVersion: %s", (char*)g_list_nth_data(oops_list, i++)); + } + if (opts & OPT_d) + { + log("Creating dump directories"); + int errors = save_oops_to_dump_dir(oops_list, oops_cnt); + if (errors > 0) + log("%d errors while dumping oopses", errors); + } + } + for (GList *li = oops_list; li; li = g_list_next(li)) + free((char*)li->data); + g_list_free(oops_list); + oops_list = NULL; + + /* Done if no -w */ + if (!(opts & OPT_w)) + break; + + /* Even if log file grows all the time, say, a new line every 5 ms, + * we don't want to scan it all the time. Sleep a bit and let it grow + * in bigger increments. + * Sleep longer if file does not exist. + */ + sleep(file_fd >= 0 ? 1 : 59); + + /* Now wait for it to change, be moved or deleted */ + if (wd >= 0) + { + char buf[4096]; + VERB3 log("Waiting for '%s' to change", filename); + /* We block here: */ + int len = read(inotify_fd, buf, sizeof(buf)); + if (len < 0 && errno != EINTR) /* I saw EINTR here on strace attach */ + perror_msg("Error reading inotify fd"); + /* we don't actually check what happened to file - + * the code will handle all possibilities. + */ + VERB3 log("Change in '%s' detected", filename); + } + + } /* while (1) */ + + return 0; +} |