summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com>2020-11-23 08:09:44 +0530
committerGitHub <noreply@github.com>2020-11-23 08:09:44 +0530
commitf74d3ab643a00632fdeb9b4912211b1767fa120a (patch)
tree3f59f26c8c0edf91df15c976c6398e3bf789e43d
parentda5ac9ae437079b20d5d559171be16d2d90ec46a (diff)
downloadglusterfs-f74d3ab643a00632fdeb9b4912211b1767fa120a.tar.gz
glusterfs-f74d3ab643a00632fdeb9b4912211b1767fa120a.tar.xz
glusterfs-f74d3ab643a00632fdeb9b4912211b1767fa120a.zip
enhancement/debug: Option to generate core dump without killing the process (#1814)
Comments and idea proposed by: Xavi Hernandez (jahernan@redhat.com): On production systems sometimes we see a log message saying that an assertion has failed. But it's hard to track why it failed without additional information (on debug builds, a GF_ASSERT() generates a core dump and kills the process, so it can be used to debug the issue, but many times we are only able to reproduce assertion failures on production systems, where GF_ASSERT() only logs a message and continues). In other cases we may have a core dump caused by a bug, but the core dump doesn't necessarily happen when the bug has happened. Sometimes the crash happens so much later that the causes that triggered the bug are lost. In these cases we can add more assertions to the places that touch the potential candidates to cause the bug, but the only thing we'll get is a log message, which may not be enough. One solution would be to always generate a core dump in case of assertion failure, but this was already discussed and it was decided that it was too drastic. If a core dump was really needed, a new macro was created to do so: GF_ABORT(), but GF_ASSERT() would continue to not kill the process on production systems. I'm proposing to modify GF_ASSERT() on production builds so that it conditionally triggers a signal when a debugger is attached. When this happens, the debugger will generate a core dump and continue the process as if nothing had happened. If there's no debugger attached, GF_ASSERT() will behave as always. The idea I have is to use SIGCONT to do that. This signal is harmless, so we can unmask it (we currently mask all unneeded signals) and raise it inside a GF_ASSERT() when some global variable is set to true. To produce the core dump, run the script extras/debug/gfcore.py in another terminal. gdb stops the process and produces a core dump when a GF_ASSERT() is hit.
The script, written by Xavi Hernandez (jahernan@redhat.com), is copied from #1810. Fixes: #1810 Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53 Signed-off-by: Vinayakswami Hariharmath <vharihar@redhat.com>
-rwxr-xr-xextras/debug/gfcore.py77
-rw-r--r--libglusterfs/src/common-utils.c11
-rw-r--r--libglusterfs/src/glusterfs/common-utils.h10
-rw-r--r--libglusterfs/src/libglusterfs.sym1
4 files changed, 97 insertions, 2 deletions
diff --git a/extras/debug/gfcore.py b/extras/debug/gfcore.py
new file mode 100755
index 0000000000..9f097f0de4
--- /dev/null
+++ b/extras/debug/gfcore.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+def launch():
+ if len(sys.argv) < 3:
+ sys.stderr.write("Syntax: {} <pid> <count> [<dir>]\n".format(os.path.basename(sys.argv[0])))
+ sys.exit(1)
+
+ pid = int(sys.argv[1])
+ count = int(sys.argv[2])
+ base = os.getcwd()
+ if len(sys.argv) > 3:
+ base = sys.argv[3]
+ base = os.path.realpath(base)
+
+ subprocess.run([
+ "gdb", "-batch",
+ "-p", str(pid),
+ "-ex", "py arg_count = {}".format(count),
+ "-ex", "py arg_dir = '{}'".format(base),
+ "-x", __file__
+ ])
+
+class GFCore(object):
+ def __init__(self, count, base):
+ self.count = count
+ self.base = base
+ gdb.execute('set pagination off')
+ gdb.execute('set gf_signal_on_assert = 1')
+ gdb.events.stop.connect(self.gf_stop)
+
+ self.cont()
+
+ def cont(self, quit = False):
+ if not(quit) and (self.count > 0):
+ gdb.execute('continue')
+ else:
+ gdb.execute('set gf_signal_on_assert = 0')
+ gdb.execute('quit')
+
+ def gf_stop(self, event):
+ quit = False
+
+ if isinstance(event, gdb.SignalEvent):
+ if event.stop_signal == 'SIGCONT':
+ now = datetime.utcnow().isoformat()
+ pid = gdb.selected_inferior().pid
+ name = "{}/gfcore.{}.{}".format(self.base, pid, now)
+ print("Generating coredump '{}'".format(name))
+ gdb.execute('gcore {}'.format(name))
+ self.count -= 1
+
+ elif event.stop_signal == 'SIGINT':
+ print("SIGINT received. Exiting")
+ quit = True
+
+ else:
+ print("Ignoring signal {}".format(event.stop_signal))
+ else:
+ print("Unexpected event {}".format(type(event)))
+
+ self.cont(quit)
+
+# Module 'gdb' is not available when running outside gdb.
+# The file therefore runs in two modes: inside gdb the import succeeds and
+# GFCore is started; outside gdb the import fails and launch() starts gdb
+# with this same file as its script ("-x", __file__).
+try:
+ import gdb
+ from datetime import datetime
+
+ # arg_count and arg_dir are not defined in this file; launch() defines them
+ # through the "py arg_count = ..." / "py arg_dir = ..." commands it passes
+ # to gdb ahead of "-x".
+ GFCore(arg_count, arg_dir)
+except ModuleNotFoundError:
+ # launch() uses these modules; they are only imported on this path.
+ import sys
+ import os
+ import subprocess
+
+ try:
+ launch()
+ except KeyboardInterrupt:
+ # Swallowed so that interrupting the launcher does not print a traceback.
+ pass
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
index 682cbf2805..9c684385f2 100644
--- a/libglusterfs/src/common-utils.c
+++ b/libglusterfs/src/common-utils.c
@@ -76,6 +76,8 @@ char *vol_type_str[] = {
"Distributed-Disperse",
};
+gf_boolean_t gf_signal_on_assert = false;
+
typedef int32_t (*rw_op_t)(int32_t fd, char *buf, int32_t size);
typedef int32_t (*rwv_op_t)(int32_t fd, const struct iovec *buf, int32_t size);
@@ -88,6 +90,14 @@ char *xattrs_to_heal[] = {"user.",
GF_XATTR_MDATA_KEY,
NULL};
+/* Raise SIGCONT when the global gf_signal_on_assert flag is set; otherwise
+ * return without doing anything. */
+void gf_assert(void)
+{
+    if (!gf_signal_on_assert) {
+        return;
+    }
+
+    raise(SIGCONT);
+}
+
void
gf_xxh64_wrapper(const unsigned char *data, size_t const len,
unsigned long long const seed, char *xxh64)
@@ -4069,6 +4079,7 @@ gf_thread_vcreate(pthread_t *thread, const pthread_attr_t *attr,
sigdelset(&set, SIGSYS);
sigdelset(&set, SIGFPE);
sigdelset(&set, SIGABRT);
+ sigdelset(&set, SIGCONT);
pthread_sigmask(SIG_BLOCK, &set, &old);
diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h
index f297fdab5c..c8014a127b 100644
--- a/libglusterfs/src/glusterfs/common-utils.h
+++ b/libglusterfs/src/glusterfs/common-utils.h
@@ -26,6 +26,7 @@
#include <limits.h>
#include <fnmatch.h>
#include <uuid/uuid.h>
+#include <urcu/compiler.h>
/* FreeBSD, etc. */
#ifndef __BITS_PER_LONG
@@ -443,14 +444,19 @@ BIT_VALUE(unsigned char *array, unsigned int index)
} \
} while (0)
+void gf_assert(void);
+
#ifdef DEBUG
#define GF_ASSERT(x) assert(x);
#else
#define GF_ASSERT(x) \
do { \
- if (!(x)) { \
+ if (caa_unlikely(!(x))) { \
+ gf_assert(); \
gf_msg_callingfn("", GF_LOG_ERROR, 0, LG_MSG_ASSERTION_FAILED, \
- "Assertion failed: " #x); \
+ "Assertion failed: To attach gdb and coredump," \
+ " Run the script under " \
+ "\"glusterfs/extras/debug/gfcore.py\""); \
} \
} while (0)
#endif
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
index 5f18cd56cb..24735079d2 100644
--- a/libglusterfs/src/libglusterfs.sym
+++ b/libglusterfs/src/libglusterfs.sym
@@ -1191,3 +1191,4 @@ gf_latency_new
gf_latency_reset
gf_latency_update
gf_frame_latency_update
+gf_assert