summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ctdb/client/ctdb_client.c21
-rw-r--r--ctdb/common/ctdb_util.c16
-rw-r--r--ctdb/include/ctdb_private.h8
-rw-r--r--ctdb/server/ctdb_control.c4
-rw-r--r--ctdb/server/ctdb_recover.c6
-rw-r--r--ctdb/server/ctdb_recoverd.c5
-rw-r--r--ctdb/server/ctdb_tunables.c1
-rw-r--r--ctdb/tools/ctdb.c3
8 files changed, 64 insertions, 0 deletions
diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 3bdb4b2617d..fa6a990a4d9 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -3615,3 +3615,24 @@ int ctdb_ctrl_getscriptstatus(struct ctdb_context *ctdb,
return 0;
}
+/*
+ tell the main daemon how long it took to lock the reclock file
+ */
+int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency)
+{
+ int ret;
+ int32_t res;
+ TDB_DATA data;
+
+ data.dptr = (uint8_t *)&latency;
+ data.dsize = sizeof(latency);
+
+ ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_RECLOCK_LATENCY, 0, data,
+ ctdb, NULL, &res, NULL, NULL);
+ if (ret != 0 || res != 0) {
+ DEBUG(DEBUG_ERR,("Failed to send recd reclock latency\n"));
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/ctdb/common/ctdb_util.c b/ctdb/common/ctdb_util.c
index a3f1731fa13..3604cc8ed4b 100644
--- a/ctdb/common/ctdb_util.c
+++ b/ctdb/common/ctdb_util.c
@@ -138,6 +138,22 @@ void ctdb_latency(struct ctdb_db_context *ctdb_db, const char *name, double *lat
}
}
+/*
+ update a reclock latency number
+ */
+void ctdb_reclock_latency(struct ctdb_context *ctdb, const char *name, double *latency, double l)
+{
+ if (l > *latency) {
+ *latency = l;
+ }
+
+ if (ctdb->tunable.reclock_latency_ms !=0) {
+ if (l*1000 > ctdb->tunable.reclock_latency_ms) {
+ DEBUG(DEBUG_ERR, ("High RECLOCK latency %fs for operation %s\n", l, name));
+ }
+ }
+}
+
uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state)
{
uint32_t id;
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index f37560abe06..5ecf8a6494f 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -105,6 +105,7 @@ struct ctdb_tunable {
uint32_t recd_ping_timeout;
uint32_t recd_ping_failcount;
uint32_t log_latency_ms;
+ uint32_t reclock_latency_ms;
uint32_t recovery_drop_all_ips;
uint32_t verify_recovery_lock;
};
@@ -299,6 +300,10 @@ struct ctdb_statistics {
uint32_t control;
uint32_t traverse;
} timeouts;
+ struct {
+ double ctdbd;
+ double recd;
+ } reclock;
uint32_t total_calls;
uint32_t pending_calls;
uint32_t lockwait_calls;
@@ -562,6 +567,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0,
CTDB_CONTROL_EVENT_SCRIPT_FINISHED = 95,
CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS = 96,
CTDB_CONTROL_TRAVERSE_KILL = 97,
+ CTDB_CONTROL_RECD_RECLOCK_LATENCY = 98,
};
/*
@@ -954,6 +960,7 @@ void ctdb_recv_raw_pkt(void *p, uint8_t *data, uint32_t length);
int ctdb_socket_connect(struct ctdb_context *ctdb);
void ctdb_latency(struct ctdb_db_context *ctdb_db, const char *name, double *latency, struct timeval t);
+void ctdb_reclock_latency(struct ctdb_context *ctdb, const char *name, double *latency, double l);
uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state);
void *_ctdb_reqid_find(struct ctdb_context *ctdb, uint32_t reqid, const char *type, const char *location);
@@ -1431,5 +1438,6 @@ int32_t ctdb_control_event_script_finished(struct ctdb_context *ctdb);
int32_t ctdb_control_get_event_script_status(struct ctdb_context *ctdb, TDB_DATA *outdata);
int ctdb_log_event_script_output(struct ctdb_context *ctdb, char *str, uint16_t len);
+int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency);
#endif
diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c
index d5d4766490d..72e60c526b9 100644
--- a/ctdb/server/ctdb_control.c
+++ b/ctdb/server/ctdb_control.c
@@ -441,6 +441,10 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
CHECK_CONTROL_DATA_SIZE(0);
return ctdb_control_get_event_script_status(ctdb, outdata);
+ case CTDB_CONTROL_RECD_RECLOCK_LATENCY:
+ CHECK_CONTROL_DATA_SIZE(sizeof(double));
+ ctdb_reclock_latency(ctdb, "recd reclock", &ctdb->statistics.reclock.recd, *((double *)indata.dptr));
+ return 0;
default:
DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
return -1;
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
index c979398f30a..b4428fa4946 100644
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@@ -492,6 +492,7 @@ struct ctdb_set_recmode_state {
struct timed_event *te;
struct fd_event *fde;
pid_t child;
+ struct timeval start_time;
};
/*
@@ -520,6 +521,10 @@ static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_even
*/
static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
{
+ double l = timeval_elapsed(&state->start_time);
+
+ ctdb_reclock_latency(state->ctdb, "daemon reclock", &state->ctdb->statistics.reclock.ctdbd, l);
+
kill(state->child, SIGKILL);
return 0;
}
@@ -632,6 +637,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
state = talloc(ctdb, struct ctdb_set_recmode_state);
CTDB_NO_MEMORY(ctdb, state);
+ state->start_time = timeval_current();
if (ctdb->tunable.verify_recovery_lock == 0) {
/* dont need to verify the reclock file */
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index c6d0a7a5f4a..995284fbb22 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1328,6 +1328,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
struct ctdb_dbid_map *dbmap;
TDB_DATA data;
uint32_t *nodes;
+ struct timeval start_time;
DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
@@ -1351,11 +1352,15 @@ static int do_recovery(struct ctdb_recoverd *rec,
ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
}
+ DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
+ start_time = timeval_current();
if (!ctdb_recovery_lock(ctdb, true)) {
ctdb_set_culprit(rec, pnn);
DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
return -1;
}
+ ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
+ DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c
index bab9aef25a8..6a8876fc2b7 100644
--- a/ctdb/server/ctdb_tunables.c
+++ b/ctdb/server/ctdb_tunables.c
@@ -53,6 +53,7 @@ static const struct {
{ "RecdPingTimeout", 20, offsetof(struct ctdb_tunable, recd_ping_timeout) },
{ "RecdFailCount", 3, offsetof(struct ctdb_tunable, recd_ping_failcount) },
{ "LogLatencyMs", 0, offsetof(struct ctdb_tunable, log_latency_ms) },
+ { "RecLockLatencyMs", 1000, offsetof(struct ctdb_tunable, reclock_latency_ms) },
{ "RecoveryDropAllIPs", 60, offsetof(struct ctdb_tunable, recovery_drop_all_ips) },
{ "VerifyRecoveryLock", 1, offsetof(struct ctdb_tunable, verify_recovery_lock) },
};
diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c
index 35a4190eb73..b85354c0019 100644
--- a/ctdb/tools/ctdb.c
+++ b/ctdb/tools/ctdb.c
@@ -209,6 +209,9 @@ static void show_statistics(struct ctdb_statistics *s)
preflen?0:4, "",
*(uint32_t *)(fields[i].offset+(uint8_t *)s));
}
+ printf(" %-30s %.6f sec\n", "max_reclock_ctdbd", s->reclock.ctdbd);
+ printf(" %-30s %.6f sec\n", "max_reclock_recd", s->reclock.recd);
+
printf(" %-30s %.6f sec\n", "max_call_latency", s->max_call_latency);
printf(" %-30s %.6f sec\n", "max_lockwait_latency", s->max_lockwait_latency);
printf(" %-30s %.6f sec\n", "max_childwrite_latency", s->max_childwrite_latency);