summaryrefslogtreecommitdiffstats
path: root/ctdb
diff options
context:
space:
mode:
authorRonnie Sahlberg <ronniesahlberg@gmail.com>2008-05-06 13:27:17 +1000
committerRonnie Sahlberg <ronniesahlberg@gmail.com>2008-05-06 13:27:17 +1000
commit6863c8f5736f0910d69578ff139dd580ac8c69a1 (patch)
tree2dc745e47292af162667c0f1fac9462cd56df7e3 /ctdb
parent80f85dc390285725fd0019b70b75466d03f44fce (diff)
close and reopen the reclock pnn file at regular intervals.
handle failure to get/hold the reclock pnn file better and just treat it as a transient backend filesystem error and try again later instead of shutting down the recovery daemon when we have lost the pnn file and if we are recmaster release the recmaster role so that someone else can become recmaster isntead (This used to be ctdb commit e513277fb09b951427be8351d04c877e0a15359d)
Diffstat (limited to 'ctdb')
-rw-r--r--ctdb/server/ctdb_recoverd.c25
1 files changed, 20 insertions, 5 deletions
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 1a53bb8334..3617efd4e2 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -2017,8 +2017,15 @@ ctdb_recoverd_write_pnn_connect_count(struct ctdb_recoverd *rec)
const char count = rec->num_connected;
struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
+ if (rec->rec_file_fd == -1) {
+ DEBUG(DEBUG_CRIT,(__location__ " Unable to write pnn count. pnnfile is not open.\n"));
+ return;
+ }
+
if (pwrite(rec->rec_file_fd, &count, 1, ctdb->pnn) == -1) {
DEBUG(DEBUG_CRIT, (__location__ " Failed to write pnn count\n"));
+ close(rec->rec_file_fd);
+ rec->rec_file_fd = -1;
}
}
@@ -2038,8 +2045,8 @@ ctdb_recoverd_get_pnn_lock(struct ctdb_recoverd *rec)
DEBUG(DEBUG_INFO, ("Setting PNN lock for pnn:%d\n", ctdb->pnn));
if (rec->rec_file_fd != -1) {
- DEBUG(DEBUG_CRIT, (__location__ " rec_lock_fd is already open. Aborting\n"));
- exit(10);
+ close(rec->rec_file_fd);
+ rec->rec_file_fd = -1;
}
pnnfile = talloc_asprintf(rec, "%s.pnn", ctdb->recovery_lock_file);
@@ -2049,7 +2056,8 @@ ctdb_recoverd_get_pnn_lock(struct ctdb_recoverd *rec)
if (rec->rec_file_fd == -1) {
DEBUG(DEBUG_CRIT,(__location__ " Unable to open %s - (%s)\n",
pnnfile, strerror(errno)));
- exit(10);
+ talloc_free(pnnfile);
+ return;
}
set_close_on_exec(rec->rec_file_fd);
@@ -2063,12 +2071,12 @@ ctdb_recoverd_get_pnn_lock(struct ctdb_recoverd *rec)
close(rec->rec_file_fd);
rec->rec_file_fd = -1;
DEBUG(DEBUG_CRIT,(__location__ " Failed to get pnn lock on '%s'\n", pnnfile));
- exit(10);
+ talloc_free(pnnfile);
+ return;
}
DEBUG(DEBUG_NOTICE,(__location__ " Got pnn lock on '%s'\n", pnnfile));
-
talloc_free(pnnfile);
/* we start out with 0 connected nodes */
@@ -2086,6 +2094,9 @@ static void ctdb_update_pnn_count(struct event_context *ev, struct timed_event *
struct ctdb_context *ctdb = rec->ctdb;
struct ctdb_node_map *nodemap = rec->nodemap;
+ /* close and reopen the pnn lock file */
+ ctdb_recoverd_get_pnn_lock(rec);
+
ctdb_recoverd_write_pnn_connect_count(rec);
event_add_timed(rec->ctdb->ev, rec->ctdb,
@@ -2108,6 +2119,10 @@ static void ctdb_update_pnn_count(struct event_context *ev, struct timed_event *
return;
}
if (ctdb->recovery_lock_fd == -1) {
+ DEBUG(DEBUG_ERR, (__location__ " Lost reclock pnn file. Yielding recmaster role\n"));
+ close(ctdb->recovery_lock_fd);
+ ctdb->recovery_lock_fd = -1;
+ force_election(rec, ctdb->pnn, rec->nodemap);
return;
}
for (i=0; i<nodemap->num; i++) {