diff options
Diffstat (limited to 'ctdb/server/ctdb_ltdb_server.c')
-rw-r--r-- | ctdb/server/ctdb_ltdb_server.c | 442 |
1 files changed, 438 insertions, 4 deletions
diff --git a/ctdb/server/ctdb_ltdb_server.c b/ctdb/server/ctdb_ltdb_server.c
index 9a4044e0ca..b966386b13 100644
--- a/ctdb/server/ctdb_ltdb_server.c
+++ b/ctdb/server/ctdb_ltdb_server.c
@@ -23,6 +23,7 @@
 #include "system/network.h"
 #include "system/filesys.h"
 #include "system/dir.h"
+#include "system/time.h"
 #include "../include/ctdb_private.h"
 #include "db_wrap.h"
 #include "lib/util/dlinklist.h"
@@ -190,6 +191,303 @@ static void ctdb_check_db_empty(struct ctdb_db_context *ctdb_db)
 	}
 }
 
+/*
+  load the unhealthy reason of a persistent database from the
+  persistent health tdb into ctdb_db->unhealthy_reason
+
+  The record is keyed by the database name. A missing or empty record
+  leaves unhealthy_reason as NULL, which means healthy.
+
+  returns 0 on success; if copying the reason fails, the previous
+  unhealthy_reason is put back and -1 is returned
+ */
+int ctdb_load_persistent_health(struct ctdb_context *ctdb,
+				struct ctdb_db_context *ctdb_db)
+{
+	struct tdb_context *tdb = ctdb->db_persistent_health->tdb;
+	char *old;
+	char *reason = NULL;
+	TDB_DATA key;
+	TDB_DATA val;
+
+	key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+	key.dsize = strlen(ctdb_db->db_name);
+
+	old = ctdb_db->unhealthy_reason;
+	ctdb_db->unhealthy_reason = NULL;
+
+	val = tdb_fetch(tdb, key);
+	if (val.dsize > 0) {
+		/* the stored value carries no NUL terminator, copy by length */
+		reason = talloc_strndup(ctdb_db,
+					(const char *)val.dptr,
+					val.dsize);
+		if (reason == NULL) {
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_strndup(%d) failed\n",
+					   (int)val.dsize));
+			ctdb_db->unhealthy_reason = old;
+			free(val.dptr);
+			return -1;
+		}
+	}
+
+	/* free(NULL) is a no-op, so no need to test val.dptr first */
+	free(val.dptr);
+
+	talloc_free(old);
+	ctdb_db->unhealthy_reason = reason;
+	return 0;
+}
+
+/*
+  store the health state of a persistent database in the persistent
+  health tdb and mirror it in ctdb_db->unhealthy_reason
+
+  given_reason == NULL means healthy: an existing record is deleted,
+  unless num_healthy_nodes is 0 and the db is already marked unhealthy;
+  then the old reason is kept, prefixed with "NO-HEALTHY-NODES - ".
+
+  The update runs in a tdb transaction, which is cancelled on every
+  error path before the commit.
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_update_persistent_health(struct ctdb_context *ctdb,
+				  struct ctdb_db_context *ctdb_db,
+				  const char *given_reason,/* NULL means healthy */
+				  int num_healthy_nodes)
+{
+	static const char no_healthy_prefix[] = "NO-HEALTHY-NODES - ";
+	struct tdb_context *tdb = ctdb->db_persistent_health->tdb;
+	int ret;
+	TDB_DATA key;
+	TDB_DATA val;
+	char *new_reason = NULL;
+	char *old_reason = NULL;
+
+	ret = tdb_transaction_start(tdb);
+	if (ret != 0) {
+		DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_start('%s') failed: %d - %s\n",
+				   tdb_name(tdb), ret, tdb_errorstr(tdb)));
+		return -1;
+	}
+
+	ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+	if (ret != 0) {
+		DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+				   ctdb_db->db_name, ret));
+		goto fail;
+	}
+	old_reason = ctdb_db->unhealthy_reason;
+
+	key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+	key.dsize = strlen(ctdb_db->db_name);
+
+	if (given_reason) {
+		new_reason = talloc_strdup(ctdb_db, given_reason);
+		if (new_reason == NULL) {
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_strdup(%s) failed\n",
+					  given_reason));
+			goto fail;
+		}
+	} else if (old_reason && num_healthy_nodes == 0) {
+		/*
+		 * The caller reports the db as ok, but there were no healthy
+		 * nodes available, which means we have not recovered valid
+		 * content of the db. So if there's an old reason, prefix it
+		 * with "NO-HEALTHY-NODES - " (unless it already has it).
+		 */
+		const char *prefix;
+
+		ret = strncmp(no_healthy_prefix, old_reason,
+			      strlen(no_healthy_prefix));
+		if (ret != 0) {
+			prefix = no_healthy_prefix;
+		} else {
+			prefix = "";
+		}
+		new_reason = talloc_asprintf(ctdb_db, "%s%s",
+					 prefix, old_reason);
+		if (new_reason == NULL) {
+			DEBUG(DEBUG_ALERT,(__location__ " talloc_asprintf(%s%s) failed\n",
+					  prefix, old_reason));
+			goto fail;
+		}
+	}
+
+	if (new_reason) {
+		val.dptr = discard_const_p(uint8_t, new_reason);
+		val.dsize = strlen(new_reason);
+
+		ret = tdb_store(tdb, key, val, TDB_REPLACE);
+		if (ret != 0) {
+			/* log first, so tdb_errorstr() still describes the store */
+			DEBUG(DEBUG_ALERT,(__location__ " tdb_store('%s', %s, %s) failed: %d - %s\n",
+					   tdb_name(tdb), ctdb_db->db_name, new_reason,
+					   ret, tdb_errorstr(tdb)));
+			goto fail;
+		}
+		DEBUG(DEBUG_ALERT,("Updated db health for db(%s) to: %s\n",
+				   ctdb_db->db_name, new_reason));
+	} else if (old_reason) {
+		ret = tdb_delete(tdb, key);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__ " tdb_delete('%s', %s) failed: %d - %s\n",
+					   tdb_name(tdb), ctdb_db->db_name,
+					   ret, tdb_errorstr(tdb)));
+			goto fail;
+		}
+		DEBUG(DEBUG_NOTICE,("Updated db health for db(%s): OK\n",
+				   ctdb_db->db_name));
+	}
+
+	ret = tdb_transaction_commit(tdb);
+	if (ret != TDB_SUCCESS) {
+		/*
+		 * NOTE(review): no cancel here, as before - a failed commit
+		 * is presumed to have ended the transaction; confirm in tdb
+		 */
+		DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_commit('%s') failed: %d - %s\n",
+				   tdb_name(tdb), ret, tdb_errorstr(tdb)));
+		talloc_free(new_reason);
+		return -1;
+	}
+
+	talloc_free(old_reason);
+	ctdb_db->unhealthy_reason = new_reason;
+
+	return 0;
+
+fail:
+	/* never leave the transaction open on an error path */
+	tdb_transaction_cancel(tdb);
+	talloc_free(new_reason);
+	return -1;
+}
+
+/*
+  move a corrupted persistent tdb out of the way
+
+  The db is first marked unhealthy in the persistent health tdb (the
+  reason names the backup file), then the tdb file is renamed to
+  db_path.corrupted.YYYYMMDDhhmmss.0Z, using the current UTC time.
+
+  returns 0 on success, -1 on failure
+ */
+static int ctdb_backup_corrupted_tdb(struct ctdb_context *ctdb,
+				     struct ctdb_db_context *ctdb_db)
+{
+	time_t now = time(NULL);
+	char *new_path;
+	char *new_reason;
+	int ret;
+	int result = -1;
+	struct tm *tm;
+
+	tm = gmtime(&now);
+	if (tm == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " gmtime() failed\n"));
+		return -1;
+	}
+
+	/* formatted like: foo.tdb.0.corrupted.20091204160825.0Z */
+	new_path = talloc_asprintf(ctdb_db, "%s.corrupted."
+				   "%04d%02d%02d%02d%02d%02d.0Z",
+				   ctdb_db->db_path,
+				   tm->tm_year+1900, tm->tm_mon+1,
+				   tm->tm_mday, tm->tm_hour, tm->tm_min,
+				   tm->tm_sec);
+	if (new_path == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		return -1;
+	}
+
+	new_reason = talloc_asprintf(ctdb_db,
+				     "ERROR - Backup of corrupted TDB in '%s'",
+				     new_path);
+	if (new_reason == NULL) {
+		DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+		goto done;
+	}
+	ret = ctdb_update_persistent_health(ctdb, ctdb_db, new_reason, 0);
+	talloc_free(new_reason);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,(__location__
+				 ": ctdb_backup_corrupted_tdb(%s) failed to update persistent health\n",
+				 ctdb_db->db_path));
+		goto done;
+	}
+
+	ret = rename(ctdb_db->db_path, new_path);
+	if (ret != 0) {
+		DEBUG(DEBUG_CRIT,(__location__
+				  ": ctdb_backup_corrupted_tdb(%s) rename to %s failed: %d - %s\n",
+				  ctdb_db->db_path, new_path,
+				  errno, strerror(errno)));
+		goto done;
+	}
+
+	DEBUG(DEBUG_CRIT,(__location__
+			 ": ctdb_backup_corrupted_tdb(%s) renamed to %s\n",
+			 ctdb_db->db_path, new_path));
+	result = 0;
+
+done:
+	/* new_path is only needed inside this function, free it on every path */
+	talloc_free(new_path);
+	return result;
+}
+
+/*
+  reload the health state of all persistent databases
+
+  returns 0 if every persistent db is healthy, -1 if at least one is
+  marked unhealthy or its health record could not be loaded
+ */
+int ctdb_recheck_persistent_health(struct ctdb_context *ctdb)
+{
+	struct ctdb_db_context *ctdb_db;
+	int ret;
+	int ok = 0;
+	int fail = 0;
+
+	for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
+		if (!ctdb_db->persistent) {
+			continue;
+		}
+
+		ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__
+					   " load persistent health for '%s' failed\n",
+					   ctdb_db->db_path));
+			return -1;
+		}
+
+		if (ctdb_db->unhealthy_reason == NULL) {
+			ok++;
+			DEBUG(DEBUG_INFO,(__location__
+					  " persistent db '%s' healthy\n",
+					  ctdb_db->db_path));
+			continue;
+		}
+
+		fail++;
+		DEBUG(DEBUG_ALERT,(__location__
+				   " persistent db '%s' unhealthy: %s\n",
+				   ctdb_db->db_path,
+				   ctdb_db->unhealthy_reason));
+	}
+	DEBUG((fail!=0)?DEBUG_ALERT:DEBUG_NOTICE,
+	      ("ctdb_recheck_persistent_health: OK[%d] FAIL[%d]\n",
+	       ok, fail));
+
+	if (fail != 0) {
+		return -1;
+	}
+
+	return 0;
+}
 
 /*
   attach to a database, handling both persistent and non-persistent databases
@@ -202,6 +500,8 @@ static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
 	int ret;
 	struct TDB_DATA key;
 	unsigned tdb_flags;
+	int mode = 0600;
+	int remaining_tries = 0;
 
 	ctdb_db = talloc_zero(ctdb, struct ctdb_db_context);
 	CTDB_NO_MEMORY(ctdb, ctdb_db);
@@ -226,6 +526,47 @@ static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
 		}
 	}
 
+	if (persistent) {
+		if (unhealthy_reason) {
+			ret = ctdb_update_persistent_health(ctdb, ctdb_db,
+							    unhealthy_reason, 0);
+			if (ret != 0) {
+				DEBUG(DEBUG_ALERT,(__location__ " ctdb_update_persistent_health('%s','%s') failed: %d\n",
+						   ctdb_db->db_name, unhealthy_reason, ret));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+		}
+
+		if (ctdb->max_persistent_check_errors > 0) {
+			remaining_tries = 1;
+		}
+		if (ctdb->done_startup) {
+			remaining_tries = 0;
+		}
+
+		ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+				   ctdb_db->db_name, ret));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+	}
+
+	if (ctdb_db->unhealthy_reason && remaining_tries == 0) {
+		DEBUG(DEBUG_ALERT,(__location__ "ERROR: tdb %s is marked as unhealthy: %s\n",
+				   ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		talloc_free(ctdb_db);
+		return -1;
+	}
+
+	if (ctdb_db->unhealthy_reason) {
+		/* this is just a warning, but we want that in the log file! */
+		DEBUG(DEBUG_ALERT,(__location__ "Warning: tdb %s is marked as unhealthy: %s\n",
+				   ctdb_db->db_name, ctdb_db->unhealthy_reason));
+	}
+
 	/* open the database */
 	ctdb_db->db_path = talloc_asprintf(ctdb_db, "%s/%s.%u",
 					   persistent?ctdb->db_directory_persistent:ctdb->db_directory,
@@ -237,18 +578,105 @@ static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
 	}
 	tdb_flags |= TDB_DISALLOW_NESTING;
 
+again:
 	ctdb_db->ltdb = tdb_wrap_open(ctdb, ctdb_db->db_path,
 				      ctdb->tunable.database_hash_size,
 				      tdb_flags,
-				      O_CREAT|O_RDWR, 0600);
+				      O_CREAT|O_RDWR, mode);
 	if (ctdb_db->ltdb == NULL) {
-		DEBUG(DEBUG_CRIT,("Failed to open tdb '%s'\n", ctdb_db->db_path));
-		talloc_free(ctdb_db);
-		return -1;
+		struct stat st;
+		int saved_errno = errno;
+
+		if (!persistent) {
+			DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		if (remaining_tries == 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		ret = stat(ctdb_db->db_path, &st);
+		if (ret != 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+		if (ret != 0) {
+			DEBUG(DEBUG_CRIT,(__location__
+					  "Failed to open persistent tdb '%s': %d - %s\n",
+					  ctdb_db->db_path,
+					  saved_errno,
+					  strerror(saved_errno)));
+			talloc_free(ctdb_db);
+			return -1;
+		}
+
+		remaining_tries--;
+		mode = st.st_mode;
+		goto again;
 	}
 
 	if (!persistent) {
 		ctdb_check_db_empty(ctdb_db);
+	} else {
+		ret = tdb_check(ctdb_db->ltdb->tdb, NULL, NULL);
+		if (ret != 0) {
+			int fd;
+			struct stat st;
+
+			DEBUG(DEBUG_CRIT,("tdb_check(%s) failed: %d - %s\n",
+					  ctdb_db->db_path, ret,
+					  tdb_errorstr(ctdb_db->ltdb->tdb)));
+			if (remaining_tries == 0) {
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			fd = tdb_fd(ctdb_db->ltdb->tdb);
+			ret = fstat(fd, &st);
+			if (ret != 0) {
+				DEBUG(DEBUG_CRIT,(__location__
+						  "Failed to fstat() persistent tdb '%s': %d - %s\n",
+						  ctdb_db->db_path,
+						  errno,
+						  strerror(errno)));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			/* close the TDB */
+			talloc_free(ctdb_db->ltdb);
+			ctdb_db->ltdb = NULL;
+
+			ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+			if (ret != 0) {
+				DEBUG(DEBUG_CRIT,("Failed to backup corrupted tdb '%s'\n",
+						  ctdb_db->db_path));
+				talloc_free(ctdb_db);
+				return -1;
+			}
+
+			remaining_tries--;
+			mode = st.st_mode;
+			goto again;
+		}
 	}
 
 	DLIST_ADD(ctdb->db_list, ctdb_db);
@@ -587,6 +1015,12 @@ int32_t ctdb_ltdb_update_seqnum(struct ctdb_context *ctdb, uint32_t db_id, uint3
 		return -1;
 	}
 
+	if (ctdb_db->unhealthy_reason) {
+		DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_ltdb_update_seqnum: %s\n",
+				 ctdb_db->db_name, ctdb_db->unhealthy_reason));
+		return -1;
+	}
+
 	tdb_increment_seqnum_nonblock(ctdb_db->ltdb->tdb);
 	ctdb_db->seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb);
 	return 0;