diff options
| author | Rich Megginson <rmeggins@redhat.com> | 2009-03-12 02:16:43 +0000 |
|---|---|---|
| committer | Rich Megginson <rmeggins@redhat.com> | 2009-03-12 02:16:43 +0000 |
| commit | 2bc78534fe250dcdf5efd211c5d0904e69287c9b (patch) | |
| tree | edb17775d5d82f260e8348d9ee3dec3da5cf25ec /ldap | |
| parent | 0e53c2ae9d98fb7eef5d3d311b98fc63a437f894 (diff) | |
| download | ds-2bc78534fe250dcdf5efd211c5d0904e69287c9b.tar.gz ds-2bc78534fe250dcdf5efd211c5d0904e69287c9b.tar.xz ds-2bc78534fe250dcdf5efd211c5d0904e69287c9b.zip | |
Resolves: bug 488866
Bug Description: crash in reliab15 test
Reviewed by: nkinder (Thanks!)
Fix Description: There was still a small window of time during which the connection could be closed out from under the other thread which was sending/reading result. The solution is to use explicit locking using the conn->lock to protect access to the conn->ld. Since this also affected the total update code, I tested it under similar conditions, and found that it exhibited the same behavior. I added checking to the total update code to check for disconnection and coordinate access in the entry sending/result reading threads.
I also fixed a spurious error message about the sasl path.
Platforms tested: RHEL5
Flag Day: no
Doc impact: no
Diffstat (limited to 'ldap')
| -rw-r--r-- | ldap/servers/plugins/replication/repl5_connection.c | 51 | ||||
| -rw-r--r-- | ldap/servers/plugins/replication/repl5_tot_protocol.c | 38 | ||||
| -rw-r--r-- | ldap/servers/slapd/util.c | 4 |
3 files changed, 73 insertions, 20 deletions
diff --git a/ldap/servers/plugins/replication/repl5_connection.c b/ldap/servers/plugins/replication/repl5_connection.c index 37bec7ea..5a171cb5 100644 --- a/ldap/servers/plugins/replication/repl5_connection.c +++ b/ldap/servers/plugins/replication/repl5_connection.c @@ -306,12 +306,20 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda while (1) { - if (!conn_connected(conn)) { + /* we have to make sure the update sending thread does not + attempt to call conn_disconnect while we are reading + results - so lock the conn while we get the results */ + PR_Lock(conn->lock); + if ((STATE_CONNECTED != conn->state) || !conn->ld) { rc = -1; return_value = CONN_NOT_CONNECTED; + PR_Unlock(conn->lock); break; } + rc = ldap_result(conn->ld, LDAP_RES_ANY , 1, &local_timeout, &res); + PR_Unlock(conn->lock); + if (0 != rc) { /* Something other than a timeout happened */ @@ -342,6 +350,18 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda repl5_stop_debug_timeout(eqctx, &setlevel); + PR_Lock(conn->lock); + /* we have to check again since the connection may have + been closed in the meantime + acquire the lock for the rest of the function + to protect against another attempt to close + the conn while we are using it + */ + if ((STATE_CONNECTED != conn->state) || !conn->ld) { + rc = -1; + return_value = CONN_NOT_CONNECTED; + } + if (0 == rc) { /* Timeout */ @@ -370,7 +390,7 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda later */ if (IS_DISCONNECT_ERROR(rc)) { - conn_disconnect(conn); + close_connection_internal(conn); /* we already have the lock */ return_value = CONN_NOT_CONNECTED; } else @@ -397,13 +417,13 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda if (IS_DISCONNECT_ERROR(rc)) { conn->last_ldap_error = rc; - conn_disconnect(conn); + close_connection_internal(conn); /* we already have the lock */ return_value = CONN_NOT_CONNECTED; } else if (IS_DISCONNECT_ERROR(err)) { conn->last_ldap_error = err; - conn_disconnect(conn); + close_connection_internal(conn); /* we already have the lock */ return_value = CONN_NOT_CONNECTED; } /* Got a result */ @@ -450,6 +470,7 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda conn->status = STATUS_CONNECTED; } if (res) ldap_msgfree(res); + PR_Unlock(conn->lock); /* release the conn lock */ return return_value; } @@ -565,7 +586,10 @@ perform_operation(Repl_Connection *conn, int optype, const char *dn, server_controls[1] = update_control; server_controls[2] = NULL; - if (conn_connected(conn)) + /* lock the conn to prevent the result reader thread + from closing the connection out from under us */ + PR_Lock(conn->lock); + if (STATE_CONNECTED == conn->state) { int setlevel = 0; @@ -574,6 +598,7 @@ perform_operation(Repl_Connection *conn, int optype, const char *dn, return_value = see_if_write_available( conn, PR_SecondsToInterval(conn->timeout.tv_sec)); if (return_value != CONN_OPERATION_SUCCESS) { + PR_Unlock(conn->lock); return return_value; } conn->last_operation = optype; @@ -627,7 +652,7 @@ perform_operation(Repl_Connection *conn, int optype, const char *dn, conn->last_ldap_error = rc; if (IS_DISCONNECT_ERROR(rc)) { - conn_disconnect(conn); + close_connection_internal(conn); /* already have the lock */ return_value = CONN_NOT_CONNECTED; } else @@ -640,11 +665,12 @@ perform_operation(Repl_Connection *conn, int optype, const char *dn, else { /* conn->last_ldap_error has been set to a more specific value - * in conn_connected() + * in the thread that did the disconnection * conn->last_ldap_error = LDAP_SERVER_DOWN; */ return_value = CONN_NOT_CONNECTED; } + PR_Unlock(conn->lock); /* release the lock */ if (message_id) { *message_id = msgid; @@ -1073,6 +1099,13 @@ conn_connect(Repl_Connection *conn) static void close_connection_internal(Repl_Connection *conn) { + conn->state = STATE_DISCONNECTED; + conn->status = STATUS_DISCONNECTED; + conn->supports_ds50_repl = -1; + conn->supports_ds71_repl = -1; + /* do this last, to minimize the chance that another thread + might read conn->state as not disconnected and attempt + to use conn->ld */ if (NULL != conn->ld) { /* Since we call slapi_ldap_init, @@ -1080,10 +1113,6 @@ close_connection_internal(Repl_Connection *conn) slapi_ldap_unbind(conn->ld); } conn->ld = NULL; - conn->state = STATE_DISCONNECTED; - conn->status = STATUS_DISCONNECTED; - conn->supports_ds50_repl = -1; - conn->supports_ds71_repl = -1; slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, "%s: Disconnected from the consumer\n", agmt_get_long_name(conn->agmt)); } diff --git a/ldap/servers/plugins/replication/repl5_tot_protocol.c b/ldap/servers/plugins/replication/repl5_tot_protocol.c index 7e5a3f8e..11c82553 100644 --- a/ldap/servers/plugins/replication/repl5_tot_protocol.c +++ b/ldap/servers/plugins/replication/repl5_tot_protocol.c @@ -95,6 +95,7 @@ static void get_result (int rc, void *cb_data); static int send_entry (Slapi_Entry *e, void *callback_data); static void repl5_tot_delete(Private_Repl_Protocol **prp); +#define LOST_CONN_ERR(xx) ((xx == -2) || (xx == LDAP_SERVER_DOWN) || (xx == LDAP_CONNECT_ERROR)) /* * Notes on the async version of this code: * First, we need to have the supplier and consumer both be async-capable. @@ -120,9 +121,9 @@ static void repl5_tot_log_operation_failure(int ldap_error, char* ldap_error_string, const char *agreement_name) { slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name, - "%s: Received error %d: %s for total update operation\n", + "%s: Received error %d (%s): %s for total update operation\n", agreement_name, - ldap_error, ldap_error_string ? ldap_error_string : "NULL"); + ldap_error, ldap_err2string(ldap_error), ldap_error_string ? ldap_error_string : ""); } /* Thread that collects results from async operations sent to the consumer */ @@ -206,7 +207,9 @@ static void repl5_tot_result_threadmain(void *param) } /* Should we stop ? */ PR_Lock(cb->lock); - if (cb->stop_result_thread) + /* if the connection is not connected, then we cannot read any more + results - we are finished */ + if (cb->stop_result_thread || (conres == CONN_NOT_CONNECTED)) { finished = 1; } @@ -290,13 +293,17 @@ repl5_tot_waitfor_async_results(callback_data *cb_data) /* If so then we're done */ done = 1; } + if (cb_data->abort && LOST_CONN_ERR(cb_data->rc)) + { + done = 1; /* no connection == no more results */ + } PR_Unlock(cb_data->lock); /* If not then sleep a bit */ DS_Sleep(PR_SecondsToInterval(1)); loops++; /* If we sleep forever then we can conclude that something bad happened, and bail... */ /* Arbitrary 30 second delay : basically we should only expect to wait as long as it takes to process a few operations, which should be on the order of a second at most */ - if (loops > 300) + if (!done && (loops > 300)) { /* Log a warning */ slapi_log_error(SLAPI_LOG_FATAL, NULL, @@ -618,6 +625,18 @@ int send_entry (Slapi_Entry *e, void *cb_data) return -1; } + /* see if the result reader thread encountered + a fatal error */ + PR_Lock(((callback_data*)cb_data)->lock); + rc = ((callback_data*)cb_data)->abort; + PR_Unlock(((callback_data*)cb_data)->lock); + if (rc) + { + conn_disconnect(prp->conn); + prp->stopped = 1; + ((callback_data*)cb_data)->rc = -1; + return -1; + } /* skip ruv tombstone - need to do this because it might be more up to date then the data we are sending to the client. RUV is sent separately via the protocol */ @@ -702,9 +721,14 @@ int send_entry (Slapi_Entry *e, void *cb_data) ber_bvfree(bv); (*num_entriesp)++; - /* For async operation we need to inspect the abort status from the result thread here */ - - if (CONN_OPERATION_SUCCESS == rc) { + /* if the connection has been closed, we need to stop + sending entries and set a special rc value to let + the result reading thread know the connection has been + closed - do not attempt to read any more results */ + if (CONN_NOT_CONNECTED == rc) { + ((callback_data*)cb_data)->rc = -2; + retval = -1; + } else if (CONN_OPERATION_SUCCESS == rc) { retval = 0; } else { ((callback_data*)cb_data)->rc = rc; diff --git a/ldap/servers/slapd/util.c b/ldap/servers/slapd/util.c index 4fcae3ba..2aff1e30 100644 --- a/ldap/servers/slapd/util.c +++ b/ldap/servers/slapd/util.c @@ -959,7 +959,7 @@ slapi_ldap_init_ext( char *pp = NULL; if (NULL == pluginpath || (*pluginpath == '\0')) { - slapi_log_error(SLAPI_LOG_FATAL, "slapi_ldap_init_ext", + slapi_log_error(SLAPI_LOG_SHELL, "slapi_ldap_init_ext", "configpluginpath == NULL\n"); if (!(pluginpath = getenv("SASL_PATH"))) { #if defined(LINUX) && defined(__LP64__) @@ -974,7 +974,7 @@ slapi_ldap_init_ext( (0 != strcmp(++pp, pluginpath)) /* sasl_path has been updated */ ) { PR_snprintf(util_sasl_path, sizeof(util_sasl_path), "SASL_PATH=%s", pluginpath); - slapi_log_error(SLAPI_LOG_FATAL, "slapi_ldap_init_ext", + slapi_log_error(SLAPI_LOG_SHELL, "slapi_ldap_init_ext", "putenv(%s)\n", util_sasl_path); putenv(util_sasl_path); } |
