summaryrefslogtreecommitdiffstats
path: root/ldap
diff options
context:
space:
mode:
authorRich Megginson <rmeggins@redhat.com>2009-03-06 20:02:13 +0000
committerRich Megginson <rmeggins@redhat.com>2009-03-06 20:02:13 +0000
commit2a355b73670780a972a70bcd40366ace157e1658 (patch)
tree15922b6de9d9d1d3e638bd6ef86c7791e4beb839 /ldap
parenta0ece5a1605556b7f05dfeea4394e9b5df714b8d (diff)
downloadds-2a355b73670780a972a70bcd40366ace157e1658.tar.gz
ds-2a355b73670780a972a70bcd40366ace157e1658.tar.xz
ds-2a355b73670780a972a70bcd40366ace157e1658.zip
Resolves: bug 488866
Bug Description: crash in reliab15 test Reviewed by: nhosoi (Thanks!) Fix Description: I could not reproduce the crash, but I think the problem is that the server is not handling the disconnection case correctly. It seems that in the event of disconnection (LDAP_SERVER_DOWN 81 - Can't contact server) the code would continue to read results. repl5_inc_result_threadmain() will call conn_read_result_ex() in a loop. If conn_read_result_ex() detects a disconnection or an unrecoverable error, it will call conn_disconnect to close the connection, and return CONN_NOT_CONNECTED. Once this happens, the code must not use conn->ld any more. However, the code did not differentiate between the not connected case and other errors, so it would keep trying to read results (in case some errors are recoverable, the thread still has to read all of the pending results). The code has been fixed to handle disconnect cases specially. I also added some additional locking to make sure the result and the abort flags were set/read correctly. Finally, I changed the code that waits for results to come in, so that if the connection has been closed, it will just return immediately. Platforms tested: RHEL5 x86_64 Flag Day: no Doc impact: no
Diffstat (limited to 'ldap')
-rw-r--r--ldap/servers/plugins/replication/repl5_inc_protocol.c33
1 files changed, 26 insertions, 7 deletions
diff --git a/ldap/servers/plugins/replication/repl5_inc_protocol.c b/ldap/servers/plugins/replication/repl5_inc_protocol.c
index 94ee1c6f..ebf06669 100644
--- a/ldap/servers/plugins/replication/repl5_inc_protocol.c
+++ b/ldap/servers/plugins/replication/repl5_inc_protocol.c
@@ -329,6 +329,7 @@ static void repl5_inc_result_threadmain(void *param)
}
if (conres != CONN_TIMEOUT)
{
+ int return_value;
int should_finish = 0;
if (message_id)
{
@@ -347,17 +348,26 @@ static void repl5_inc_result_threadmain(void *param)
conn_get_error_ex(conn, &operation_code, &connection_error, &ldap_error_string);
slapi_log_error(SLAPI_LOG_REPL, NULL, "repl5_inc_result_threadmain: result %d, %d, %d, %d, %s\n", operation_code,connection_error,conres,message_id,ldap_error_string);
- rd->result = repl5_inc_update_from_op_result(rd->prp, conres, connection_error, csn_str, uniqueid, replica_id, &should_finish, &(rd->num_changes_sent));
- if (rd->result || should_finish)
+ return_value = repl5_inc_update_from_op_result(rd->prp, conres, connection_error, csn_str, uniqueid, replica_id, &should_finish, &(rd->num_changes_sent));
+ if (return_value || should_finish)
{
- slapi_log_error(SLAPI_LOG_REPL, NULL, "repl5_inc_result_threadmain: got op result %d should finish %d\n", rd->result, should_finish);
+ slapi_log_error(SLAPI_LOG_REPL, NULL, "repl5_inc_result_threadmain: got op result %d should finish %d\n", return_value, should_finish);
/* If so then we need to take steps to abort the update process */
PR_Lock(rd->lock);
+ rd->result = return_value;
rd->abort = 1;
PR_Unlock(rd->lock);
/* We also need to log the error, including details stored from when the operation was sent */
/* we cannot finish yet - we still need to waitfor the pending results, then
the main repl code will shut down this thread */
+ /* we can finish if we have disconnected - in that case, there will be nothing
+ to read */
+ if (return_value == UPDATE_CONNECTION_LOST) {
+ finished = 1;
+ }
+ } else {
+ /* old semantics had result set outside of lock */
+ rd->result = return_value;
}
}
/* Should we stop ? */
@@ -470,13 +480,17 @@ repl5_inc_waitfor_async_results(result_data *rd)
/* If so then we're done */
done = 1;
}
+ if (rd->abort && (rd->result == UPDATE_CONNECTION_LOST))
+ {
+ done = 1; /* no connection == no more results */
+ }
PR_Unlock(rd->lock);
/* If not then sleep a bit */
DS_Sleep(PR_SecondsToInterval(1));
loops++;
/* If we sleep forever then we can conclude that something bad happened, and bail... */
/* Arbitrary 30 second delay : basically we should only expect to wait as long as it takes to process a few operations, which should be on the order of a second at most */
- if (loops > 300)
+ if (!done && (loops > 300))
{
/* Log a warning */
slapi_log_error(SLAPI_LOG_FATAL, NULL,
@@ -1551,7 +1565,7 @@ repl5_inc_update_from_op_result(Private_Repl_Protocol *prp, ConnResult replay_cr
{
/* We lost the connection - enter backoff state */
- return_value = UPDATE_TRANSIENT_ERROR;
+ return_value = UPDATE_CONNECTION_LOST;
*finished = 1;
slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
"%s: Consumer failed to replay change (uniqueid %s, CSN %s): "
@@ -1794,7 +1808,7 @@ send_updates(Private_Repl_Protocol *prp, RUV *remote_update_vector, PRUint32 *nu
{
/* We lost the connection - enter backoff state */
- return_value = UPDATE_TRANSIENT_ERROR;
+ return_value = UPDATE_CONNECTION_LOST;
finished = 1;
slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
"%s: Consumer failed to replay change (uniqueid %s, CSN %s): "
@@ -1914,19 +1928,24 @@ send_updates(Private_Repl_Protocol *prp, RUV *remote_update_vector, PRUint32 *nu
return_value = UPDATE_YIELD;
finished = 1;
}
+ PR_Lock(rd->lock);
/* See if the result thread has hit a problem */
if (!finished && rd->abort)
{
return_value = rd->result;
finished = 1;
}
+ PR_Unlock(rd->lock);
} while (!finished);
/* Terminate the results reading thread */
if (!prp->repl50consumer)
{
/* We need to ensure that we wait until all the responses have been recived from our operations */
- repl5_inc_waitfor_async_results(rd);
+ if (return_value != UPDATE_CONNECTION_LOST) {
+ /* if connection was lost/closed, there will be nothing to read */
+ repl5_inc_waitfor_async_results(rd);
+ }
rc = repl5_inc_destroy_async_result_thread(rd);
if (rc) {