summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr
diff options
context:
space:
mode:
authorVikas Gorur <vikas@gluster.com>2010-03-01 17:47:06 +0000
committerAnand V. Avati <avati@dev.gluster.com>2010-03-04 02:04:00 -0800
commit85a1716cd55b002b5d20a7903e28f106ee614814 (patch)
tree757c6364de8e58fd9af7e8ecade3eb18f7466806 /xlators/cluster/afr
parent2970ec54473fd55b27aeeb43a1f6bca879281cf0 (diff)
downloadglusterfs-85a1716cd55b002b5d20a7903e28f106ee614814.tar.gz
glusterfs-85a1716cd55b002b5d20a7903e28f106ee614814.tar.xz
glusterfs-85a1716cd55b002b5d20a7903e28f106ee614814.zip
cluster/afr: Failover readdir calls.
This patch makes the replicate readdir call fail over to the next subvolume if the first call fails. It takes care to ensure that entries are not duplicated. The failover behavior of readdir only comes into effect if the option 'strict-readdir' is on. Signed-off-by: Vikas Gorur <vikas@dev.gluster.com> Signed-off-by: root <root@client02.(none)> Signed-off-by: Anand V. Avati <avati@dev.gluster.com> BUG: 453 (afr_readdir does not fail over) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=453
Diffstat (limited to 'xlators/cluster/afr')
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c302
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.h3
-rw-r--r--xlators/cluster/afr/src/afr.c33
-rw-r--r--xlators/cluster/afr/src/afr.h7
4 files changed, 299 insertions, 46 deletions
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index b48488526e..98cda1e809 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -24,6 +24,7 @@
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
+#include <string.h>
#ifndef _CONFIG_H
#define _CONFIG_H
@@ -219,6 +220,7 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie,
afr_local_t * local = NULL;
int call_count = -1;
+ int ret = 0;
LOCK (&frame->lock);
{
@@ -234,30 +236,33 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie,
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if ((local->op_ret == 0) &&
- !afr_is_opendir_done (this, fd->inode)) {
-
- /*
- * This is the first opendir on this inode. We need
- * to check if the directory's entries are the same
- * on all subvolumes. This is needed in addition
- * to regular entry self-heal because the readdir
- * call is sent only to the first subvolume, and
- * thus files that exist only there will never be healed
- * otherwise (assuming changelog shows no anamolies).
- */
-
- gf_log (this->name, GF_LOG_TRACE,
- "reading contents of directory %s looking for mismatch",
- local->loc.path);
-
- afr_examine_dir (frame, this);
-
- } else {
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd);
+ if (local->op_ret == 0) {
+ ret = afr_fd_ctx_set (this, local->fd);
+
+ if (!afr_is_opendir_done (this, fd->inode)) {
+
+ /*
+ * This is the first opendir on this inode. We need
+ * to check if the directory's entries are the same
+ * on all subvolumes. This is needed in addition
+ * to regular entry self-heal because the readdir
+ * call is sent only to the first subvolume, and
+ * thus files that exist only there will never be healed
+ * otherwise (assuming changelog shows no anamolies).
+ */
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "reading contents of directory %s looking for mismatch",
+ local->loc.path);
+
+ afr_examine_dir (frame, this);
+
+ } else {
+ AFR_STACK_UNWIND (opendir, frame, local->op_ret,
+ local->op_errno, local->fd);
+ }
}
- }
+ }
return 0;
}
@@ -333,6 +338,120 @@ out:
* Applicable to: readdir
*/
+
+struct entry_name {
+ char *name;
+ struct list_head list;
+};
+
+
+static gf_boolean_t
+remembered_name (const char *name, struct list_head *entries)
+{
+ struct entry_name *e;
+ gf_boolean_t ret = _gf_false;
+
+ list_for_each_entry (e, entries, list) {
+ if (!strcmp (name, e->name)) {
+ ret = _gf_true;
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
+
+static void
+afr_remember_entries (gf_dirent_t *entries, fd_t *fd)
+{
+ struct entry_name *n = NULL;
+ gf_dirent_t * entry = NULL;
+
+ int ret = 0;
+
+ uint64_t ctx;
+ afr_fd_ctx_t *fd_ctx;
+
+ ret = fd_ctx_get (fd, THIS, &ctx);
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_DEBUG,
+ "could not get fd ctx for fd=%p", fd);
+ return;
+ }
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ list_for_each_entry (entry, &entries->list, list) {
+ n = CALLOC (1, sizeof (*n));
+ n->name = strdup (entry->d_name);
+ INIT_LIST_HEAD (&n->list);
+
+ list_add (&n->list, &fd_ctx->entries);
+ }
+}
+
+
+static off_t
+afr_filter_entries (gf_dirent_t *entries, fd_t *fd)
+{
+ gf_dirent_t *entry, *tmp;
+ int ret = 0;
+
+ uint64_t ctx;
+ afr_fd_ctx_t *fd_ctx;
+
+ off_t offset;
+
+ ret = fd_ctx_get (fd, THIS, &ctx);
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_DEBUG,
+ "could not get fd ctx for fd=%p", fd);
+ return -1;
+ }
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ list_for_each_entry_safe (entry, tmp, &entries->list, list) {
+ offset = entry->d_off;
+
+ if (remembered_name (entry->d_name, &fd_ctx->entries)) {
+ list_del (&entry->list);
+ FREE (entry);
+ }
+ }
+
+ return offset;
+}
+
+
+static void
+afr_forget_entries (fd_t *fd)
+{
+ struct entry_name *entry, *tmp;
+ int ret = 0;
+
+ uint64_t ctx;
+ afr_fd_ctx_t *fd_ctx;
+
+ ret = fd_ctx_get (fd, THIS, &ctx);
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_DEBUG,
+ "could not get fd ctx for fd=%p", fd);
+ return;
+ }
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) {
+ FREE (entry->name);
+ list_del (&entry->list);
+ FREE (entry);
+ }
+}
+
+
int32_t
afr_readdir_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
@@ -383,11 +502,19 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t ** children = NULL;
ino_t inum = 0;
+ int call_child = 0;
+ int ret = 0;
+
gf_dirent_t * entry = NULL;
gf_dirent_t * tmp = NULL;
int child_index = -1;
+ uint64_t ctx;
+ afr_fd_ctx_t *fd_ctx;
+
+ off_t offset = 0;
+
priv = this->private;
children = priv->children;
@@ -395,23 +522,88 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
child_index = (long) cookie;
- if (op_ret != -1) {
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- inum = afr_itransform (entry->d_ino, priv->child_count,
- child_index);
- entry->d_ino = inum;
- inum = afr_itransform (entry->d_stat.st_ino,
- priv->child_count, child_index);
- entry->d_stat.st_ino = inum;
+ if (priv->strict_readdir) {
+ ret = fd_ctx_get (local->fd, this, &ctx);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "could not get fd ctx for fd=%p", local->fd);
+ op_ret = -1;
+ op_errno = -ret;
+ goto out;
+ }
- if ((local->fd->inode == local->fd->inode->table->root)
- && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
- list_del_init (&entry->list);
- FREE (entry);
- }
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ if (child_went_down (op_ret, op_errno)) {
+ if (all_tried (child_index, priv->child_count)) {
+ goto out;
+ }
+
+ call_child = ++child_index;
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "starting readdir afresh on child %d, offset %"PRId64,
+ call_child, (uint64_t) 0);
+
+ fd_ctx->failed_over = _gf_true;
+
+ STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
+ (void *) (long) call_child,
+ children[call_child],
+ children[call_child]->fops->readdirp, local->fd,
+ local->cont.readdir.size, 0);
+ return 0;
+ }
+ }
+
+ list_for_each_entry_safe (entry, tmp, &entries->list, list) {
+ inum = afr_itransform (entry->d_ino, priv->child_count,
+ child_index);
+ entry->d_ino = inum;
+ inum = afr_itransform (entry->d_stat.st_ino,
+ priv->child_count, child_index);
+ entry->d_stat.st_ino = inum;
+
+ if ((local->fd->inode == local->fd->inode->table->root)
+ && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ list_del_init (&entry->list);
+ FREE (entry);
}
}
+ if (priv->strict_readdir) {
+ if (fd_ctx->failed_over) {
+ if (list_empty (&entries->list)) {
+ goto out;
+ }
+
+ offset = afr_filter_entries (entries, local->fd);
+
+ afr_remember_entries (entries, local->fd);
+
+ if (list_empty (&entries->list)) {
+ /* All the entries we got were duplicate. We
+ shouldn't send an empty list now, because
+ that'll make the application stop reading. So
+ try to get more entries */
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "trying to fetch non-duplicate entries from offset %"PRId64", child %s",
+ offset, children[child_index]->name);
+
+ STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
+ (void *) (long) child_index,
+ children[child_index],
+ children[child_index]->fops->readdirp,
+ local->fd, local->cont.readdir.size, offset);
+ return 0;
+ }
+ } else {
+ afr_remember_entries (entries, local->fd);
+ }
+ }
+
+out:
AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries);
return 0;
@@ -427,6 +619,9 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this,
int call_child = 0;
afr_local_t *local = NULL;
+ uint64_t ctx;
+ afr_fd_ctx_t *fd_ctx;
+
int ret = -1;
int32_t op_ret = -1;
@@ -445,7 +640,7 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this,
op_errno = -ret;
goto out;
}
-
+
frame->local = local;
call_child = afr_first_up_child (priv);
@@ -458,7 +653,29 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this,
local->fd = fd_ref (fd);
local->cont.readdir.size = size;
- local->cont.readdir.offset = offset;
+
+ if (priv->strict_readdir) {
+ ret = fd_ctx_get (fd, this, &ctx);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "could not get fd ctx for fd=%p", fd);
+ op_errno = -ret;
+ goto out;
+ }
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ if (fd_ctx->last_tried != call_child) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "first up child has changed from %d to %d, restarting readdir from offset 0",
+ fd_ctx->last_tried, call_child);
+
+ fd_ctx->failed_over = _gf_true;
+ offset = 0;
+ }
+
+ fd_ctx->last_tried = call_child;
+ }
if (whichop == GF_FOP_READDIR)
STACK_WIND_COOKIE (frame, afr_readdir_cbk,
@@ -545,6 +762,15 @@ out:
int32_t
+afr_releasedir (xlator_t *this, fd_t *fd)
+{
+ afr_forget_entries (fd);
+
+ return 0;
+}
+
+
+int32_t
afr_getdents (call_frame_t *frame, xlator_t *this,
fd_t *fd, size_t size, off_t offset, int32_t flag)
{
diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h
index 98ce1ca168..abde2534de 100644
--- a/xlators/cluster/afr/src/afr-dir-read.h
+++ b/xlators/cluster/afr/src/afr-dir-read.h
@@ -26,8 +26,7 @@ afr_opendir (call_frame_t *frame, xlator_t *this,
loc_t *loc, fd_t *fd);
int32_t
-afr_closedir (call_frame_t *frame, xlator_t *this,
- fd_t *fd);
+afr_releasedir (xlator_t *this, fd_t *fd);
int32_t
afr_readdir (call_frame_t *frame, xlator_t *this,
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 91189267fe..c041adc99e 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -1128,6 +1128,8 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)
if (ret < 0) {
op_ret = ret;
}
+
+ INIT_LIST_HEAD (&fd_ctx->entries);
}
unlock:
UNLOCK (&fd->lock);
@@ -2693,11 +2695,12 @@ init (xlator_t *this)
int ret = -1;
int op_errno = 0;
- char * read_subvol = NULL;
- char * fav_child = NULL;
- char * self_heal = NULL;
- char * algo = NULL;
- char * change_log = NULL;
+ char * read_subvol = NULL;
+ char * fav_child = NULL;
+ char * self_heal = NULL;
+ char * algo = NULL;
+ char * change_log = NULL;
+ char * strict_readdir = NULL;
int32_t background_count = 0;
int32_t lock_server_count = 1;
@@ -2893,6 +2896,20 @@ init (xlator_t *this)
priv->entry_lock_server_count = lock_server_count;
}
+ priv->strict_readdir = _gf_false;
+
+ dict_ret = dict_get_str (this->options, "strict-readdir",
+ &strict_readdir);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (strict_readdir, &priv->strict_readdir);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Invalid 'option strict-readdir %s'. "
+ "Defaulting to strict-readdir as 'off'.",
+ strict_readdir);
+ }
+ }
+
trav = this->children;
while (trav) {
if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
@@ -3037,7 +3054,8 @@ struct xlator_dumpops dumpops = {
struct xlator_cbks cbks = {
- .release = afr_release,
+ .release = afr_release,
+ .releasedir = afr_releasedir,
};
@@ -3090,5 +3108,8 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_INT,
.min = 0
},
+ { .key = {"strict-readdir"},
+ .type = GF_OPTION_TYPE_BOOL,
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 71f4b7e561..a5c75add7b 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -67,6 +67,8 @@ typedef struct _afr_private {
unsigned int metadata_lock_server_count;
unsigned int entry_lock_server_count;
+ gf_boolean_t strict_readdir;
+
unsigned int wait_count; /* # of servers to wait for success */
uint64_t up_count; /* number of CHILD_UPs we have seen */
@@ -327,6 +329,7 @@ typedef struct _afr_local {
size_t size;
off_t offset;
+ gf_boolean_t failed;
int last_tried;
} readdir;
@@ -547,6 +550,10 @@ typedef struct {
int32_t wbflags;
uint64_t up_count; /* number of CHILD_UPs this fd has seen */
uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
+
+ int32_t last_tried;
+ gf_boolean_t failed_over;
+ struct list_head entries; /* needed for readdir failover */
} afr_fd_ctx_t;