From b1f53372c08a6eb6111a6f42d3b44ca9abba5508 Mon Sep 17 00:00:00 2001 From: William Brown Date: Fri, 23 Jun 2017 11:06:00 +1000 Subject: [PATCH] Ticket 49290 - improve idl handling in complex searches Bug Description: Previously we performed iterative set manipulations during searches. For example, (&(a)(b)(c)(d)) would result in: t1 = intersect(a, b) t2 = intersect(t1, c) t3 = intersect(t2, d) This is similar for or with union. The issue is that if the candidate sets were large (until d), then we would allocate and duplicate a large set of ID's repeatedly, until the set was reduced in the third step. The same is true for unions, but the set would continue to grow. As well due to the sorted nature of the IDList, we would perform an O(m + n) merge of the arrays, potentially many times. This would be amplified the more terms added. Especially in the and case, this could result in significant time differences if you move smaller candidate sets to the start of the query, and for the or case, would result in exponential time increases by adding extra terms. Fix Description: To resolve this, rather than processing each set in an interative fashion, we store each subfilters result in the new idl_set structure. When we have collected each candidate set we can then perform better intersections. In the case of the and query, we can and the *smallest* candidate to each term first, which significantly reduces the number of comparisons and copies. A benefit of this change, is that future improvements to not/or queries are easier to make because we now have "perfect" knowledge of all the IDLists involved, rather than just pairs at a time. https://pagure.io/389-ds-base/issue/49290 Author: wibrown Review by: ??? --- Makefile.am | 1 + dirsrvtests/tests/suites/filter/filter_logic.py | 148 +++++++++++++ ldap/servers/slapd/back-ldbm/back-ldbm.h | 39 ++-- ldap/servers/slapd/back-ldbm/filterindex.c | 148 +++++++------ ldap/servers/slapd/back-ldbm/idl_common.c | 120 ++++++++--- ldap/servers/slapd/back-ldbm/idl_set.c | 274 ++++++++++++++++++++++++ ldap/servers/slapd/back-ldbm/index.c | 6 + ldap/servers/slapd/back-ldbm/proto-back-ldbm.h | 14 ++ 8 files changed, 639 insertions(+), 111 deletions(-) create mode 100644 dirsrvtests/tests/suites/filter/filter_logic.py create mode 100644 ldap/servers/slapd/back-ldbm/idl_set.c diff --git a/Makefile.am b/Makefile.am index f075a47..6afa3fd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1262,6 +1262,7 @@ libback_ldbm_la_SOURCES = ldap/servers/slapd/back-ldbm/ancestorid.c \ ldap/servers/slapd/back-ldbm/idl.c \ ldap/servers/slapd/back-ldbm/idl_shim.c \ ldap/servers/slapd/back-ldbm/idl_new.c \ + ldap/servers/slapd/back-ldbm/idl_set.c \ ldap/servers/slapd/back-ldbm/idl_common.c \ ldap/servers/slapd/back-ldbm/import.c \ ldap/servers/slapd/back-ldbm/import-merge.c \ diff --git a/dirsrvtests/tests/suites/filter/filter_logic.py b/dirsrvtests/tests/suites/filter/filter_logic.py new file mode 100644 index 0000000..7b3a63f --- /dev/null +++ b/dirsrvtests/tests/suites/filter/filter_logic.py @@ -0,0 +1,148 @@ +# --- BEGIN COPYRIGHT BLOCK --- +# Copyright (C) 2017 Red Hat, Inc. +# All rights reserved. +# +# License: GPL (version 3 or any later version). +# See LICENSE for details. +# --- END COPYRIGHT BLOCK --- +# + +import pytest +import ldap + +from lib389.topologies import topology_st +from lib389._constants import DEFAULT_SUFFIX + +from lib389.idm.user import UserAccounts + +""" +This test case asserts that various logical filters apply correctly and as expected. +This is to assert that we have correct and working search operations, especially related +to indexed content from filterindex.c and idl_sets. +""" + +USER0_DN = 'uid=user0,ou=People,%s' % DEFAULT_SUFFIX +USER1_DN = 'uid=user1,ou=People,%s' % DEFAULT_SUFFIX +USER2_DN = 'uid=user2,ou=People,%s' % DEFAULT_SUFFIX +USER3_DN = 'uid=user3,ou=People,%s' % DEFAULT_SUFFIX +USER4_DN = 'uid=user4,ou=People,%s' % DEFAULT_SUFFIX +USER5_DN = 'uid=user5,ou=People,%s' % DEFAULT_SUFFIX +USER6_DN = 'uid=user6,ou=People,%s' % DEFAULT_SUFFIX +USER7_DN = 'uid=user7,ou=People,%s' % DEFAULT_SUFFIX +USER8_DN = 'uid=user8,ou=People,%s' % DEFAULT_SUFFIX +USER9_DN = 'uid=user9,ou=People,%s' % DEFAULT_SUFFIX + +@pytest.fixture(scope="module") +def topology_st_f(topology_st): + # Add our users to the topology_st + users = UserAccounts(topology_st.standalone, DEFAULT_SUFFIX) + for i in range(0, 10): + users.create(properties={ + 'uid': 'user%s' % i, + 'cn': 'user%s' % i, + 'sn': '%s' % i, + 'uidNumber': '%s' % i, + 'gidNumber': '%s' % i, + 'homeDirectory': '/home/user%s' % i + }) + # return it + # print("ATTACH NOW") + # import time + # time.sleep(30) + return topology_st.standalone + +def _check_filter(topology_st_f, filt, expect_len, expect_dns): + # print("checking %s" % filt) + results = topology_st_f.search_s("ou=People,%s" % DEFAULT_SUFFIX, ldap.SCOPE_ONELEVEL, filt, ['uid',]) + assert len(results) == expect_len + result_dns = [result.dn for result in results] + assert set(expect_dns) == set(result_dns) + +def test_filter_logic_eq(topology_st_f): + _check_filter(topology_st_f, '(uid=user0)', 1, [USER0_DN]) + +def test_filter_logic_sub(topology_st_f): + _check_filter(topology_st_f, '(uid=user*)', 10, [ + USER0_DN, USER1_DN, USER2_DN, USER3_DN, USER4_DN, + USER5_DN, USER6_DN, USER7_DN, USER8_DN, USER9_DN + ]) + +def test_filter_logic_not_eq(topology_st_f): + _check_filter(topology_st_f, '(!(uid=user0))', 9, [ + USER1_DN, USER2_DN, USER3_DN, USER4_DN, USER5_DN, + USER6_DN, USER7_DN, USER8_DN, USER9_DN + ]) + +# More not cases? + +def test_filter_logic_range(topology_st_f): + _check_filter(topology_st_f, '(uid>=user5)', 5, [ + USER5_DN, USER6_DN, USER7_DN, USER8_DN, USER9_DN + ]) + +def test_filter_logic_and_eq(topology_st_f): + _check_filter(topology_st_f, '(&(uid=user0)(cn=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(&(uid=user0)(cn=user1))', 0, []) + +def test_filter_logic_and_range(topology_st_f): + _check_filter(topology_st_f, '(&(uid>=user5)(cn<=user7))', 3, [ + USER5_DN, USER6_DN, USER7_DN + ]) + +def test_filter_logic_or_eq(topology_st_f): + _check_filter(topology_st_f, '(|(uid=user0)(cn=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(|(uid=user0)(uid=user1))', 2, [USER0_DN, USER1_DN]) + +def test_filter_logic_and_not_eq(topology_st_f): + _check_filter(topology_st_f, '(&(uid=user0)(!(cn=user0)))', 0, []) + _check_filter(topology_st_f, '(&(uid=*)(!(uid=user0)))', 9, [ + USER1_DN, USER2_DN, USER3_DN, USER4_DN, USER5_DN, + USER6_DN, USER7_DN, USER8_DN, USER9_DN + ]) + +def test_filter_logic_or_not_eq(topology_st_f): + _check_filter(topology_st_f, '(|(!(uid=user0))(!(uid=user1)))', 10, [ + USER0_DN, USER1_DN, USER2_DN, USER3_DN, USER4_DN, + USER5_DN, USER6_DN, USER7_DN, USER8_DN, USER9_DN + ]) + +def test_filter_logic_and_range(topology_st_f): + _check_filter(topology_st_f, '(&(uid>=user5)(uid=user6))', 1, [USER6_DN]) + _check_filter(topology_st_f, '(&(uid>=user5)(uid=user0))', 0, []) + +def test_filter_logic_or_range(topology_st_f): + _check_filter(topology_st_f, '(|(uid>=user5)(uid=user6))', 5, [ + USER5_DN, USER6_DN, USER7_DN, USER8_DN, USER9_DN + ]) + _check_filter(topology_st_f, '(|(uid>=user5)(uid=user0))', 6, [ + USER0_DN, + USER5_DN, USER6_DN, USER7_DN, USER8_DN, USER9_DN + ]) + +def test_filter_logic_and_and_eq(topology_st_f): + _check_filter(topology_st_f, '(&(&(uid=user0)(sn=0))(cn=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(&(&(uid=user1)(sn=0))(cn=user0))', 0, []) + _check_filter(topology_st_f, '(&(&(uid=user0)(sn=1))(cn=user0))', 0, []) + _check_filter(topology_st_f, '(&(&(uid=user0)(sn=0))(cn=user1))', 0, []) + +def test_filter_logic_or_or_eq(topology_st_f): + _check_filter(topology_st_f, '(|(|(uid=user0)(sn=0))(cn=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(|(|(uid=user1)(sn=0))(cn=user0))', 2, [USER0_DN, USER1_DN]) + _check_filter(topology_st_f, '(|(|(uid=user0)(sn=1))(cn=user0))', 2, [USER0_DN, USER1_DN]) + _check_filter(topology_st_f, '(|(|(uid=user0)(sn=0))(cn=user1))', 2, [USER0_DN, USER1_DN]) + _check_filter(topology_st_f, '(|(|(uid=user0)(sn=1))(cn=user2))', 3, [USER0_DN, USER1_DN, USER2_DN]) + +def test_filter_logic_and_or_eq(topology_st_f): + _check_filter(topology_st_f, '(&(|(uid=user0)(sn=0))(cn=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(&(|(uid=user1)(sn=0))(cn=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(&(|(uid=user0)(sn=1))(cn=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(&(|(uid=user0)(sn=0))(cn=user1))', 0, []) + _check_filter(topology_st_f, '(&(|(uid=user0)(sn=1))(cn=*))', 2, [USER0_DN, USER1_DN]) + +def test_filter_logic_or_and_eq(topology_st_f): + _check_filter(topology_st_f, '(|(&(uid=user0)(sn=0))(uid=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(|(&(uid=user1)(sn=2))(uid=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(|(&(uid=user0)(sn=1))(uid=user0))', 1, [USER0_DN]) + _check_filter(topology_st_f, '(|(&(uid=user1)(sn=1))(uid=user0))', 2, [USER0_DN, USER1_DN]) + + diff --git a/ldap/servers/slapd/back-ldbm/back-ldbm.h b/ldap/servers/slapd/back-ldbm/back-ldbm.h index 88a91ed..4568ba7 100644 --- a/ldap/servers/slapd/back-ldbm/back-ldbm.h +++ b/ldap/servers/slapd/back-ldbm/back-ldbm.h @@ -237,7 +237,7 @@ typedef u_int32_t ID; /* * Use this to count and index into an array of ID. */ -typedef u_int32_t NIDS; +typedef uint32_t NIDS; /* * This structure represents an id block on disk and an id list @@ -245,22 +245,35 @@ typedef u_int32_t NIDS; * * The fields have the following meanings: * - * b_nmax maximum number of ids in this block. if this is == ALLIDSBLOCK, - * then this block represents all ids. - * b_nids current number of ids in use in this block. if this - * is == INDBLOCK, then this block is an indirect block - * containing a list of other blocks containing actual ids. - * the list is terminated by an id of NOID. - * b_ids a list of the actual ids themselves + * b_nmax maximum number of ids in this block. if this is == ALLIDSBLOCK, + * then this block represents all ids. + * b_nids current number of ids in use in this block. if this + * is == INDBLOCK, then this block is an indirect block + * containing a list of other blocks containing actual ids. + * the list is terminated by an id of NOID. + * b_ids a list of the actual ids themselves */ +#define ALLIDSBLOCK 0 /* == 0 => this is an allid block */ +#define INDBLOCK 0 /* == 0 => this is an indirect blk */ + typedef struct block { - NIDS b_nmax; /* max number of ids in this list */ -#define ALLIDSBLOCK 0 /* == 0 => this is an allid block */ - NIDS b_nids; /* current number of ids used */ -#define INDBLOCK 0 /* == 0 => this is an indirect blk */ - ID b_ids[1]; /* the ids - actually bigger */ + NIDS b_nmax; /* max number of ids in this list */ + NIDS b_nids; /* current number of ids used */ + struct block *next; /* Pointer to the next block in the list + * used by idl_set + */ + ID b_ids[1]; /* the ids - actually bigger */ } Block, IDList; +typedef struct _idlist_set { + int64_t count; + int64_t allids; + IDList *min; + IDList *max; + IDList *head; + IDList *complement_head; +} IDListSet; + #define ALLIDS( idl ) ((idl)->b_nmax == ALLIDSBLOCK) #define INDIRECT_BLOCK( idl ) ((idl)->b_nids == INDBLOCK) #define IDL_NIDS(idl) (idl ? (idl)->b_nids : (NIDS)0) diff --git a/ldap/servers/slapd/back-ldbm/filterindex.c b/ldap/servers/slapd/back-ldbm/filterindex.c index 6af8e09..0f4e76a 100644 --- a/ldap/servers/slapd/back-ldbm/filterindex.c +++ b/ldap/servers/slapd/back-ldbm/filterindex.c @@ -652,7 +652,8 @@ list_candidates( int allidslimit ) { - IDList *idl, *tmp, *tmp2; + IDList *idl; + IDList *tmp; Slapi_Filter *f, *nextf, *f_head; int range = 0; int isnot; @@ -663,6 +664,7 @@ list_candidates( char *tpairs[2] = {NULL, NULL}; struct berval *vpairs[2] = {NULL, NULL}; int is_and = 0; + IDListSet *idl_set = NULL; slapi_log_err(SLAPI_LOG_TRACE, "list_candidates", "=> 0x%x\n", ftype); @@ -766,6 +768,10 @@ list_candidates( goto out; } + if (ftype == LDAP_FILTER_OR || ftype == LDAP_FILTER_AND) { + idl_set = idl_set_create(); + } + idl = NULL; nextf = NULL; isnot = 0; @@ -778,15 +784,21 @@ list_candidates( (LDAP_FILTER_EQUALITY == slapi_filter_get_choice(slapi_filter_list_first(f)))); if (isnot) { - /* if this is the first filter we have an allid search anyway, so bail */ - if(f == f_head) - { + /* + * If this is the first filter, make sure we have something to + * subtract from. + */ + if (f == f_head) { idl = idl_allids( be ); - break; + idl_set_insert_idl(idl_set, idl); } + /* + * Not the first filter - good! Get the indexed type out. + * if this is unindexed, we'll get allids. During the intersection + * in notin, we'll mark this as don't skip filter, because else we might + * get the wrong results. + */ - /* Fetch the IDL for foo */ - /* Later we'll remember to call idl_notin() */ slapi_log_err(SLAPI_LOG_TRACE, "list_candidates" ,"NOT filter\n"); if (filter_is_subtype(slapi_filter_list_first(f))) { /* @@ -832,64 +844,65 @@ list_candidates( } } - tmp2 = idl; - if ( idl == NULL ) { - idl = tmp; - if ( (ftype == LDAP_FILTER_AND) && ((idl == NULL) || - (idl_length(idl) <= FILTER_TEST_THRESHOLD))) { - break; /* We can exit the loop now, since the candidate list is small already */ - } - } else if ( ftype == LDAP_FILTER_AND ) { - if (isnot) { - /* - * If tmp is NULL or ALLID, idl_notin just duplicates idl. - * We don't have to do it. - */ - if (!tmp && !idl_is_allids(tmp)) { - IDList *new_idl = NULL; - int notin_result = 0; - notin_result = idl_notin( be, idl, tmp, &new_idl ); - if (notin_result) { - idl_free(&idl); - idl = new_idl; - } - } - } else { - idl = idl_intersection(be, idl, tmp); - idl_free( &tmp2 ); - } - idl_free( &tmp ); - /* stop if the list has gotten too small */ - if ((idl == NULL) || - (idl_length(idl) <= FILTER_TEST_THRESHOLD)) - break; - } else { - Slapi_Operation *operation; - slapi_pblock_get( pb, SLAPI_OPERATION, &operation ); + /* + * At this point we have the idl set from the subfilter. In idl_set, + * we stash this for later .... + */ - idl = idl_union( be, idl, tmp ); - idl_free( &tmp ); - idl_free( &tmp2 ); - /* stop if we're already committed to an exhaustive - * search. :( + if (ftype == LDAP_FILTER_OR || + (ftype == LDAP_FILTER_AND && !isnot)) { + idl_set_insert_idl(idl_set, tmp); + } else if (ftype == LDAP_FILTER_AND && isnot) { + idl_set_insert_complement_idl(idl_set, tmp); + } + + if (ftype == LDAP_FILTER_OR && idl_set_union_shortcut(idl_set) != 0) { + /* + * If we encounter an allids idl, this means that union will return + * and allids - we should not process anymore, and fallback to full + * table scan at this point. */ - /* PAGED RESULTS: we strictly limit the idlist size by the allids (aka idlistscan) limit. + goto apply_set_op; + } + + if (ftype == LDAP_FILTER_AND && idl_set_intersection_shortcut(idl_set) != 0) { + /* + * If we encounter a zero length idl, we bail now because this can never + * result in a meaningful result besides zero. */ + goto apply_set_op; + } + + } + + /* + * Do the idl_set operation if required. + * these are far more efficient than the iterative union and + * intersections we previously used. + */ +apply_set_op: + + if (ftype == LDAP_FILTER_OR) { + /* If one of the idl_set is allids, this shortcuts :) */ + idl = idl_set_union(idl_set, be); + size_t nids = IDL_NIDS(idl); + if ( allidslimit > 0 && nids > allidslimit ) { + Slapi_Operation *operation; + slapi_pblock_get( pb, SLAPI_OPERATION, &operation ); + /* PAGED RESULTS: we strictly limit the idlist size by the allids (aka idlistscan) limit. */ if (op_is_pagedresults(operation)) { - int nids = IDL_NIDS(idl); - if ( allidslimit > 0 && nids > allidslimit ) { - idl_free( &idl ); - idl = idl_allids( be ); - } + idl_free( &idl ); + idl = idl_allids( be ); } - if (idl_is_allids(idl)) - break; } + } else if (ftype == LDAP_FILTER_AND) { + idl = idl_set_intersect(idl_set, be); } slapi_log_err(SLAPI_LOG_TRACE, "list_candidates", "<= %lu\n", (u_long)IDL_NIDS(idl)); out: + idl_set_destroy(idl_set); if (is_and) { /* * Sets IS_AND back to 0 only when this function set 1. @@ -989,14 +1002,11 @@ keys2idl( int allidslimit ) { - IDList *idl; - int i; + IDList *idl = NULL; - slapi_log_err(SLAPI_LOG_TRACE, "keys2idl", "=> type %s indextype %s\n", - type, indextype); - idl = NULL; - for ( i = 0; ivals[i] != NULL; i++ ) { - IDList *idl2; + slapi_log_err(SLAPI_LOG_TRACE, "keys2idl", "=> type %s indextype %s\n", type, indextype); + for ( size_t i = 0; ivals[i] != NULL; i++ ) { + IDList *idl2 = NULL; idl2 = index_read_ext_allids( pb, be, type, indextype, slapi_value_get_berval(ivals[i]), txn, err, unindexed, allidslimit ); @@ -1011,23 +1021,25 @@ keys2idl( } #endif if ( idl2 == NULL ) { - idl_free( &idl ); - idl = NULL; - break; + slapi_log_err(SLAPI_LOG_WARNING, "keys2idl", "recieved NULL idl from index_read_ext_allids, treating as empty set\n"); + slapi_log_err(SLAPI_LOG_WARNING, "keys2idl", "this is probably a bug that should be reported\n"); + idl2 = idl_alloc(0); } + /* First iteration of the ivals, stash idl2. */ if (idl == NULL) { idl = idl2; } else { - IDList *tmp; + /* + * second iteration of the ivals - do an intersection and free + * the intermediates. + */ + IDList *tmp = NULL; tmp = idl; idl = idl_intersection(be, idl, idl2); idl_free( &idl2 ); idl_free( &tmp ); - if ( idl == NULL ) { - break; - } } } diff --git a/ldap/servers/slapd/back-ldbm/idl_common.c b/ldap/servers/slapd/back-ldbm/idl_common.c index cd3458e..d2394d0 100644 --- a/ldap/servers/slapd/back-ldbm/idl_common.c +++ b/ldap/servers/slapd/back-ldbm/idl_common.c @@ -15,12 +15,15 @@ #include "back-ldbm.h" +/* Default to holding 512 ids for idl_fetch_ext */ +#define IDLIST_MIN_BLOCK_SIZE 512 + size_t idl_sizeof(IDList *idl) { if (NULL == idl) { return 0; } - return (2 + idl->b_nmax) * sizeof(ID); + return sizeof(IDList) + (idl->b_nmax * sizeof(ID)); } NIDS idl_length(IDList *idl) @@ -44,10 +47,19 @@ idl_alloc( NIDS nids ) { IDList *new; + if (nids == 0) { + /* + * Because we use allids in b_nmax as 0, when we request an + * empty set, this *looks* like an empty set. Instead, we make + * a minimum b_nmax to trick it. + */ + nids = 1; + } + /* nmax + nids + space for the ids */ - new = (IDList *) slapi_ch_calloc( (2 + nids), sizeof(ID) ); + new = (IDList *) slapi_ch_calloc( 1, sizeof(IDList) + (sizeof(ID) * nids) ); new->b_nmax = nids; - new->b_nids = 0; + /* new->b_nids = 0; */ return( new ); } @@ -116,7 +128,7 @@ idl_append_extend(IDList **orig_idl, ID id) IDList *idl = *orig_idl; if (idl == NULL) { - idl = idl_alloc(32); /* used to be 0 */ + idl = idl_alloc(IDLIST_MIN_BLOCK_SIZE); /* used to be 0 */ idl_append(idl, id); *orig_idl = idl; @@ -125,17 +137,11 @@ idl_append_extend(IDList **orig_idl, ID id) if ( idl->b_nids == idl->b_nmax ) { /* No more room, need to extend */ - /* Allocate new IDL with twice the space of this one */ - IDList *idl_new = NULL; - idl_new = idl_alloc(idl->b_nmax * 2); - if (NULL == idl_new) { + idl->b_nmax = idl->b_nmax * 2; + idl = slapi_ch_realloc(idl, sizeof(IDList) + (sizeof(ID) * idl->b_nmax)); + if (idl == NULL) { return ENOMEM; } - /* copy over the existing contents */ - idl_new->b_nids = idl->b_nids; - memcpy(idl_new->b_ids, idl->b_ids, sizeof(ID) * idl->b_nids); - idl_free(&idl); - idl = idl_new; } idl->b_ids[idl->b_nids] = id; @@ -155,8 +161,7 @@ idl_dup( IDList *idl ) } new = idl_alloc( idl->b_nmax ); - SAFEMEMCPY( (char *) new, (char *) idl, (idl->b_nmax + 2) - * sizeof(ID) ); + memcpy(new, idl, idl_sizeof(idl) ); return( new ); } @@ -186,6 +191,39 @@ idl_id_is_in_idlist(IDList *idl, ID id) } /* + * idl_compare - compare idl a and b for value equality and ordering. + */ +int64_t +idl_compare(IDList *a, IDList *b) { + /* Assert they are not none. */ + if (a == NULL || b == NULL) { + return 1; + } + /* Are they the same pointer? */ + if (a == b) { + return 0; + } + /* Do they have the same number of IDs? */ + if (a->b_nids != b->b_nids) { + return 1; + } + + /* Are they both allid blocks? */ + if ( ALLIDS( a ) && ALLIDS( b ) ) { + return 0; + } + + /* Same size, and not the same array. Lets check! */ + for (size_t i = 0; i < a->b_nids; i++) { + if (a->b_ids[i] != b->b_ids[i]) { + return 1; + } + } + /* Must be the same! */ + return 0; +} + +/* * idl_intersection - return a intersection b */ IDList * @@ -198,9 +236,14 @@ idl_intersection( NIDS ai, bi, ni; IDList *n; - if ( a == NULL || b == NULL ) { - return( NULL ); + if ( a == NULL || a->b_nids == 0 ) { + return idl_dup(a); + } + + if ( b == NULL || b->b_nids == 0) { + return idl_dup(b); } + if ( ALLIDS( a ) ) { slapi_be_set_flag(be, SLAPI_BE_FLAG_DONT_BYPASS_FILTERTEST); return( idl_dup( b ) ); @@ -225,10 +268,6 @@ idl_intersection( } } - if ( ni == 0 ) { - idl_free( &n ); - return( NULL ); - } n->b_nids = ni; return( n ); @@ -248,10 +287,10 @@ idl_union( NIDS ai, bi, ni; IDList *n; - if ( a == NULL ) { + if ( a == NULL || a->b_nids == 0 ) { return( idl_dup( b ) ); } - if ( b == NULL ) { + if ( b == NULL || b->b_nids == 0 ) { return( idl_dup( a ) ); } if ( ALLIDS( a ) || ALLIDS( b ) ) { @@ -311,12 +350,25 @@ idl_notin( IDList *n; *new_result = NULL; - if ( a == NULL ) { - return( 0 ); + /* Nothing in a, so nothing to subtract from. */ + if ( a == NULL || a->b_nids == 0 ) { + *new_result = idl_alloc(0); + return 1; } - if ( b == NULL || ALLIDS( b ) ) { - *new_result = idl_dup( a ); - return( 1 ); + /* b is empty, so nothing to remove from a. */ + if ( b == NULL || b->b_nids == 0 ) { + // *new_result = idl_dup( a ); + return 0; + } + + /* b is allIDS, so a - b, should be the empty set. + * but if the type in unindexed, we don't know. Instead we have to + * return a, and mark that we can't skip the filter test. + */ + if ( ALLIDS(b) ) { + // *new_result = idl_alloc(0); + slapi_be_set_flag(be, SLAPI_BE_FLAG_DONT_BYPASS_FILTERTEST); + return 0; } if ( ALLIDS( a ) ) { /* Not convinced that this code is really worth it */ @@ -418,7 +470,7 @@ idl_nextid( IDList *idl, ID id ) { NIDS i; - if (NULL == idl) { + if (NULL == idl || idl->b_nids == 0) { return NOID; } if ( ALLIDS( idl ) ) { @@ -463,10 +515,18 @@ idl_iterator idl_iterator_decrement(idl_iterator *i) ID idl_iterator_dereference(idl_iterator i, const IDList *idl) { + /* + * NOID is used to terminate iteration. When we get an allIDS + * idl->b_nids == number of entries in id2entry. That's how we + * know to stop returning ids. + */ if ( (NULL == idl) || (i >= idl->b_nids)) { return NOID; } if (ALLIDS(idl)) { + /* + * entries in id2entry start at 1, not 0, so we have off by one here. + */ return (ID) i + 1; } else { return idl->b_ids[i]; @@ -484,5 +544,5 @@ ID idl_iterator_dereference_decrement(idl_iterator *i, const IDList *idl) { idl_iterator_decrement(i); return idl_iterator_dereference(*i,idl); - } + diff --git a/ldap/servers/slapd/back-ldbm/idl_set.c b/ldap/servers/slapd/back-ldbm/idl_set.c new file mode 100644 index 0000000..c25a765 --- /dev/null +++ b/ldap/servers/slapd/back-ldbm/idl_set.c @@ -0,0 +1,274 @@ +/** BEGIN COPYRIGHT BLOCK + * Copyright (C) 2017 Red Hat, Inc. + * All rights reserved. + * + * License: GPL (version 3 or any later version). + * See LICENSE for details. + * END COPYRIGHT BLOCK **/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include "back-ldbm.h" + +/* + * In filterindex, rather than calling idl_union multiple times over + * this idl_set provides better apis to do efficent set manipulations + * for idl results. + * + * previously or results were calculated as: + * |(a)(b)(c)(d) + * + * t1 = union(a, b) + * t2 = union(t1, c) + * t3 = union(t2, d) + * As you can see, this results in n-1 union operations. Depending on + * the size of the IDL, this could signifigantly affect performance. + * It actually let to a situation where re-arranging a query would be + * faster. + * + * Instead, if we maintain a set of IDL results, then perform a more + * effective union we can improve our results. For example. + * + * |(a)(b)(c)(d) + * + * t1 = union(a,b) + * t2 = union(c,d) + * t3 = union(t1, t2) + * + * This is still needing n-1 operations but because we perform them hierarchially, + * rather than having a large set A that we need to iterate over many times, each + * smaller set is faster to iterate through. + * + * A better improvement is to find the largest IDL, then to convert it to + * a b+tree or something similar (skiplist?), then to iterate over the remaining + * IDLs and their content performing + * insert sort. Afterwards, we can collapse the tree to an IDL, and return it. + * + * this is better because the current union is O(n) complex to insert, so with the + * tree method we have a faster merge of two IDL's together. This would also prevent + * the need to do many recursive inserts. + * + */ + +IDListSet * +idl_set_create() { + /* Alloc it, with a default of 8 idls */ + IDListSet *idl_set = (IDListSet *)slapi_ch_calloc(1, sizeof(IDListSet)); + /* all other fields are 0 thanks to calloc */ + return idl_set; +} + +static void +idl_set_free_idls(IDListSet *idl_set) { + /* Free idlists */ + IDList *idl = idl_set->head; + IDList *next_idl = NULL; + while (idl != NULL) { + next_idl = idl->next; + idl_free(&idl); + idl = next_idl; + } + + /* Free complements if any */ + idl = idl_set->complement_head; + while (idl != NULL) { + next_idl = idl->next; + idl_free(&idl); + idl = next_idl; + } +} + +void +idl_set_destroy(IDListSet *idl_set) { + slapi_ch_free((void **)&(idl_set)); +} + +void +idl_set_insert_idl(IDListSet *idl_set, IDList *idl) { + PR_ASSERT(idl); + + idl->next = idl_set->head; + idl_set->head = idl; + idl_set->count += 1; + + /* Now that it's inserted, lets check some things. */ + int32_t is_allids = idl_is_allids(idl); + /* Is this an allids? */ + if (is_allids) { + idl_set->allids = 1; + } + /* Is this smaller than our last smallest set? */ + if (idl_set->min == NULL || idl->b_nids < idl_set->min->b_nids) { + idl_set->min = idl; + } + /* Is this larger than our last largest set? */ + if (!is_allids && (idl_set->max == NULL || idl->b_nids > idl_set->max->b_nids)) { + idl_set->max = idl; + } + return; +} + +/* + * The difference between this and insert is that complement implies + * a future intersection, and that we plan to use a "not" query. + * + * As a result, the order of operations is to: + * * intersect all sets + * * apply complements. + */ +void +idl_set_insert_complement_idl(IDListSet *idl_set, IDList *idl) { + PR_ASSERT(idl); + /* + * If we complement to ALLIDS, the result is empty set. + * so we can put empty set into the main list. This will cause + * -- no change to union (but this should never be called during OR / union) + * -- shortcut of the AND to trigger immediately + */ + idl->next = idl_set->complement_head; + idl_set->complement_head = idl; + idl_set->count += 1; +} + +int64_t +idl_set_union_shortcut(IDListSet *idl_set) { + /* Are we able to shortcut the union process? */ + /* This generally indicates the presence of allids */ + return idl_set->allids; +} + +int64_t +idl_set_intersection_shortcut(IDListSet *idl_set) { + /* If we have a 0 length idl, we can never create an intersection. */ + if (idl_set->min) { + return idl_set->min->b_nids == 0; + } + return 0; +} + +IDList * +idl_set_union(IDListSet *idl_set, backend *be) { + if (idl_set->count == 0) { + return NULL; + } + if (idl_set->count == 1) { + return idl_set->head; + } + if (idl_set->allids != 0) { + idl_set_free_idls(idl_set); + return idl_allids(be); + } + /* For now do a dumb union .... */ + IDList *result_list = idl_set->head; + IDList *new_result_list = NULL; + IDList *idl = idl_set->head->next; + IDList *next_idl = NULL; + while (idl != NULL) { + next_idl = idl->next; + new_result_list = idl_union(be, result_list, idl); + + idl_free(&result_list); + idl_free(&idl); + + result_list = new_result_list; + idl = next_idl; + } + + return result_list; + + /* + * To make this faster: + * track the max size idlist on the way in that is not + * allids. If we hav an all ids immedatel free all idls and return + * + * If idls count > 2 ... + * + * Then turn the max into a b+tree + * now do an insert sort of the rest of the sets in. + * + * Finally, dump the b+tree back to an idlist and return it. + */ + +} + +IDList * +idl_set_intersect(IDListSet *idl_set, backend *be) { + if (idl_set->count == 0) { + return NULL; + } + if (idl_set->count == 1) { + return idl_set->head; + } + + /* Okay, actually do a intersection then! */ + + /* + * Start with the *minimum* intersection, because it + * will reduce our candidate set the quickest. + */ + + IDList *result_list = idl_set->min; + IDList *min_next = idl_set->min->next; + IDList *new_result_list = NULL; + IDList *idl = idl_set->head; + IDList *next_idl = NULL; + + while (idl != NULL) { + if (idl == idl_set->min) { + /* + * Because we started with min, it was freed as result_list. + * but we found it again in our set. skip over it. + */ + idl = min_next; + } else { + next_idl = idl->next; + /* + * Only both to intersect results if we have a non 0 + * It's important to construct it like this, because it + * means if we have an empty idl, we never intersect it, + * and we free the idl's remaining in the idl_set. + * + * another aspect is that if idl_set->min is 0, because it + * starts as the result_list, we never do any intersections + * and we just free the remaining sets. + * hint: idl_intersection returns NULL for empty set. + */ + + if (result_list && result_list->b_nids != 0) { + new_result_list = idl_intersection(be, result_list, idl); + idl_free(&result_list); + result_list = new_result_list; + } + idl_free(&idl); + idl = next_idl; + } + } + + /* Now, that we have the "smallest" intersection possible, we need to subtract + * elements as required. + */ + + idl = idl_set->complement_head; + while (idl != NULL) { + next_idl = idl->next; + if (idl_notin(be, result_list, idl, &new_result_list)) { + /* + * idl_notin returns 1 on new alloc, so free result_list and idl + */ + idl_free(&idl); + idl_free(&result_list); + result_list = new_result_list; + } else { + /* + * idl_notin returns 0 when it "does nothing", so just free idl. + */ + idl_free(&idl); + } + idl = next_idl; + } + + return result_list; +} + diff --git a/ldap/servers/slapd/back-ldbm/index.c b/ldap/servers/slapd/back-ldbm/index.c index a3dac66..e7cba38 100644 --- a/ldap/servers/slapd/back-ldbm/index.c +++ b/ldap/servers/slapd/back-ldbm/index.c @@ -1033,6 +1033,12 @@ index_read_ext_allids( interval = PR_MillisecondsToInterval(slapi_rand() % 100); DS_Sleep(interval); continue; + } else if (*err != 0 || idl == NULL) { + /* The database might not exist. We have to assume it means empty set */ + slapi_log_err(SLAPI_LOG_TRACE, "index_read_ext_allids", "Failed to access idl index for %s\n", basetype); + slapi_log_err(SLAPI_LOG_TRACE, "index_read_ext_allids", "Assuming %s has no index values\n", basetype); + idl = idl_alloc(0); + break; } else { break; } diff --git a/ldap/servers/slapd/back-ldbm/proto-back-ldbm.h b/ldap/servers/slapd/back-ldbm/proto-back-ldbm.h index ca2ff7b..46b67cc 100644 --- a/ldap/servers/slapd/back-ldbm/proto-back-ldbm.h +++ b/ldap/servers/slapd/back-ldbm/proto-back-ldbm.h @@ -281,6 +281,20 @@ IDList *idl_new_range_fetch(backend *be, DB* db, DBT *lowerkey, DBT *upperkey, int allidslimit, int sizelimit, time_t stoptime, int lookthrough_limit, int operator); + +int64_t idl_compare(IDList *a, IDList *b); + +/* + * idl_set.c + */ +IDListSet * idl_set_create(); +void idl_set_destroy(IDListSet *idl_set); +void idl_set_insert_idl(IDListSet *idl_set, IDList *idl); +int64_t idl_set_union_shortcut(IDListSet *idl_set); +int64_t idl_set_intersection_shortcut(IDListSet *idl_set); +IDList * idl_set_union(IDListSet *idl_set, backend *be); +IDList * idl_set_intersect(IDListSet *idl_set, backend *be); + /* * index.c */ -- 1.8.3.1