1 files changed, 454 insertions, 0 deletions
diff --git a/ldap/servers/plugins/collation/collate.c b/ldap/servers/plugins/collation/collate.c
new file mode 100644
index 00000000..603caf53
--- /dev/null
+++ b/ldap/servers/plugins/collation/collate.c
@@ -0,0 +1,454 @@
+/** BEGIN COPYRIGHT BLOCK
+ * Copyright 2001 Sun Microsystems, Inc.
+ * Portions copyright 1999, 2001-2003 Netscape Communications Corporation.
+ * All rights reserved.
+ * END COPYRIGHT BLOCK **/
+/* collate.c - implementation of indexing, using a Collation */
+
+#include "collate.h"
+#include <string.h> /* memcpy */
+
+#include <unicode/ucol.h> /* Collation */
+#include <unicode/ucnv.h> /* Conversion */
+#include <unicode/ustring.h> /* UTF8 conversion */
+
+#include <ldap.h> /* LDAP_UTF8LEN */
+#include <slap.h> /* for strcasecmp on non-UNIX platforms and correct debug macro */
+
+void
+collation_init( char *configpath )
+    /* Called once per process, to initialize globals.
+     * configpath is accepted but never read: the body below is empty,
+     * so this function currently does nothing. */
+{
+	/* ICU needs no initialization? (open question left by the original
+	 * author; nothing is set up here) */
+}
+
+typedef struct coll_profile_t { /* Collator characteristics: the locale
+                                 * components and collator attributes taken
+                                 * from one "collation" configuration line
+                                 * (filled in by collation_config) */
+    const char* language; /* left NULL when the configured value is an empty string */
+    const char* country; /* left NULL when the configured value is an empty string */
+    const char* variant; /* left NULL when the configured value is an empty string */
+    UColAttributeValue strength; /* one of UCOL_PRIMARY = 0, UCOL_SECONDARY = 1, UCOL_TERTIARY = 2, UCOL_QUATERNARY = 3, UCOL_IDENTICAL = 4;
+                                  * applied to the collator as UCOL_STRENGTH in collation_indexer_create */
+    UColAttributeValue decomposition; /* one of UCOL_OFF = 0, UCOL_DEFAULT = 1, UCOL_ON = 2;
+                                       * applied to the collator as UCOL_DECOMPOSITION_MODE in collation_indexer_create */
+} coll_profile_t;
+
+typedef struct coll_id_t { /* associates an OID with a coll_profile_t */
+    char* oid; /* copy of the OID argument, made with slapi_ch_strdup in collation_config */
+    coll_profile_t* profile; /* may be shared: every OID given on one configuration
+                              * line points at the same profile object */
+} coll_id_t;
+
+/* A list of all OIDs that identify collator profiles.  collation_config
+ * appends to it and keeps it NULL-terminated; collation_indexer_create
+ * searches it by OID. */
+static const coll_id_t** collation_id = NULL; /* stays NULL until the first entry is added */
+static size_t            collation_ids = 0; /* number of entries, not counting the NULL terminator */
+
+/* Return a newly allocated (slapi_ch_malloc) string holding a + b + c.
+   The buffer is sized from the inputs, so it can neither overflow nor
+   truncate, whatever the length of the configuration arguments. */
+static char*
+collation_concat3 (const char* a, const char* b, const char* c)
+{
+    const size_t alen = strlen (a);
+    const size_t blen = strlen (b);
+    const size_t clen = strlen (c);
+    char* out = (char*) slapi_ch_malloc (alen + blen + clen + 1);
+
+    memcpy (out, a, alen);
+    memcpy (out + alen, b, blen);
+    memcpy (out + alen + blen, c, clen + 1); /* also copies c's terminating NUL */
+    return out;
+}
+
+/* Register the ordering and the substring matching rule for one
+   "collation" line (cargc >= 7) whose profile has already been filled in. */
+static void
+collation_register_matching_rules (size_t cargc, char** cargv,
+                                   const coll_profile_t* profile)
+{
+    const char* orderBase;
+    const char* substrBase;
+    const char* sep;
+    const char* suffix;
+    char* tmpStr = NULL;
+    Slapi_MatchingRuleEntry* mrentry = slapi_matchingrule_new();
+
+    if (UCOL_PRIMARY == profile->strength) {
+        orderBase  = "caseIgnoreOrderingMatch";
+        substrBase = "caseIgnoreSubstringMatch";
+    } else {
+        orderBase  = "caseExactOrderingMatch";
+        substrBase = "caseExactSubstringMatch";
+    }
+
+    /* The rule names are suffixed with the 8th argument when there is one
+       (always preceded by '-'), otherwise with the language (preceded by
+       '-' only when the language is not empty). */
+    if (cargc > 7) {
+        sep = "-";
+        suffix = cargv[7];
+    } else {
+        suffix = cargv[1];
+        sep = (0 != suffix[0]) ? "-" : "";
+    }
+
+    /* First registration: the ordering rule, under the OID itself.
+       Every value handed to slapi_matchingrule_set is a fresh heap copy,
+       exactly as the strdup'ed values were before. */
+    slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_NAME,
+                           (void *)collation_concat3(orderBase, sep, suffix));
+    slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_OID,
+                           (void *)slapi_ch_strdup(cargv[6]));
+    if (0 != cargv[2][0]) {
+        /* description is "language-country" */
+        slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_DESC,
+                               (void *)collation_concat3(cargv[1], "-", cargv[2]));
+    } else {
+        slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_DESC,
+                               (void *)slapi_ch_strdup(cargv[1]));
+    }
+    slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_SYNTAX,
+                           (void *)slapi_ch_strdup(DIRSTRING_SYNTAX_OID));
+    slapi_matchingrule_register(mrentry);
+
+    /* Release the name and OID of the first rule before replacing them. */
+    slapi_matchingrule_get(mrentry,SLAPI_MATCHINGRULE_NAME,
+                           (void *)&tmpStr);
+    slapi_ch_free((void **)&tmpStr);
+    slapi_matchingrule_get(mrentry,SLAPI_MATCHINGRULE_OID,
+                           (void *)&tmpStr);
+    slapi_ch_free((void **)&tmpStr);
+
+    /* Second registration: the substring rule, under "<oid>.6". */
+    slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_NAME,
+                           (void *)collation_concat3(substrBase, sep, suffix));
+    slapi_matchingrule_set(mrentry,SLAPI_MATCHINGRULE_OID,
+                           (void *)collation_concat3(cargv[6], ".6", ""));
+    slapi_matchingrule_register(mrentry);
+    slapi_matchingrule_free(&mrentry,1);
+}
+
+int
+collation_config (size_t cargc, char** cargv,
+		  const char* fname, size_t lineno)
+    /* Process one line from a configuration file.
+       Return 0 if it's OK, -1 if it's not recognized.
+       Any other return value is a process exit code.
+
+       Recognized lines:
+         NLS ...        ignored (kept only for old configuration files)
+         collation language country variant strength decomposition oid ...
+       A "collation" line with fewer than 7 arguments is logged and
+       ignored (still returns 0).  Otherwise a profile is created, two
+       matching rules are registered, and every argument from the 7th on
+       is added to collation_id as an OID for that profile.
+
+       All strings derived from the arguments are built in exactly sized
+       heap buffers; the arguments come from a configuration file and
+       have no length limit, so fixed-size stack buffers must not be used.
+    */
+{
+    size_t arg;
+    coll_profile_t* profile;
+
+    if (cargc <= 0) { /* Bizarre.  Oh, well... */
+	return 0;
+    }
+    if (!strcasecmp (cargv[0], "NLS")) {
+	/* ignore - not needed anymore with ICU - was used to get path for NLS_Initialize */
+	return 0;
+    }
+    if (strcasecmp (cargv[0], "collation")) {
+	return -1; /* unrecognized */
+    }
+    if ( cargc < 7 ) {
+	LDAPDebug (LDAP_DEBUG_ANY,
+		   "%s: line %lu ignored: only %lu arguments (expected "
+		   "collation language country variant strength decomposition oid ...)\n",
+		   fname, (unsigned long)lineno, (unsigned long)cargc );
+	return 0;
+    }
+
+    /* Empty language/country/variant arguments leave the field NULL. */
+    profile = (coll_profile_t*) slapi_ch_calloc (1, sizeof (coll_profile_t));
+    if (*cargv[1]) profile->language = slapi_ch_strdup (cargv[1]);
+    if (*cargv[2]) profile->country  = slapi_ch_strdup (cargv[2]);
+    if (*cargv[3]) profile->variant  = slapi_ch_strdup (cargv[3]);
+
+    /* NOTE(review): in both switches below "case 2" falls through into
+       "case 3", so a configured 2 ends up as UCOL_TERTIARY / UCOL_ON.
+       That looks unintended, but correcting it would change the collator
+       attributes produced for existing configuration lines, so the
+       behavior is preserved here - confirm the intent before changing. */
+    switch (atoi(cargv[4])) {
+      case 1: profile->strength = UCOL_PRIMARY; break;
+      case 2: profile->strength = UCOL_SECONDARY;
+	/* FALLTHROUGH (preserved, see note above) */
+      case 3: profile->strength = UCOL_TERTIARY; break;
+      case 4: profile->strength = UCOL_IDENTICAL; break;
+      default: profile->strength = UCOL_SECONDARY;
+	LDAPDebug (LDAP_DEBUG_ANY,
+		   "%s: line %lu: strength \"%s\" not supported (will use 2)\n",
+		   fname, (unsigned long)lineno, cargv[4]);
+	break;
+    }
+    switch (atoi(cargv[5])) {
+      case 1: profile->decomposition = UCOL_OFF; break;
+      case 2: profile->decomposition = UCOL_DEFAULT;
+	/* FALLTHROUGH (preserved, see note above) */
+      case 3: profile->decomposition = UCOL_ON; break;
+      default: profile->decomposition = UCOL_DEFAULT;
+	LDAPDebug (LDAP_DEBUG_ANY,
+		   "%s: line %lu: decomposition \"%s\" not supported (will use 2)\n",
+		   fname, (unsigned long)lineno, cargv[5]);
+	break;
+    }
+
+    collation_register_matching_rules (cargc, cargv, profile);
+
+    /* Every argument from the 7th on names this profile; the list is
+       grown by one slot each time and kept NULL-terminated. */
+    for (arg = 6; arg < cargc; ++arg) {
+	coll_id_t* id = (coll_id_t*) slapi_ch_malloc (sizeof (coll_id_t));
+	id->oid     = slapi_ch_strdup (cargv[arg]);
+	id->profile = profile;
+	if (collation_ids <= 0) {
+	    collation_id = (const coll_id_t**) slapi_ch_malloc (2 * sizeof (coll_id_t*));
+	} else {
+	    collation_id = (const coll_id_t**) slapi_ch_realloc
+	      ((void*)collation_id, (collation_ids + 2) * sizeof (coll_id_t*));
+	}
+	collation_id [collation_ids++] = id;
+	collation_id [collation_ids] = NULL;
+    }
+    return 0; /* success */
+}
+
+typedef struct collation_indexer_t
+    /* A kind of indexer, implemented using an ICU Collator.
+       One instance is stored in indexer_t.ix_etc by collation_indexer_create
+       and released by collation_indexer_destroy. */
+{
+    UCollator*         collator; /* opened with ucol_open; used by collation_index to build sort keys */
+    UConverter*	       converter; /* closed at destroy when non-NULL; not assigned anywhere in the code shown here */
+    struct berval**    ix_keys; /* keys returned by the latest collation_index call; freed on the next call or at destroy */
+    int                is_default_collator; /* non-zero when the profile has no language, country or variant;
+                                             * such a collator is deliberately not closed at destroy */
+} collation_indexer_t;
+
+/*
+  Convert the UTF-8 value bv to UTF-16 in the buffer *U, ignoring leading
+  whitespace and trailing whitespace/NUL bytes.
+
+  On entry *U / *Ulen describe the destination buffer and its capacity in
+  UChars (*U == NULL with *Ulen == 0 is allowed), and *isAlloced says
+  whether *U came from the slapi_ch allocators (non-zero) or is a
+  caller-owned fixed buffer (zero).  When the buffer is too small a new
+  one is allocated (or the old one reallocated) and *isAlloced is set to 1;
+  the caller must then free *U.
+
+  On return *Ulen is the number of UChars in the result (not necessarily
+  the number of bytes!).  An allocated buffer always has room for one more
+  UChar than *Ulen, so the result is NUL-terminated.
+
+  Returns U_ZERO_ERROR on success, U_INVALID_FORMAT_ERROR when the value is
+  empty or contains nothing but whitespace/NUL bytes, or the ICU error
+  reported by u_strFromUTF8.
+*/
+static UErrorCode
+SetUnicodeStringFromUTF_8 (UChar** U, int32_t* Ulen, int *isAlloced, const struct berval* bv)
+{
+    UErrorCode err = U_ZERO_ERROR;
+    const char* limit; /* one past the last byte of the value */
+    const char* begin; /* first non-space character of the value */
+    const char* end;   /* start of the last non-NUL, non-space character */
+    int32_t len;       /* number of bytes between begin and the end of *end */
+    int32_t nUchars = 0;
+
+    if (!bv->bv_len) { /* no value? */
+	return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
+    }
+    limit = bv->bv_val + bv->bv_len;
+
+    /* skip leading whitespace, one UTF-8 character at a time */
+    begin = bv->bv_val;
+    while ((begin < limit) && ldap_utf8isspace((char *)begin)) { /* cast away const */
+	begin = LDAP_UTF8NEXT((char *)begin); /* cast away const */
+    }
+    if (begin >= limit) { /* value is all spaces? */
+	return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
+    }
+
+    /* Walk back from the last byte over trailing NULs and whitespace.
+       The scan is bounded by begin, so it can never step in front of the
+       part of the value that is going to be converted. */
+    end = limit - 1;
+    while ((end > begin) && (!*end || ldap_utf8isspace((char *)end))) {
+	const char *prev = LDAP_UTF8PREV((char *)end);
+	end = (prev < begin) ? begin : prev;
+    }
+    if (!*end) { /* bogus: nothing but NUL bytes and whitespace */
+	return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
+    }
+
+    len = (int32_t)(LDAP_UTF8NEXT((char *)end) - begin);
+    if (len > (int32_t)(limit - begin)) {
+	len = (int32_t)(limit - begin); /* never convert bytes beyond bv_len */
+    }
+
+    u_strFromUTF8(*U, *Ulen, &nUchars, begin, len, &err);
+    if (err == U_BUFFER_OVERFLOW_ERROR || err == U_STRING_NOT_TERMINATED_WARNING) {
+	/* Need more space.  Reserve one extra UChar for the terminator:
+	   with a buffer of exactly nUchars elements ICU converts the whole
+	   string but reports U_STRING_NOT_TERMINATED_WARNING, which callers
+	   comparing against U_ZERO_ERROR would take for a failure. */
+	const size_t nbytes = sizeof(UChar) * ((size_t)nUchars + 1);
+	if (*isAlloced) { /* realloc space */
+	    *U = (UChar *)slapi_ch_realloc((char *)*U, nbytes);
+	} else { /* must use malloc */
+	    *U = (UChar *)slapi_ch_malloc(nbytes);
+	    *isAlloced = 1; /* no longer using fixed buffer */
+	}
+	err = U_ZERO_ERROR; /* reset */
+	u_strFromUTF8(*U, nUchars + 1, NULL, begin, len, &err);
+    }
+    *Ulen = nUchars;
+
+    return err;
+}
+
+static struct berval**
+collation_index (indexer_t* ix, struct berval** bvec, struct berval** prefixes)
+    /* Compute one index key per usable value in the NULL-terminated array
+       bvec, using the collator stored in ix->ix_etc.
+       Each key is the optional prefix (prefixes[i] belongs to bvec[i];
+       prefixes may be NULL) followed by the ICU sort key of the value.
+       Values that cannot be converted, or that yield an empty sort key,
+       are skipped.
+       Returns a NULL-terminated array of keys, or NULL if there are none.
+       The array is remembered in the indexer and freed on the next call
+       or when the indexer is destroyed, so the caller must not free it.
+    */
+{ 
+    collation_indexer_t* etc = (collation_indexer_t*) ix->ix_etc;
+    struct berval** keys = NULL;
+    if (bvec) {
+	char keyBuffer[128]; /* try to use static space buffer to avoid malloc */
+	int32_t keyLen = sizeof(keyBuffer); /* capacity of key, in bytes */
+	char* key = keyBuffer; /* but key can grow if necessary */
+	size_t keyn = 0;
+	struct berval** bv;
+	UChar charBuffer[128]; /* try to use static space buffer */
+	int32_t nChars = sizeof(charBuffer)/sizeof(UChar); /* but grow if necessary */
+	UChar *chars = charBuffer; /* try to reuse this */
+	int isAlloced = 0; /* using fixed buffer */
+
+	for (bv = bvec; *bv; ++bv) {
+	    struct berval* prefix;
+	    size_t prefixLen;
+	    struct berval* bk;
+	    int32_t realLen; /* real length of key, not keyLen which is buffer size */
+	    UErrorCode err;
+
+	    /* if chars is allocated, nChars is the length of the previous
+	       string, which is never more than the allocated capacity;
+	       otherwise it must be reset to the capacity of charBuffer */
+	    if (!isAlloced) {
+		nChars = sizeof(charBuffer)/sizeof(UChar); /* reset */
+	    }
+	    err = SetUnicodeStringFromUTF_8 (&chars, &nChars, &isAlloced, *bv);
+	    /* Only real errors make the value unusable.  ICU warnings (for
+	       instance "string not terminated" when the text exactly fills
+	       the buffer) still leave a complete string of nChars UChars,
+	       and the length is passed explicitly below. */
+	    if (U_FAILURE(err)) {
+		continue;
+	    }
+
+	    prefix = prefixes ? prefixes[bv-bvec] : NULL;
+	    prefixLen = prefix ? prefix->bv_len : 0;
+
+	    /* try to get the sort key using key and keyLen; only grow key
+	       if we need to */
+	    realLen = ucol_getSortKey(etc->collator, chars, nChars, (uint8_t *)key, keyLen);
+	    if (realLen > keyLen) { /* need more space */
+		if (key == keyBuffer) {
+		    key = (char*)slapi_ch_malloc(realLen);
+		} else {
+		    key = (char*)slapi_ch_realloc(key, realLen);
+		}
+		keyLen = realLen; /* new capacity */
+		/* take the length from this call, so that a failure here
+		   (0) is not mistaken for a valid key */
+		realLen = ucol_getSortKey(etc->collator, chars, nChars, (uint8_t *)key, keyLen);
+	    }
+	    if (realLen <= 0) {
+		continue; /* no key for this value */
+	    }
+
+	    bk = (struct berval*) slapi_ch_malloc (sizeof(struct berval));
+	    bk->bv_len = prefixLen + realLen;
+	    bk->bv_val = slapi_ch_malloc (bk->bv_len + 1);
+	    if (prefixLen) {
+		memcpy(bk->bv_val, prefix->bv_val, prefixLen);
+	    }
+	    memcpy(bk->bv_val + prefixLen, key, realLen);
+	    bk->bv_val[bk->bv_len] = '\0';
+	    /* the precision argument of %.*s must be an int */
+	    LDAPDebug (LDAP_DEBUG_FILTER, "collation_index(%.*s) %lu bytes\n",
+		       (int)bk->bv_len, bk->bv_val, (unsigned long)bk->bv_len);
+	    keys = (struct berval**)
+		slapi_ch_realloc ((void*)keys, sizeof(struct berval*) * (keyn + 2));
+	    keys[keyn++] = bk;
+	    keys[keyn] = NULL;
+	}
+	if (chars != charBuffer) { /* realloc'ed, need to free */
+	    slapi_ch_free((void **)&chars);
+	}
+	if (key != keyBuffer) { /* realloc'ed, need to free */
+	    slapi_ch_free_string(&key);
+	}
+    }
+    /* the indexer owns the returned array: drop the previous one */
+    if (etc->ix_keys != NULL) ber_bvecfree (etc->ix_keys);
+    etc->ix_keys = keys;
+    return keys;
+}
+
+static void
+collation_indexer_destroy (indexer_t* ix)
+    /* Destructor for an indexer made by collation_indexer_create:
+       releases the ICU objects, the last key array and the private state. */
+{
+    collation_indexer_t* state = (collation_indexer_t*) ix->ix_etc;
+
+    if (NULL != state->converter) {
+        ucnv_close (state->converter);
+        state->converter = NULL;
+    }
+
+    /* The default collation is left open on purpose - closing it
+       seems to cause problems. */
+    if (0 == state->is_default_collator) {
+        ucol_close (state->collator);
+        state->collator = NULL;
+    }
+
+    if (NULL != state->ix_keys) {
+        ber_bvecfree (state->ix_keys);
+        state->ix_keys = NULL;
+    }
+
+    slapi_ch_free ((void**)&ix->ix_etc);
+    ix->ix_etc = NULL; /* just for hygiene */
+}
+
+static UErrorCode
+s_newNamedLocaleFromComponents(char **locale, const char *lang, const char *country, const char *variant)
+    /* Build an ICU locale name from its components and store it in *locale.
+       The result has the form "ll", "ll_CC", "ll_CC_VARIANT" or, when there
+       is a variant but no country, "ll__VARIANT" (the country slot stays
+       empty so the variant is not mistaken for a country).
+       NULL or empty country/variant are treated as absent.
+       On success returns U_ZERO_ERROR and *locale is a string allocated by
+       PR_smprintf, which the caller must release with PR_smprintf_free.
+       On failure *locale is NULL and the result is U_INVALID_FORMAT_ERROR
+       (no language given) or U_MEMORY_ALLOCATION_ERROR (PR_smprintf
+       returned NULL).
+    */
+{
+    const int hasLang = (lang && *lang);
+    const int hasC = (country && *country);
+    const int hasVar = (variant && *variant);
+
+    *locale = NULL;
+    if (!hasLang) {
+	return U_INVALID_FORMAT_ERROR; /* don't know what else to use here */
+    }
+
+    *locale = PR_smprintf("%s%s%s%s%s", lang,
+			  ((hasC || hasVar) ? "_" : ""), (hasC ? country : ""),
+			  (hasVar ? "_" : ""), (hasVar ? variant : ""));
+    if (NULL == *locale) {
+	/* Do not report success without a name: the caller would then
+	   silently open the default locale instead of the requested one. */
+	return U_MEMORY_ALLOCATION_ERROR;
+    }
+
+    return U_ZERO_ERROR;
+}
+
+indexer_t*
+collation_indexer_create (const char* oid)
+    /* Return a new indexer, based on the collation identified by oid.
+       Return NULL if this can't be done.
+
+       oid is compared case-insensitively with the OIDs recorded by
+       collation_config; only the first match is tried.  The returned
+       indexer's ix_oid points at the first OID registered for the
+       matching profile (the 'official' one), not at a copy.
+    */
+{
+    indexer_t* ix = NULL;
+    const coll_id_t** id = collation_id;
+    char* locale = NULL; /* NULL == default locale */
+    if (id) for (; *id; ++id) {
+	if (!strcasecmp (oid, (*id)->oid)) {
+	    const coll_profile_t* profile = (*id)->profile;
+	    const int is_default = (profile->language == NULL && 
+					 profile->country  == NULL && 
+					 profile->variant  == NULL);
+	    UErrorCode err = U_ZERO_ERROR;
+	    /* locale is still NULL here: every path below leaves the loop,
+	       so a name is built at most once per call */
+	    if ( ! is_default) {
+		err = s_newNamedLocaleFromComponents(&locale,
+						     profile->language,
+						     profile->country,
+						     profile->variant);
+	    }
+	    if (err == U_ZERO_ERROR) {
+		UCollator* coll = ucol_open(locale, &err);
+		/*
+		 * If we found exactly the right collator for this locale,
+		 * or if we found a fallback one, or if we are happy with
+		 * the default, use it.
+		 */
+		if (err == U_ZERO_ERROR || err == U_USING_FALLBACK_WARNING ||
+		    (err == U_USING_DEFAULT_WARNING && is_default)) {
+		    /* Each attribute call gets a fresh status.  Reusing err
+		       would carry a warning from ucol_open into the checks
+		       below (spurious "could not set" messages), and an
+		       error from the first call would make the second one
+		       return without doing anything. */
+		    UErrorCode attrErr = U_ZERO_ERROR;
+		    collation_indexer_t* etc = (collation_indexer_t*)
+		      slapi_ch_calloc (1, sizeof (collation_indexer_t));
+		    ix = (indexer_t*) slapi_ch_calloc (1, sizeof (indexer_t));
+		    ucol_setAttribute (coll, UCOL_STRENGTH, profile->strength, &attrErr);
+		    if (U_FAILURE(attrErr)) {
+			LDAPDebug (LDAP_DEBUG_ANY, "collation_indexer_create: could not "
+				   "set the collator strength for oid %s to %d: err %d\n",
+				   oid, profile->strength, attrErr);
+		    }
+		    attrErr = U_ZERO_ERROR;
+		    ucol_setAttribute (coll, UCOL_DECOMPOSITION_MODE, profile->decomposition, &attrErr);
+		    if (U_FAILURE(attrErr)) {
+			LDAPDebug (LDAP_DEBUG_ANY, "collation_indexer_create: could not "
+				   "set the collator decomposition mode for oid %s to %d: err %d\n",
+				   oid, profile->decomposition, attrErr);
+		    }
+		    etc->collator = coll; /* now owned by the indexer */
+		    etc->is_default_collator = is_default;
+		    /* the first list entry sharing this profile always exists,
+		       because the matched entry itself has this profile */
+		    for (id = collation_id; *id; ++id) {
+			if ((*id)->profile == profile) {
+			    break; /* found the 'official' id */
+			}
+		    }
+		    ix->ix_etc = etc;
+		    ix->ix_oid = (*id)->oid;
+		    ix->ix_index = collation_index;
+		    ix->ix_destroy = collation_indexer_destroy;
+		    break; /* return */
+		} else if (err == U_USING_DEFAULT_WARNING) {
+		    LDAPDebug (LDAP_DEBUG_FILTER, "collation_indexer_create: could not "
+			       "create an indexer for OID %s for locale %s and could not "
+			       "use default locale\n",
+			       oid, (locale ? locale : "(default)"), NULL);
+		} else { /* error */
+		    LDAPDebug (LDAP_DEBUG_FILTER, "collation_indexer_create: could not "
+			       "create an indexer for OID %s for locale %s: err = %d\n",
+			       oid, (locale ? locale : "(default)"), err);
+		}
+		/* not used: close the collator if one was opened */
+		if (coll) {
+		    ucol_close (coll);
+		    coll = NULL;
+		}
+	    }
+	    break; /* failed to create the specified collator */
+	}
+    }
+    if (locale) {
+	PR_smprintf_free(locale);
+	locale = NULL;
+    }
+    return ix;
+}