diff options
| author | Noriko Hosoi <nhosoi@redhat.com> | 2009-01-12 19:18:38 +0000 |
|---|---|---|
| committer | Noriko Hosoi <nhosoi@redhat.com> | 2009-01-12 19:18:38 +0000 |
| commit | 22eb878880eb98336e0c173d6d1db9e1d9c9da07 (patch) | |
| tree | a233f15c5515d95fc8d05ea9ad8f6779c7b49349 | |
| parent | 7ce114f9d999fb8e80813d97c370d4fa8541b154 (diff) | |
| download | ds-22eb878880eb98336e0c173d6d1db9e1d9c9da07.tar.gz ds-22eb878880eb98336e0c173d6d1db9e1d9c9da07.tar.xz ds-22eb878880eb98336e0c173d6d1db9e1d9c9da07.zip | |
Resolves: #460613
Summary: Approximate Search '~=' Returns unexpected result
Change description: increase the maximum length of the "phonetic" string from 4
to 6. A length of 4 is sometimes too short to distinguish long words. For
instance, with no length limit the sample string Queensland is converted to
KNSLNT and Consulting to KNSLTNK. When both are cut off before the 5th
character (leaving KNSL), the 2 strings are considered to sound like each
other; cut off at 6 characters they would be KNSLNT and KNSLTN, which differ.
| -rw-r--r-- | ldap/servers/plugins/syntaxes/phonetic.c | 638 |
1 files changed, 319 insertions, 319 deletions
diff --git a/ldap/servers/plugins/syntaxes/phonetic.c b/ldap/servers/plugins/syntaxes/phonetic.c index a974ebda..4801cc70 100644 --- a/ldap/servers/plugins/syntaxes/phonetic.c +++ b/ldap/servers/plugins/syntaxes/phonetic.c @@ -68,7 +68,7 @@ utf8iswordbreak( const char* s ) case 0x00A0: /* non-breaking space */ case 0x3000: /* ideographic space */ case 0xFEFF: /* zero-width non-breaking space */ - return 1; + return 1; default: break; } return 0; @@ -77,61 +77,61 @@ utf8iswordbreak( const char* s ) char * first_word( char *s ) { - if ( s == NULL ) { - return( NULL ); - } - - while ( iswordbreak( s ) ) { - if ( *s == '\0' ) { - return( NULL ); - } else { - LDAP_UTF8INC( s ); - } - } - - return( s ); + if ( s == NULL ) { + return( NULL ); + } + + while ( iswordbreak( s ) ) { + if ( *s == '\0' ) { + return( NULL ); + } else { + LDAP_UTF8INC( s ); + } + } + + return( s ); } char * next_word( char *s ) { - if ( s == NULL ) { - return( NULL ); - } - - while ( ! iswordbreak( s ) ) { - LDAP_UTF8INC( s ); - } - - while ( iswordbreak( s ) ) { - if ( *s == '\0' ) { - return( NULL ); - } else { - LDAP_UTF8INC( s ); - } - } - - return( s ); + if ( s == NULL ) { + return( NULL ); + } + + while ( ! 
iswordbreak( s ) ) { + LDAP_UTF8INC( s ); + } + + while ( iswordbreak( s ) ) { + if ( *s == '\0' ) { + return( NULL ); + } else { + LDAP_UTF8INC( s ); + } + } + + return( s ); } char * word_dup( char *w ) { - char *s, *ret; - char save; + char *s, *ret; + char save; - for ( s = w; !iswordbreak( s ); LDAP_UTF8INC( s )) - ; /* NULL */ - save = *s; - *s = '\0'; - ret = slapi_ch_strdup( w ); - *s = save; + for ( s = w; !iswordbreak( s ); LDAP_UTF8INC( s )) + ; /* NULL */ + save = *s; + *s = '\0'; + ret = slapi_ch_strdup( w ); + *s = save; - return( ret ); + return( ret ); } #ifndef MAXPHONEMELEN -#define MAXPHONEMELEN 4 +#define MAXPHONEMELEN 6 #endif #if defined(SOUNDEX) @@ -140,11 +140,11 @@ word_dup( char *w ) char * phonetic( char *s ) { - char code, adjacent, ch; - char *p; - char **c; - int i, cmax; - char phoneme[MAXPHONEMELEN + 1]; + char code, adjacent, ch; + char *p; + char **c; + int i, cmax; + char phoneme[MAXPHONEMELEN + 1]; p = s; if ( p == NULL || *p == '\0' ) { @@ -152,18 +152,18 @@ phonetic( char *s ) } adjacent = '0'; - phoneme[0] = TOUPPER(*p); + phoneme[0] = TOUPPER(*p); - phoneme[1] = '\0'; + phoneme[1] = '\0'; for ( i = 0; i < 99 && (! iswordbreak(p)); LDAP_UTF8INC( p )) { - ch = TOUPPER (*p); + ch = TOUPPER (*p); code = '0'; switch (ch) { case 'B': case 'F': - case 'P': + case 'P': case 'V': code = (adjacent != '1') ? 
'1' : '0'; break; @@ -196,18 +196,18 @@ phonetic( char *s ) } if ( i == 0 ) { - adjacent = code; - i++; - } else if ( code != '0' ) { - if ( i == MAXPHONEMELEN ) - break; + adjacent = code; + i++; + } else if ( code != '0' ) { + if ( i == MAXPHONEMELEN ) + break; adjacent = phoneme[i] = code; i++; } } - if ( i > 0 ) - phoneme[i] = '\0'; + if ( i > 0 ) + phoneme[i] = '\0'; return( slapi_ch_strdup( phoneme ) ); } @@ -224,274 +224,274 @@ phonetic( char *s ) /* Character coding array */ static char vsvfn[26] = { - 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, - /* A B C D E F G H I J K L M */ - 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0}; - /* N O P Q R S T U V W X Y Z */ + 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, + /* A B C D E F G H I J K L M */ + 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0}; + /* N O P Q R S T U V W X Y Z */ /* Macros to access character coding array */ -#define vowel(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 1) /* AEIOU */ -#define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */ -#define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */ -#define frontv(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 8) /* EIY */ -#define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */ +#define vowel(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 1) /* AEIOU */ +#define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */ +#define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */ +#define frontv(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 8) /* EIY */ +#define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */ char * phonetic( char *Word ) { - char *n, *n_start, *n_end; /* pointers to string */ - char *metaph_end; /* pointers to metaph */ - char ntrans[42]; /* word with uppercase letters */ - int KSflag; /* state flag for X -> KS */ - char buf[MAXPHONEMELEN + 2]; - char *Metaph; - - /* - * Copy Word to internal buffer, dropping non-alphabetic characters - * and converting to upper case - */ - n = ntrans + 4; n_end = ntrans + 35; - while (!iswordbreak( 
Word ) && n < n_end) { - if (isascii(*Word)) { - if (isalpha(*Word)) { - *n++ = TOUPPER(*Word); - } - ++Word; - } else { - auto const size_t len = LDAP_UTF8COPY(n, Word); - n += len; Word += len; - } - } - Metaph = buf; - *Metaph = '\0'; - if (n == ntrans + 4) { - return( slapi_ch_strdup( buf ) ); /* Return if null */ - } - n_end = n; /* Set n_end to end of string */ - - /* ntrans[0] will always be == 0 */ - ntrans[0] = '\0'; - ntrans[1] = '\0'; - ntrans[2] = '\0'; - ntrans[3] = '\0'; - *n++ = 0; - *n++ = 0; - *n++ = 0; - *n = 0; /* Pad with nulls */ - n = ntrans + 4; /* Assign pointer to start */ - - /* Check for PN, KN, GN, AE, WR, WH, and X at start */ - switch (*n) { - case 'P': - case 'K': - case 'G': - /* 'PN', 'KN', 'GN' becomes 'N' */ - if (*(n + 1) == 'N') - *n++ = 0; - break; - case 'A': - /* 'AE' becomes 'E' */ - if (*(n + 1) == 'E') - *n++ = 0; - break; - case 'W': - /* 'WR' becomes 'R', and 'WH' to 'H' */ - if (*(n + 1) == 'R') - *n++ = 0; - else if (*(n + 1) == 'H') { - *(n + 1) = *n; - *n++ = 0; - } - break; - case 'X': - /* 'X' becomes 'S' */ - *n = 'S'; - break; - } - - /* - * Now, loop step through string, stopping at end of string or when - * the computed 'metaph' is MAXPHONEMELEN characters long - */ - - KSflag = 0; /* state flag for KS translation */ - for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n; - n <= n_end && Metaph < metaph_end; n++) { - if (KSflag) { - KSflag = 0; - *Metaph++ = 'S'; - } else if (!isascii(*n)) { - *Metaph++ = *n; - } else { - /* Drop duplicates except for CC */ - if (*(n - 1) == *n && *n != 'C') - continue; - /* Check for F J L M N R or first letter vowel */ - if (same(*n) || (n == n_start && vowel(*n))) { - *Metaph++ = *n; - } else { - switch (*n) { - case 'B': - - /* - * B unless in -MB - */ - if (n < (n_end - 1) && *(n - 1) != 'M') { - *Metaph++ = *n; - } - break; - case 'C': - - /* - * X if in -CIA-, -CH- else S if in - * -CI-, -CE-, -CY- else dropped if - * in -SCI-, -SCE-, -SCY- else K - */ - if (*(n - 1) != 
'S' || !frontv(*(n + 1))) { - if (*(n + 1) == 'I' && *(n + 2) == 'A') { - *Metaph++ = 'X'; - } else if (frontv(*(n + 1))) { - *Metaph++ = 'S'; - } else if (*(n + 1) == 'H') { - *Metaph++ = ((n == n_start && !vowel(*(n + 2))) - || *(n - 1) == 'S') - ? (char) 'K' : (char) 'X'; - } else { - *Metaph++ = 'K'; - } - } - break; - case 'D': - - /* - * J if in DGE or DGI or DGY else T - */ - *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2))) - ? (char) 'J' : (char) 'T'; - break; - case 'G': - - /* - * F if in -GH and not B--GH, D--GH, - * -H--GH, -H---GH else dropped if - * -GNED, -GN, -DGE-, -DGI-, -DGY- - * else J if in -GE-, -GI-, -GY- and - * not GG else K - */ - if ((*(n + 1) != 'J' || vowel(*(n + 2))) && - (*(n + 1) != 'N' || ((n + 1) < n_end && - (*(n + 2) != 'E' || *(n + 3) != 'D'))) && - (*(n - 1) != 'D' || !frontv(*(n + 1)))) - *Metaph++ = (frontv(*(n + 1)) && - *(n + 2) != 'G') ? (char) 'G' : (char) 'K'; - else if (*(n + 1) == 'H' && !noghf(*(n - 3)) && - *(n - 4) != 'H') - *Metaph++ = 'F'; - break; - case 'H': - - /* - * H if before a vowel and not after - * C, G, P, S, T else dropped - */ - if (!varson(*(n - 1)) && (!vowel(*(n - 1)) || - vowel(*(n + 1)))) - *Metaph++ = 'H'; - break; - case 'K': - - /* - * dropped if after C else K - */ - if (*(n - 1) != 'C') - *Metaph++ = 'K'; - break; - case 'P': - - /* - * F if before H, else P - */ - *Metaph++ = *(n + 1) == 'H' ? - (char) 'F' : (char) 'P'; - break; - case 'Q': - - /* - * K - */ - *Metaph++ = 'K'; - break; - case 'S': - - /* - * X in -SH-, -SIO- or -SIA- else S - */ - *Metaph++ = (*(n + 1) == 'H' || - (*(n + 1) == 'I' && (*(n + 2) == 'O' || - *(n + 2) == 'A'))) - ? 
(char) 'X' : (char) 'S'; - break; - case 'T': - - /* - * X in -TIA- or -TIO- else 0 (zero) - * before H else dropped if in -TCH- - * else T - */ - if (*(n + 1) == 'I' && (*(n + 2) == 'O' || - *(n + 2) == 'A')) - *Metaph++ = 'X'; - else if (*(n + 1) == 'H') - *Metaph++ = '0'; - else if (*(n + 1) != 'C' || *(n + 2) != 'H') - *Metaph++ = 'T'; - break; - case 'V': - - /* - * F - */ - *Metaph++ = 'F'; - break; - case 'W': - - /* - * W after a vowel, else dropped - */ - case 'Y': - - /* - * Y unless followed by a vowel - */ - if (vowel(*(n + 1))) - *Metaph++ = *n; - break; - case 'X': - - /* - * KS - */ - if (n == n_start) - *Metaph++ = 'S'; - else { - *Metaph++ = 'K'; /* Insert K, then S */ - KSflag = 1; - } - break; - case 'Z': - - /* - * S - */ - *Metaph++ = 'S'; - break; - } - } - } - } - - *Metaph = 0; /* Null terminate */ - return( slapi_ch_strdup( buf ) ); + char *n, *n_start, *n_end; /* pointers to string */ + char *metaph_end; /* pointers to metaph */ + char ntrans[42]; /* word with uppercase letters */ + int KSflag; /* state flag for X -> KS */ + char buf[MAXPHONEMELEN + 2]; + char *Metaph; + + /* + * Copy Word to internal buffer, dropping non-alphabetic characters + * and converting to upper case + */ + n = ntrans + 4; n_end = ntrans + 35; + while (!iswordbreak( Word ) && n < n_end) { + if (isascii(*Word)) { + if (isalpha(*Word)) { + *n++ = TOUPPER(*Word); + } + ++Word; + } else { + auto const size_t len = LDAP_UTF8COPY(n, Word); + n += len; Word += len; + } + } + Metaph = buf; + *Metaph = '\0'; + if (n == ntrans + 4) { + return( slapi_ch_strdup( buf ) ); /* Return if null */ + } + n_end = n; /* Set n_end to end of string */ + + /* ntrans[0] will always be == 0 */ + ntrans[0] = '\0'; + ntrans[1] = '\0'; + ntrans[2] = '\0'; + ntrans[3] = '\0'; + *n++ = 0; + *n++ = 0; + *n++ = 0; + *n = 0; /* Pad with nulls */ + n = ntrans + 4; /* Assign pointer to start */ + + /* Check for PN, KN, GN, AE, WR, WH, and X at start */ + switch (*n) { + case 'P': + case 'K': + case 
'G': + /* 'PN', 'KN', 'GN' becomes 'N' */ + if (*(n + 1) == 'N') + *n++ = 0; + break; + case 'A': + /* 'AE' becomes 'E' */ + if (*(n + 1) == 'E') + *n++ = 0; + break; + case 'W': + /* 'WR' becomes 'R', and 'WH' to 'H' */ + if (*(n + 1) == 'R') + *n++ = 0; + else if (*(n + 1) == 'H') { + *(n + 1) = *n; + *n++ = 0; + } + break; + case 'X': + /* 'X' becomes 'S' */ + *n = 'S'; + break; + } + + /* + * Now, loop step through string, stopping at end of string or when + * the computed 'metaph' is MAXPHONEMELEN characters long + */ + + KSflag = 0; /* state flag for KS translation */ + for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n; + n <= n_end && Metaph < metaph_end; n++) { + if (KSflag) { + KSflag = 0; + *Metaph++ = 'S'; + } else if (!isascii(*n)) { + *Metaph++ = *n; + } else { + /* Drop duplicates except for CC */ + if (*(n - 1) == *n && *n != 'C') + continue; + /* Check for F J L M N R or first letter vowel */ + if (same(*n) || (n == n_start && vowel(*n))) { + *Metaph++ = *n; + } else { + switch (*n) { + case 'B': + + /* + * B unless in -MB + */ + if (n < (n_end - 1) && *(n - 1) != 'M') { + *Metaph++ = *n; + } + break; + case 'C': + + /* + * X if in -CIA-, -CH- else S if in + * -CI-, -CE-, -CY- else dropped if + * in -SCI-, -SCE-, -SCY- else K + */ + if (*(n - 1) != 'S' || !frontv(*(n + 1))) { + if (*(n + 1) == 'I' && *(n + 2) == 'A') { + *Metaph++ = 'X'; + } else if (frontv(*(n + 1))) { + *Metaph++ = 'S'; + } else if (*(n + 1) == 'H') { + *Metaph++ = ((n == n_start && !vowel(*(n + 2))) + || *(n - 1) == 'S') + ? (char) 'K' : (char) 'X'; + } else { + *Metaph++ = 'K'; + } + } + break; + case 'D': + + /* + * J if in DGE or DGI or DGY else T + */ + *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2))) + ? 
(char) 'J' : (char) 'T'; + break; + case 'G': + + /* + * F if in -GH and not B--GH, D--GH, + * -H--GH, -H---GH else dropped if + * -GNED, -GN, -DGE-, -DGI-, -DGY- + * else J if in -GE-, -GI-, -GY- and + * not GG else K + */ + if ((*(n + 1) != 'J' || vowel(*(n + 2))) && + (*(n + 1) != 'N' || ((n + 1) < n_end && + (*(n + 2) != 'E' || *(n + 3) != 'D'))) && + (*(n - 1) != 'D' || !frontv(*(n + 1)))) + *Metaph++ = (frontv(*(n + 1)) && + *(n + 2) != 'G') ? (char) 'G' : (char) 'K'; + else if (*(n + 1) == 'H' && !noghf(*(n - 3)) && + *(n - 4) != 'H') + *Metaph++ = 'F'; + break; + case 'H': + + /* + * H if before a vowel and not after + * C, G, P, S, T else dropped + */ + if (!varson(*(n - 1)) && (!vowel(*(n - 1)) || + vowel(*(n + 1)))) + *Metaph++ = 'H'; + break; + case 'K': + + /* + * dropped if after C else K + */ + if (*(n - 1) != 'C') + *Metaph++ = 'K'; + break; + case 'P': + + /* + * F if before H, else P + */ + *Metaph++ = *(n + 1) == 'H' ? + (char) 'F' : (char) 'P'; + break; + case 'Q': + + /* + * K + */ + *Metaph++ = 'K'; + break; + case 'S': + + /* + * X in -SH-, -SIO- or -SIA- else S + */ + *Metaph++ = (*(n + 1) == 'H' || + (*(n + 1) == 'I' && (*(n + 2) == 'O' || + *(n + 2) == 'A'))) + ? 
(char) 'X' : (char) 'S'; + break; + case 'T': + + /* + * X in -TIA- or -TIO- else 0 (zero) + * before H else dropped if in -TCH- + * else T + */ + if (*(n + 1) == 'I' && (*(n + 2) == 'O' || + *(n + 2) == 'A')) + *Metaph++ = 'X'; + else if (*(n + 1) == 'H') + *Metaph++ = '0'; + else if (*(n + 1) != 'C' || *(n + 2) != 'H') + *Metaph++ = 'T'; + break; + case 'V': + + /* + * F + */ + *Metaph++ = 'F'; + break; + case 'W': + + /* + * W after a vowel, else dropped + */ + case 'Y': + + /* + * Y unless followed by a vowel + */ + if (vowel(*(n + 1))) + *Metaph++ = *n; + break; + case 'X': + + /* + * KS + */ + if (n == n_start) + *Metaph++ = 'S'; + else { + *Metaph++ = 'K'; /* Insert K, then S */ + KSflag = 1; + } + break; + case 'Z': + + /* + * S + */ + *Metaph++ = 'S'; + break; + } + } + } + } + + *Metaph = 0; /* Null terminate */ + return( slapi_ch_strdup( buf ) ); } #endif /* METAPHONE */ |
