diff options
Diffstat (limited to 'contrib/idn/idnkit-1.0-src/lib/unormalize.c')
-rw-r--r-- | contrib/idn/idnkit-1.0-src/lib/unormalize.c | 413 |
1 files changed, 413 insertions, 0 deletions
diff --git a/contrib/idn/idnkit-1.0-src/lib/unormalize.c b/contrib/idn/idnkit-1.0-src/lib/unormalize.c new file mode 100644 index 0000000..f741724 --- /dev/null +++ b/contrib/idn/idnkit-1.0-src/lib/unormalize.c @@ -0,0 +1,413 @@ +#ifndef lint +static char *rcsid = "$Id: unormalize.c,v 1.1.1.1 2003/06/04 00:26:43 marka Exp $"; +#endif + +/* + * Copyright (c) 2000,2001,2002 Japan Network Information Center. + * All rights reserved. + * + * By using this file, you agree to the terms and conditions set forth bellow. + * + * LICENSE TERMS AND CONDITIONS + * + * The following License Terms and Conditions apply, unless a different + * license is obtained from Japan Network Information Center ("JPNIC"), + * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, + * Chiyoda-ku, Tokyo 101-0047, Japan. + * + * 1. Use, Modification and Redistribution (including distribution of any + * modified or derived work) in source and/or binary forms is permitted + * under this License Terms and Conditions. + * + * 2. Redistribution of source code must retain the copyright notices as they + * appear in each source code file, this License Terms and Conditions. + * + * 3. Redistribution in binary form must reproduce the Copyright Notice, + * this License Terms and Conditions, in the documentation and/or other + * materials provided with the distribution. For the purposes of binary + * distribution the "Copyright Notice" refers to the following language: + * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." + * + * 4. The name of JPNIC may not be used to endorse or promote products + * derived from this Software without specific prior written approval of + * JPNIC. + * + * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + */ + +#include <config.h> + +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +#include <idn/result.h> +#include <idn/assert.h> +#include <idn/logmacro.h> +#include <idn/ucs4.h> +#include <idn/unicode.h> +#include <idn/unormalize.h> +#include <idn/debug.h> + +#if !defined(HAVE_MEMMOVE) && defined(HAVE_BCOPY) +#define memmove(a,b,c) bcopy((char *)(b),(char *)(a),(int)(c)) +#endif + +#define WORKBUF_SIZE 128 +#define WORKBUF_SIZE_MAX 10000 + +typedef struct { + idn__unicode_version_t version; /* Unicode version */ + int cur; /* pointing now processing character */ + int last; /* pointing just after the last character */ + int size; /* size of UCS and CLASS array */ + unsigned long *ucs4; /* UCS-4 characters */ + int *class; /* and their canonical classes */ + unsigned long ucs4_buf[WORKBUF_SIZE]; /* local buffer */ + int class_buf[WORKBUF_SIZE]; /* ditto */ +} workbuf_t; + +static idn_result_t normalize(idn__unicode_version_t version, + int do_composition, int compat, + const unsigned long *from, + unsigned long *to, size_t tolen); +static idn_result_t decompose(workbuf_t *wb, unsigned long c, int compat); +static void get_class(workbuf_t *wb); +static void reorder(workbuf_t *wb); +static void compose(workbuf_t *wb); +static idn_result_t flush_before_cur(workbuf_t *wb, + unsigned long **top, size_t *tolenp); +static void workbuf_init(workbuf_t *wb); +static void workbuf_free(workbuf_t *wb); +static idn_result_t workbuf_extend(workbuf_t *wb); +static idn_result_t workbuf_append(workbuf_t *wb, unsigned long c); +static void workbuf_shift(workbuf_t *wb, int shift); +static void workbuf_removevoid(workbuf_t *wb); + +idn_result_t +idn__unormalize_formkc(idn__unicode_version_t version, + const unsigned long *from, unsigned long *to, + size_t tolen) { + assert(version != NULL && from != NULL && to != NULL && tolen >= 0); + TRACE(("idn__unormalize_formkc(from=\"%s\", tolen=%d)\n", + idn__debug_ucs4xstring(from, 50), tolen)); + return (normalize(version, 1, 1, from, to, tolen)); +} + +static idn_result_t +normalize(idn__unicode_version_t version, int do_composition, int compat, + const unsigned long *from, unsigned long *to, size_t tolen) { + workbuf_t wb; + idn_result_t r = idn_success; + + /* + * Initialize working buffer. + */ + workbuf_init(&wb); + wb.version = version; + + while (*from != '\0') { + unsigned long c; + + assert(wb.cur == wb.last); + + /* + * Get one character from 'from'. + */ + c = *from++; + + /* + * Decompose it. + */ + if ((r = decompose(&wb, c, compat)) != idn_success) + goto ret; + + /* + * Get canonical class. + */ + get_class(&wb); + + /* + * Reorder & compose. + */ + for (; wb.cur < wb.last; wb.cur++) { + if (wb.cur == 0) { + continue; + } else if (wb.class[wb.cur] > 0) { + /* + * This is not a starter. Try reordering. + * Note that characters up to it are + * already in canonical order. + */ + reorder(&wb); + continue; + } + + /* + * This is a starter character, and there are + * some characters before it. Those characters + * have been reordered properly, and + * ready for composition. + */ + if (do_composition && wb.class[0] == 0) + compose(&wb); + + /* + * If CUR points to a starter character, + * then process of characters before CUR are + * already finished, because any further + * reordering/composition for them are blocked + * by the starter CUR points. + */ + if (wb.cur > 0 && wb.class[wb.cur] == 0) { + /* Flush everything before CUR. */ + r = flush_before_cur(&wb, &to, &tolen); + if (r != idn_success) + goto ret; + } + } + } + + if (r == idn_success) { + if (do_composition && wb.cur > 0 && wb.class[0] == 0) { + /* + * There is some characters left in WB. + * They are ordered, but not composed yet. + * Now CUR points just after the last character in WB, + * and since compose() tries to compose characters + * between top and CUR inclusive, we must make CUR + * one character back during compose(). + */ + wb.cur--; + compose(&wb); + wb.cur++; + } + /* + * Call this even when WB.CUR == 0, to make TO + * NUL-terminated. + */ + r = flush_before_cur(&wb, &to, &tolen); + if (r != idn_success) + goto ret; + } + + if (tolen <= 0) { + r = idn_buffer_overflow; + goto ret; + } + *to = '\0'; + +ret: + workbuf_free(&wb); + return (r); +} + +static idn_result_t +decompose(workbuf_t *wb, unsigned long c, int compat) { + idn_result_t r; + int dec_len; + +again: + r = idn__unicode_decompose(wb->version, compat, wb->ucs4 + wb->last, + wb->size - wb->last, c, &dec_len); + switch (r) { + case idn_success: + wb->last += dec_len; + return (idn_success); + case idn_notfound: + return (workbuf_append(wb, c)); + case idn_buffer_overflow: + if ((r = workbuf_extend(wb)) != idn_success) + return (r); + if (wb->size > WORKBUF_SIZE_MAX) { + WARNING(("idn__unormalize_form*: " + "working buffer too large\n")); + return (idn_nomemory); + } + goto again; + default: + return (r); + } + /* NOTREACHED */ +} + +static void +get_class(workbuf_t *wb) { + int i; + + for (i = wb->cur; i < wb->last; i++) + wb->class[i] = idn__unicode_canonicalclass(wb->version, + wb->ucs4[i]); +} + +static void +reorder(workbuf_t *wb) { + unsigned long c; + int i; + int class; + + assert(wb != NULL); + + i = wb->cur; + c = wb->ucs4[i]; + class = wb->class[i]; + + while (i > 0 && wb->class[i - 1] > class) { + wb->ucs4[i] = wb->ucs4[i - 1]; + wb->class[i] =wb->class[i - 1]; + i--; + wb->ucs4[i] = c; + wb->class[i] = class; + } +} + +static void +compose(workbuf_t *wb) { + int cur; + unsigned long *ucs4; + int *class; + int last_class; + int nvoids; + int i; + idn__unicode_version_t ver; + + assert(wb != NULL && wb->class[0] == 0); + + cur = wb->cur; + ucs4 = wb->ucs4; + class = wb->class; + ver = wb->version; + + /* + * If there are no decomposition sequence that begins with + * the top character, composition is impossible. + */ + if (!idn__unicode_iscompositecandidate(ver, ucs4[0])) + return; + + last_class = 0; + nvoids = 0; + for (i = 1; i <= cur; i++) { + unsigned long c; + int cl = class[i]; + + if ((last_class < cl || cl == 0) && + idn__unicode_compose(ver, ucs4[0], ucs4[i], + &c) == idn_success) { + /* + * Replace the top character with the composed one. + */ + ucs4[0] = c; + class[0] = idn__unicode_canonicalclass(ver, c); + + class[i] = -1; /* void this character */ + nvoids++; + } else { + last_class = cl; + } + } + + /* Purge void characters, if any. */ + if (nvoids > 0) + workbuf_removevoid(wb); +} + +static idn_result_t +flush_before_cur(workbuf_t *wb, unsigned long **top, size_t *tolenp) { + if (*tolenp < wb->cur) + return (idn_buffer_overflow); + + memcpy(*top, wb->ucs4, sizeof(**top) * wb->cur); + *top += wb->cur; + *tolenp -= wb->cur; + workbuf_shift(wb, wb->cur); + + return (idn_success); +} + +static void +workbuf_init(workbuf_t *wb) { + wb->cur = 0; + wb->last = 0; + wb->size = WORKBUF_SIZE; + wb->ucs4 = wb->ucs4_buf; + wb->class = wb->class_buf; +} + +static void +workbuf_free(workbuf_t *wb) { + if (wb->ucs4 != wb->ucs4_buf) { + free(wb->ucs4); + free(wb->class); + } +} + +static idn_result_t +workbuf_extend(workbuf_t *wb) { + int newsize = wb->size * 3; + + if (wb->ucs4 == wb->ucs4_buf) { + wb->ucs4 = malloc(sizeof(wb->ucs4[0]) * newsize); + wb->class = malloc(sizeof(wb->class[0]) * newsize); + } else { + wb->ucs4 = realloc(wb->ucs4, sizeof(wb->ucs4[0]) * newsize); + wb->class = realloc(wb->class, sizeof(wb->class[0]) * newsize); + } + if (wb->ucs4 == NULL || wb->class == NULL) + return (idn_nomemory); + else + return (idn_success); +} + +static idn_result_t +workbuf_append(workbuf_t *wb, unsigned long c) { + idn_result_t r; + + if (wb->last >= wb->size && (r = workbuf_extend(wb)) != idn_success) + return (r); + wb->ucs4[wb->last++] = c; + return (idn_success); +} + +static void +workbuf_shift(workbuf_t *wb, int shift) { + int nmove; + + assert(wb != NULL && wb->cur >= shift); + + nmove = wb->last - shift; + (void)memmove(&wb->ucs4[0], &wb->ucs4[shift], + nmove * sizeof(wb->ucs4[0])); + (void)memmove(&wb->class[0], &wb->class[shift], + nmove * sizeof(wb->class[0])); + wb->cur -= shift; + wb->last -= shift; +} + +static void +workbuf_removevoid(workbuf_t *wb) { + int i, j; + int last = wb->last; + + for (i = j = 0; i < last; i++) { + if (wb->class[i] >= 0) { + if (j < i) { + wb->ucs4[j] = wb->ucs4[i]; + wb->class[j] = wb->class[i]; + } + j++; + } + } + wb->cur -= last - j; + wb->last = j; +} |