diff options
author | Peter Jones <pjones@redhat.com> | 2006-06-29 15:58:41 +0000 |
---|---|---|
committer | Peter Jones <pjones@redhat.com> | 2006-06-29 15:58:41 +0000 |
commit | 097e9e63c1431019b1831db549e7f3686aaf34f4 (patch) | |
tree | 7156813ee826601375c54ccc3e5a0505fcf5fd08 /wlite/wlite_mbrtowc.c | |
parent | b7e280956c9fe359efe6cfd6dbeeb658fbe10465 (diff) | |
download | anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.tar.gz anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.tar.xz anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.zip |
- Fix wide character support (#196099, #186701)
Diffstat (limited to 'wlite/wlite_mbrtowc.c')
-rw-r--r-- | wlite/wlite_mbrtowc.c | 234 |
1 files changed, 234 insertions, 0 deletions
diff --git a/wlite/wlite_mbrtowc.c b/wlite/wlite_mbrtowc.c new file mode 100644 index 000000000..ecd6b29c2 --- /dev/null +++ b/wlite/wlite_mbrtowc.c @@ -0,0 +1,234 @@ +/* + * $Id$ + * + * Copyright (C) 2003 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Original Author: Adrian Havill <havill@redhat.com> + * + * Contributors: + */ + +#include <errno.h> // errno, EILSEQ, ERANGE + +#include "wlite_config.h" // wchar_t, NULL, size_t + +#include "wlite_wchar.h" // prototypes +#include "wlite_wctype.h" +#include "wlite_stdlib.h" + +static const long wlite_invalid = -1; +static const long wlite_incomplete = -2; + +#if WLITE_READ_6_BYTE_UTF8_SURROGATE && WLITE_XBMP_CHAR +static int +is_surrogate_hi(wchar_t u) { return u >= 0xD800 && u <= 0xDBFF; } + +static int +is_surrogate_lo(wchar_t u) { return u >= 0xDC00 && u <= 0xDFFF; } + +static wchar_t +make_utc_from_surrogates(wchar_t hi, wchar_t lo) { + wchar_t u = 0; + + u += (hi - 0xD800) * 0x400; + u += lo - 0xDC00 + 0x10000; + return u; +} +#endif + +static int +is_plane_0(wchar_t u) { return (u + 1) > 0x0000 && (u - 1) < 0xFFFF; } + +static void +make_surrogates(unsigned long u, wchar_t *hi, wchar_t *lo) { + if (hi != NULL) + *hi = (u - 0x10000) / 0x400 + 0xD800; + if (lo != NULL) + *lo = (u - 0x10000) % 0x400 + 0xDC00; +} + +static int +is_tail(uint8_t c) { return (c & 0xC0) == 0x80; } + +static long +get_utc(const unsigned char **utf8, size_t *length) { + long u = 0; + const uint8_t *s = NULL; + + if (*length == 0) return wlite_incomplete; + s = *utf8; + if (s[0] < 128) { + /* HEAD/TAIL pattern: 0zzzzzzz + * ... ASCII is just ASCII. Ain't UTF-8 wonderful? + */ + + u |= (*s++ & 0x7F) << 0; + *length -= 1; + } + else if ((s[0] & 0xE0) == 0xC0) { + /* HEAD/TAIL pattern: 110yyyyy 10zzzzzz + * ... most probably a European character or fancy dingbat/sym if we're + * here + */ + + if (*length < 2) return wlite_incomplete; + if (!is_tail(s[1])) return wlite_invalid; + u |= (*s++ & 0x1F) << 6; + u |= *s++ & 0x3F << 0; + *length -= 2; + if (u < 0x0080) u = wlite_invalid; + } + else if ((s[0] & 0xF0) == 0xE0) { + /* HEAD/TAIL/TAIL pattern: 1110xxxx 10yyyyyy 10zzzzzz + * ... most probably a CJK character, but sometimes Euro/Asian/African. + */ + + if (*length < 3) return wlite_incomplete; + if (!is_tail(s[1]) || !is_tail(s[2])) return wlite_invalid; + u |= (*s++ & 0x0F) << 12; + u |= (*s++ & 0x3F) << 6; + u |= (*s++ & 0x3F) << 0; + *length -= 3; + if (u < 0x0800) u = wlite_invalid; + +# if WLITE_READ_6_BYTE_UTF8_SURROGATE && WLITE_XBMP_CHAR + if (is_surrogate_hi(u)) { + /* XXX: see the note below regarding four byte patterns + * + * if you're here with the debugger then you should probably check + * your favorite character encoding converter for brain-damage. + */ + + size_t length_copy = *length; /* save from recursion */ + const unsigned char *s_copy = s; + + long u1 = u; + long u2 = get_utc(&s_copy, &length_copy); + + if (u2 == wlite_incomplete) return u2; + else if (is_surrogate_lo(u2)) { + u = make_utc_from_surrogates(u1, u2); + *length -= 3; + s += 3; + } + } +# endif + } + else if ((s[0] & 0xF8) == 0xF0) { + /* HEAD/TAIL/TAIL/TAIL bit pattern: 11110www 10xxxxxx 10yyyyyy 10zzzzzz + * ... most probably a freak CJK character and the string is testing + * ... Unicode conformance. Either that or you're attempting to process + * ... Klingon. + * + * XXX: if you're here with the debugger and you're working with + * real-world text, it's probably suspect (I doubt you have any fonts + * to represent the characters in this range anyway) + */ + + if (*length < 4) return wlite_incomplete; // incomplete + if (!is_tail(s[1]) || !is_tail(s[2]) || !is_tail(s[3])) + return wlite_invalid; + u |= (*s++ & 0x07) << 18; + u |= (*s++ & 0x3F) << 12; + u |= (*s++ & 0x3F) << 6; + u |= (*s++ & 0x3F) << 0; + *length -= 4; + if (u <= 0xFFFF) { + u = wlite_invalid; + } + } + else { + /* we either got a five or six byte UTF-8 sequence, a "decapitated" + * (all tail with no head byte-- sometimes raw Latin-1 slipped in a + * UTF-8 string) UTF-8 sequence, or we got 0xFE or 0xFF in the byte + * stream (always illegal in UTF-8-- which means these two can be used + * as private non-exported sentinals in applications. Did I mention + * that UTF-8 is wonderful?) + */ + + u = wlite_invalid; + while (*length != 0 && (is_tail(*s) || *s >= 0xF8)) { + --*length; + ++s; + } + } + *utf8 = s; + return u; +} + +size_t +wlite_mbrtowc(wchar_t *c, const char *s, size_t n, wlite_mbstate_t *ps) { + static wlite_mbstate_t internal = { 0 }; + size_t consumed = 0, remaining = n; + const unsigned char *utf8 = (const unsigned char *) s; + long u = 0; + + if (ps == NULL) + ps = &internal; + if (s != NULL) { + u = get_utc(&utf8, &remaining); + if (u == wlite_incomplete) { + ps->flags_ |= WLITE_MBSTATE_INCOMPLETE_; + ps->flags_ &= ~WLITE_MBSTATE_ERROR_; + return (size_t) -2; + } + else if (u == wlite_invalid) { + ps->flags_ &= ~WLITE_MBSTATE_INCOMPLETE_; + ps->flags_ |= WLITE_MBSTATE_ERROR_; + errno = EILSEQ; + return (size_t) -1; + } + else { + ps->wcout_ = 0; + if (c != NULL) { + if ((ps->flags_ & WLITE_MBSTATE_SURROGATE_) && !is_plane_0(u)) { + wchar_t hi, lo; + + make_surrogates(u, &hi, &lo); + /* XXX: Std C does not allow writing more than one wide + * character to c; this is a non-standard extension for + * internal API use. + */ + + c[0] = (wchar_t) hi; + c[1] = (wchar_t) lo; + ps->wcout_ = 2; + } + else { + *c = (wchar_t) u; + ps->wcout_ = 1; + } + } + if (u == 0) { + wlite_0_mbstate_(ps); + return 0; + } + consumed = n - remaining; + } + } + else if (s == NULL) { + if (ps->flags_ & WLITE_MBSTATE_INCOMPLETE_) { + errno = EILSEQ; + return (size_t) -1; + } + else { + wlite_0_mbstate_(ps); + return WLITE_MBS_SHIFT_STATES_; + } + } + return consumed; +} |