summary | refs | log | tree | commit | diff | stats
path: root/wlite/wlite_mbrtowc.c
diff options
context:
space:
mode:
author: Peter Jones <pjones@redhat.com> 2006-06-29 15:58:41 +0000
committer: Peter Jones <pjones@redhat.com> 2006-06-29 15:58:41 +0000
commit: 097e9e63c1431019b1831db549e7f3686aaf34f4 (patch)
tree: 7156813ee826601375c54ccc3e5a0505fcf5fd08 /wlite/wlite_mbrtowc.c
parent: b7e280956c9fe359efe6cfd6dbeeb658fbe10465 (diff)
download: anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.tar.gz
anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.tar.xz
anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.zip
- Fix wide character support (#196099, #186701)
Diffstat (limited to 'wlite/wlite_mbrtowc.c')
-rw-r--r-- wlite/wlite_mbrtowc.c 234
1 files changed, 234 insertions, 0 deletions
diff --git a/wlite/wlite_mbrtowc.c b/wlite/wlite_mbrtowc.c
new file mode 100644
index 000000000..ecd6b29c2
--- /dev/null
+++ b/wlite/wlite_mbrtowc.c
@@ -0,0 +1,234 @@
+/*
+ * $Id$
+ *
+ * Copyright (C) 2003 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Original Author: Adrian Havill <havill@redhat.com>
+ *
+ * Contributors:
+ */
+
+#include <errno.h> // errno, EILSEQ, ERANGE
+
+#include "wlite_config.h" // wchar_t, NULL, size_t
+
+#include "wlite_wchar.h" // prototypes
+#include "wlite_wctype.h"
+#include "wlite_stdlib.h"
+
+static const long wlite_invalid = -1; /* get_utc() result: bytes are not an acceptable sequence */
+static const long wlite_incomplete = -2; /* get_utc() result: input ended before the sequence did */
+
+#if WLITE_READ_6_BYTE_UTF8_SURROGATE && WLITE_XBMP_CHAR
+/* Nonzero when u lies in the UTF-16 high (leading) surrogate range
+ * U+D800..U+DBFF.
+ */
+static int
+is_surrogate_hi(wchar_t u) {
+    if (u < 0xD800)
+        return 0;
+    return u <= 0xDBFF;
+}
+
+/* Nonzero when u lies in the UTF-16 low (trailing) surrogate range
+ * U+DC00..U+DFFF.
+ */
+static int
+is_surrogate_lo(wchar_t u) {
+    if (u > 0xDFFF)
+        return 0;
+    return u >= 0xDC00;
+}
+
+/* Combine a UTF-16 surrogate pair into the code point it encodes:
+ * the high surrogate supplies the upper ten bits, the low surrogate the
+ * lower ten, and the result is offset by 0x10000.
+ *
+ * No range checking is done here; callers are expected to have tested the
+ * arguments with is_surrogate_hi() / is_surrogate_lo() first.
+ */
+static wchar_t
+make_utc_from_surrogates(wchar_t hi, wchar_t lo) {
+    wchar_t upper = (hi - 0xD800) * 0x400;
+    wchar_t lower = lo - 0xDC00 + 0x10000;
+
+    return upper + lower;
+}
+#endif
+
+/* Nonzero when u is a code point in plane 0 (U+0000..U+FFFF).
+ *
+ * The test is done on the value converted to unsigned long so that it is
+ * correct whether wchar_t is signed or unsigned: a negative signed value
+ * converts to a very large number and is rejected, and no "u - 1" / "u + 1"
+ * arithmetic is performed that could wrap (unsigned wchar_t, u == 0) or
+ * overflow (signed wchar_t, u == WCHAR_MAX).
+ */
+static int
+is_plane_0(wchar_t u) { return (unsigned long) u <= 0xFFFFUL; }
+
+/* Split a code point above U+FFFF into its UTF-16 surrogate pair.
+ *
+ * u  - code point to split; no range check is made here.
+ * hi - receives the high (leading) surrogate; skipped when NULL.
+ * lo - receives the low (trailing) surrogate; skipped when NULL.
+ */
+static void
+make_surrogates(unsigned long u, wchar_t *hi, wchar_t *lo) {
+    const unsigned long offset = u - 0x10000;
+
+    /* upper ten bits go to the high surrogate, lower ten to the low one */
+    if (hi != NULL)
+        *hi = (offset >> 10) + 0xD800;
+    if (lo != NULL)
+        *lo = (offset & 0x3FF) + 0xDC00;
+}
+
+/* Nonzero when c is a UTF-8 continuation ("tail") byte, i.e. its two most
+ * significant bits are 10 (0x80..0xBF).
+ */
+static int
+is_tail(uint8_t c) {
+    return (c >> 6) == 0x02;
+}
+
+/*
+ * Decode a single UTF-8 sequence from the front of *utf8.
+ *
+ * utf8   - in/out: points at the first byte to examine; advanced past the
+ *          bytes that were consumed.
+ * length - in/out: number of bytes available at *utf8; reduced by the number
+ *          of bytes consumed.
+ *
+ * Returns the decoded code point (never above 0x10FFFF), or one of:
+ *   wlite_incomplete  the buffer ends before the sequence announced by the
+ *                     head byte is complete (this includes *length == 0);
+ *                     *utf8 and *length are left untouched.
+ *   wlite_invalid     the bytes are not an acceptable sequence.  When an
+ *                     expected tail byte is missing nothing is consumed;
+ *                     for overlong forms, out-of-range values and stray
+ *                     bytes the offending bytes are consumed.
+ *
+ * NOTE(review): unless the surrogate-reading option below is compiled in,
+ * three-byte sequences that decode to U+D800..U+DFFF are returned as-is
+ * rather than rejected -- confirm whether callers rely on that.
+ */
+static long
+get_utc(const unsigned char **utf8, size_t *length) {
+    long u = 0;
+    const uint8_t *s = NULL;
+
+    if (*length == 0) return wlite_incomplete;
+    s = *utf8;
+    if (s[0] < 128) {
+        /* HEAD/TAIL pattern: 0zzzzzzz
+         * ... ASCII is just ASCII. Ain't UTF-8 wonderful?
+         */
+
+        u |= (*s++ & 0x7F) << 0;
+        *length -= 1;
+    }
+    else if ((s[0] & 0xE0) == 0xC0) {
+        /* HEAD/TAIL pattern: 110yyyyy 10zzzzzz
+         * ... most probably a European character or fancy dingbat/sym if
+         * we're here
+         */
+
+        if (*length < 2) return wlite_incomplete;
+        if (!is_tail(s[1])) return wlite_invalid;
+        u |= (*s++ & 0x1F) << 6;
+        u |= (*s++ & 0x3F) << 0;
+        *length -= 2;
+        if (u < 0x0080) u = wlite_invalid;      /* overlong form */
+    }
+    else if ((s[0] & 0xF0) == 0xE0) {
+        /* HEAD/TAIL/TAIL pattern: 1110xxxx 10yyyyyy 10zzzzzz
+         * ... most probably a CJK character, but sometimes
+         * Euro/Asian/African.
+         */
+
+        if (*length < 3) return wlite_incomplete;
+        if (!is_tail(s[1]) || !is_tail(s[2])) return wlite_invalid;
+        u |= (*s++ & 0x0F) << 12;
+        u |= (*s++ & 0x3F) << 6;
+        u |= (*s++ & 0x3F) << 0;
+        *length -= 3;
+        if (u < 0x0800) u = wlite_invalid;      /* overlong form */
+
+# if WLITE_READ_6_BYTE_UTF8_SURROGATE && WLITE_XBMP_CHAR
+        if (is_surrogate_hi(u)) {
+            /* XXX: see the note below regarding four byte patterns
+             *
+             * if you're here with the debugger then you should probably
+             * check your favorite character encoding converter for
+             * brain-damage.
+             */
+
+            /* look ahead on copies so that a non-matching follower is left
+             * unconsumed for the next call
+             */
+            size_t length_copy = *length;
+            const unsigned char *s_copy = s;
+
+            long u1 = u;
+            long u2 = get_utc(&s_copy, &length_copy);
+
+            if (u2 == wlite_incomplete) {
+                /* nothing is consumed on "incomplete": undo the three
+                 * bytes counted above (*utf8 has not been advanced)
+                 */
+                *length += 3;
+                return u2;
+            }
+            else if (is_surrogate_lo(u2)) {
+                /* a low surrogate always occupies three bytes */
+                u = make_utc_from_surrogates(u1, u2);
+                *length -= 3;
+                s += 3;
+            }
+        }
+# endif
+    }
+    else if ((s[0] & 0xF8) == 0xF0) {
+        /* HEAD/TAIL/TAIL/TAIL bit pattern:
+         * 11110www 10xxxxxx 10yyyyyy 10zzzzzz
+         * ... most probably a freak CJK character and the string is testing
+         * ... Unicode conformance. Either that or you're attempting to
+         * ... process Klingon.
+         *
+         * XXX: if you're here with the debugger and you're working with
+         * real-world text, it's probably suspect (I doubt you have any fonts
+         * to represent the characters in this range anyway)
+         */
+
+        if (*length < 4) return wlite_incomplete;
+        if (!is_tail(s[1]) || !is_tail(s[2]) || !is_tail(s[3]))
+            return wlite_invalid;
+        u |= (*s++ & 0x07) << 18;
+        u |= (*s++ & 0x3F) << 12;
+        u |= (*s++ & 0x3F) << 6;
+        u |= (*s++ & 0x3F) << 0;
+        *length -= 4;
+        /* reject overlong forms and anything past the last Unicode code
+         * point; four bytes can carry up to 0x1FFFFF, but values above
+         * 0x10FFFF are not characters and cannot be split into surrogates
+         */
+        if (u <= 0xFFFF || u > 0x10FFFF) {
+            u = wlite_invalid;
+        }
+    }
+    else {
+        /* we either got a five or six byte UTF-8 sequence, a "decapitated"
+         * (all tail with no head byte-- sometimes raw Latin-1 slipped in a
+         * UTF-8 string) UTF-8 sequence, or we got 0xFE or 0xFF in the byte
+         * stream (always illegal in UTF-8-- which means these two can be
+         * used as private non-exported sentinels in applications. Did I
+         * mention that UTF-8 is wonderful?)
+         */
+
+        u = wlite_invalid;
+        /* skip the whole run of stray bytes so the caller can resume at the
+         * next plausible head byte
+         */
+        while (*length != 0 && (is_tail(*s) || *s >= 0xF8)) {
+            --*length;
+            ++s;
+        }
+    }
+    *utf8 = s;
+    return u;
+}
+
+/*
+ * Convert the UTF-8 sequence at the start of s (at most n bytes) into a wide
+ * character, in the manner of Standard C mbrtowc().
+ *
+ * c  - receives the result; may be NULL to decode without storing.  When
+ *      WLITE_MBSTATE_SURROGATE_ is set in ps->flags_ and the code point lies
+ *      outside plane 0, TWO wchar_t units (high surrogate, then low) are
+ *      written, so c must have room for two.  This is a non-standard
+ *      extension for internal API use.
+ * s  - input bytes, or NULL to query the conversion state only.
+ * n  - number of bytes available at s.
+ * ps - conversion state; NULL selects a static state private to this
+ *      function.
+ *
+ * Returns
+ *   the number of bytes consumed, for a non-null character;
+ *   0 for the null character (the state is passed to wlite_0_mbstate_());
+ *   (size_t) -2 when the n bytes form only part of a sequence; the
+ *       INCOMPLETE flag is set in ps.  No partial input is buffered, so the
+ *       caller has to present the sequence again from its first byte;
+ *   (size_t) -1 with errno == EILSEQ for an invalid sequence; the ERROR flag
+ *       is set in ps;
+ *   when s is NULL: (size_t) -1 with errno == EILSEQ if the INCOMPLETE flag
+ *       is set, otherwise WLITE_MBS_SHIFT_STATES_ after the state has been
+ *       passed to wlite_0_mbstate_().
+ *
+ * For a non-null character ps->wcout_ holds the number of wide characters
+ * stored in c (0 when c is NULL, otherwise 1 or 2).
+ */
+size_t
+wlite_mbrtowc(wchar_t *c, const char *s, size_t n, wlite_mbstate_t *ps) {
+    static wlite_mbstate_t internal = { 0 };
+    size_t remaining = n;
+    const unsigned char *utf8 = (const unsigned char *) s;
+    long u = 0;
+
+    if (ps == NULL)
+        ps = &internal;
+
+    if (s == NULL) {
+        if (ps->flags_ & WLITE_MBSTATE_INCOMPLETE_) {
+            errno = EILSEQ;
+            return (size_t) -1;
+        }
+        wlite_0_mbstate_(ps);
+        return WLITE_MBS_SHIFT_STATES_;
+    }
+
+    u = get_utc(&utf8, &remaining);
+    if (u == wlite_incomplete) {
+        ps->flags_ |= WLITE_MBSTATE_INCOMPLETE_;
+        ps->flags_ &= ~WLITE_MBSTATE_ERROR_;
+        return (size_t) -2;
+    }
+    if (u == wlite_invalid) {
+        ps->flags_ &= ~WLITE_MBSTATE_INCOMPLETE_;
+        ps->flags_ |= WLITE_MBSTATE_ERROR_;
+        errno = EILSEQ;
+        return (size_t) -1;
+    }
+
+    /* A complete character was decoded: neither status flag describes this
+     * call any more.  Leaving INCOMPLETE set would make a later s == NULL
+     * query fail with EILSEQ even though no sequence is pending.
+     */
+    ps->flags_ &= ~WLITE_MBSTATE_INCOMPLETE_;
+    ps->flags_ &= ~WLITE_MBSTATE_ERROR_;
+
+    ps->wcout_ = 0;
+    if (c != NULL) {
+        if ((ps->flags_ & WLITE_MBSTATE_SURROGATE_) && !is_plane_0(u)) {
+            wchar_t hi, lo;
+
+            make_surrogates(u, &hi, &lo);
+            /* XXX: Std C does not allow writing more than one wide
+             * character to c; this is a non-standard extension for
+             * internal API use.
+             */
+
+            c[0] = (wchar_t) hi;
+            c[1] = (wchar_t) lo;
+            ps->wcout_ = 2;
+        }
+        else {
+            *c = (wchar_t) u;
+            ps->wcout_ = 1;
+        }
+    }
+    if (u == 0) {
+        wlite_0_mbstate_(ps);
+        return 0;
+    }
+    return n - remaining;
+}