- Fix wide character support (#196099, #186701)

author: Peter Jones <pjones@redhat.com> 2006-06-29 15:58:41 +0000
committer: Peter Jones <pjones@redhat.com> 2006-06-29 15:58:41 +0000
commit: 097e9e63c1431019b1831db549e7f3686aaf34f4 (patch)
tree: 7156813ee826601375c54ccc3e5a0505fcf5fd08 /wlite/wlite_mbrtowc.c
parent: b7e280956c9fe359efe6cfd6dbeeb658fbe10465 (diff)
download: anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.tar.gz
anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.tar.xz
anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.zip
1 files changed, 234 insertions, 0 deletions
diff --git a/wlite/wlite_mbrtowc.c b/wlite/wlite_mbrtowc.c
new file mode 100644
index 000000000..ecd6b29c2
--- /dev/null
+++ b/wlite/wlite_mbrtowc.c
@@ -0,0 +1,234 @@
+/*
+ * $Id$
+ *
+ * Copyright (C) 2003  Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Original Author: Adrian Havill <havill@redhat.com>
+ *
+ * Contributors:
+ */
+
+#include <errno.h>   // errno, EILSEQ, ERANGE
+
+#include "wlite_config.h"   // wchar_t, NULL, size_t
+
+#include "wlite_wchar.h"    // prototypes
+#include "wlite_wctype.h"
+#include "wlite_stdlib.h"
+
+static const long wlite_invalid = -1;     // get_utc(): bytes are malformed
+static const long wlite_incomplete = -2;  // get_utc(): input ends mid-sequence
+
+#if WLITE_READ_6_BYTE_UTF8_SURROGATE && WLITE_XBMP_CHAR
+static int  // nonzero iff u is in the high-surrogate range 0xD800..0xDBFF
+is_surrogate_hi(wchar_t u) { return !(u < 0xD800 || u > 0xDBFF); }
+
+static int  // nonzero iff u is in the low-surrogate range 0xDC00..0xDFFF
+is_surrogate_lo(wchar_t u) { return !(u < 0xDC00 || u > 0xDFFF); }
+
+static wchar_t
+make_utc_from_surrogates(wchar_t hi, wchar_t lo) {
+    /* Combine a surrogate pair into one code point: the high half is scaled
+     * by 0x400 (10 bits), the low half fills the bottom 10 bits, and the sum
+     * is offset by 0x10000, the first value past the 16-bit range.
+     */
+    return (wchar_t) (0x10000 + (hi - 0xD800) * 0x400 + (lo - 0xDC00));
+}
+#endif
+
+static int  // nonzero iff u is in 0x0000..0xFFFF; wchar_t may be signed or unsigned
+is_plane_0(wchar_t u) { return (unsigned long) u <= 0xFFFFUL; }
+
+static void  // split u into a UTF-16 surrogate pair; a NULL output is skipped
+make_surrogates(unsigned long u, wchar_t *hi, wchar_t *lo) {
+    const unsigned long offset = u - 0x10000;  /* meaningful for u >= 0x10000 */
+
+    if (hi) *hi = (wchar_t) (0xD800 + (offset >> 10));    /* bits above the low 10 */
+    if (lo) *lo = (wchar_t) (0xDC00 + (offset & 0x3FF));  /* low 10 bits */
+}
+
+static int  // nonzero iff c has the UTF-8 tail-byte bit pattern 10xxxxxx
+is_tail(uint8_t c) { return (c >> 6) == 0x02; }
+
+/* Decode one UTF-8 sequence from the front of *utf8; *length is the number
+ * of bytes available there.  On success the code point is returned and
+ * *utf8 / *length are moved past the bytes that were used.
+ *
+ * Returns wlite_incomplete, consuming nothing, if the buffer ends inside a
+ * sequence.  Returns wlite_invalid for a bad tail byte (nothing consumed),
+ * for an overlong or out-of-range value (sequence consumed), and for stray
+ * tail or 0xF8..0xFF bytes (the whole run of them is consumed).
+ */
+static long
+get_utc(const unsigned char **utf8, size_t *length) {
+    long u = 0;
+    const uint8_t *s = NULL;
+
+    if (*length == 0) return wlite_incomplete;
+    s = *utf8;
+    if (s[0] < 128) {
+        /* HEAD/TAIL pattern: 0zzzzzzz
+         * ... ASCII is just ASCII. Ain't UTF-8 wonderful?
+         */
+        u |= (*s++ & 0x7F) <<  0;
+        *length -= 1;
+    }
+    else if ((s[0] & 0xE0) == 0xC0) {
+        /* HEAD/TAIL pattern: 110yyyyy 10zzzzzz
+         * ... most probably a European character or fancy dingbat/sym if we're
+         * here
+         */
+        if (*length < 2) return wlite_incomplete;
+        if (!is_tail(s[1])) return wlite_invalid;
+        u |= (*s++ & 0x1F) <<  6;
+        u |= (*s++ & 0x3F) <<  0;
+        *length -= 2;
+        if (u < 0x0080) u = wlite_invalid;  /* overlong form */
+    }
+    else if ((s[0] & 0xF0) == 0xE0) {
+        /* HEAD/TAIL/TAIL pattern: 1110xxxx 10yyyyyy 10zzzzzz
+         * ... most probably a CJK character, but sometimes Euro/Asian/African.
+         */
+        if (*length < 3) return wlite_incomplete;
+        if (!is_tail(s[1]) || !is_tail(s[2])) return wlite_invalid;
+        u |= (*s++ & 0x0F) << 12;
+        u |= (*s++ & 0x3F) <<  6;
+        u |= (*s++ & 0x3F) <<  0;
+        *length -= 3;
+        if (u < 0x0800) u = wlite_invalid;  /* overlong form */
+
+#   if WLITE_READ_6_BYTE_UTF8_SURROGATE && WLITE_XBMP_CHAR
+        if (is_surrogate_hi(u)) {
+            /* XXX: a high surrogate encoded as its own three-byte sequence;
+             * try to pair it with a three-byte low surrogate that follows.
+             * if you're here with the debugger then you should probably check
+             * your favorite character encoding converter for brain-damage.
+             */
+
+            size_t length_copy = *length;  /* save from recursion */
+            const unsigned char *s_copy = s;
+
+            long u1 = u;
+            long u2 = get_utc(&s_copy, &length_copy);
+
+            if (u2 == wlite_incomplete) { *length += 3; return u2; }
+            else if (is_surrogate_lo(u2)) {
+                u = make_utc_from_surrogates(u1, u2);
+                *length -= 3;
+                s += 3;
+            }
+        }
+#   endif
+    }
+    else if ((s[0] & 0xF8) == 0xF0) {
+        /* HEAD/TAIL/TAIL/TAIL bit pattern: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
+         * ... a character outside plane 0.  The three payload bits of the
+         * head byte allow values up to 0x1FFFFF, so the result is checked
+         * against both ends of the valid 0x10000..0x10FFFF range below.
+         */
+        if (*length < 4) return wlite_incomplete;
+        if (!is_tail(s[1]) || !is_tail(s[2]) || !is_tail(s[3]))
+            return wlite_invalid;
+        u |= (*s++ & 0x07) << 18;
+        u |= (*s++ & 0x3F) << 12;
+        u |= (*s++ & 0x3F) <<  6;
+        u |= (*s++ & 0x3F) <<  0;
+        *length -= 4;
+        if (u <= 0xFFFF || u > 0x10FFFF) {
+            u = wlite_invalid;
+        }
+    }
+    else {
+        /* we either got a five or six byte UTF-8 sequence, a "decapitated"
+         * (all tail with no head byte-- sometimes raw Latin-1 slipped in a
+         * UTF-8 string) UTF-8 sequence, or we got 0xFE or 0xFF in the byte
+         * stream (always illegal in UTF-8-- which means these two can be used
+         * as private non-exported sentinels in applications. Did I mention
+         * that UTF-8 is wonderful?)
+         */
+        u = wlite_invalid;
+        while (*length != 0 && (is_tail(*s) || *s >= 0xF8)) {
+            --*length;
+            ++s;
+        }
+    }
+    *utf8 = s;
+    return u;
+}
+
+/* Convert the UTF-8 character at s (at most n bytes are examined) into a
+ * wide character stored through c when c is not NULL.  ps carries the
+ * conversion state; NULL selects a state private to this function.
+ * Returns the byte count consumed, 0 for a decoded NUL, (size_t) -2 if the
+ * n bytes stop inside a sequence, or (size_t) -1 with errno = EILSEQ if the
+ * bytes are invalid.  s == NULL performs no conversion (see below).
+ */
+size_t
+wlite_mbrtowc(wchar_t *c, const char *s, size_t n, wlite_mbstate_t *ps) {
+    static wlite_mbstate_t internal = { 0 };
+    const unsigned char *cursor = (const unsigned char *) s;
+    size_t left = n;
+    long cp = 0;
+
+    if (ps == NULL) ps = &internal;
+
+    if (s == NULL) {
+        /* no input: fail if ps is flagged incomplete, else wlite_0_mbstate_ it */
+        if (ps->flags_ & WLITE_MBSTATE_INCOMPLETE_) {
+            errno = EILSEQ;
+            return (size_t) -1;
+        }
+        wlite_0_mbstate_(ps);
+        return WLITE_MBS_SHIFT_STATES_;
+    }
+
+    cp = get_utc(&cursor, &left);
+    if (cp == wlite_incomplete) {
+        ps->flags_ |=  WLITE_MBSTATE_INCOMPLETE_;
+        ps->flags_ &= ~WLITE_MBSTATE_ERROR_;
+        return (size_t) -2;
+    }
+    if (cp == wlite_invalid) {
+        ps->flags_ &= ~WLITE_MBSTATE_INCOMPLETE_;
+        ps->flags_ |=  WLITE_MBSTATE_ERROR_;
+        errno = EILSEQ;
+        return (size_t) -1;
+    }
+
+    ps->wcout_ = 0;
+    if (c != NULL) {
+        if ((ps->flags_ & WLITE_MBSTATE_SURROGATE_) && !is_plane_0(cp)) {
+            /* XXX: Std C does not allow writing more than one wide character
+             * to c; this is a non-standard extension for internal API use.
+             */
+            wchar_t hi, lo;
+            make_surrogates(cp, &hi, &lo);
+            c[0] = hi;
+            c[1] = lo;
+            ps->wcout_ = 2;
+        }
+        else {
+            *c = (wchar_t) cp;
+            ps->wcout_ = 1;
+        }
+    }
+    if (cp == 0) {
+        wlite_0_mbstate_(ps);
+        return 0;
+    }
+    return n - left;
+}
author	Peter Jones <pjones@redhat.com>	2006-06-29 15:58:41 +0000
committer	Peter Jones <pjones@redhat.com>	2006-06-29 15:58:41 +0000
commit	097e9e63c1431019b1831db549e7f3686aaf34f4 (patch)
tree	7156813ee826601375c54ccc3e5a0505fcf5fd08 /wlite/wlite_mbrtowc.c
parent	b7e280956c9fe359efe6cfd6dbeeb658fbe10465 (diff)
download	anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.tar.gz anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.tar.xz anaconda-097e9e63c1431019b1831db549e7f3686aaf34f4.zip