/* #define TEST_GET_EAST_ASIA_STR_WIDTH 1 */ #include #include #include #include #include #include "eawidth.h" /* * If the amount of columns the cursor advances on a TAB character depends * on the current position, set this to a negative number (i.e. -8 for tab * stops every eight columns. If static, set to a positive number. Zero if * tabs are ignored. */ static const int tab_width = -8; typedef struct { unsigned short start, end; east_asia_type type; } eaw_db_type; static const eaw_db_type eaw_db[] = { { 0x0020,0x007E,narrow }, { 0x00A1,0x00A1,ambiguous }, /*INVERTED EXCLAMATION MARK*/ { 0x00A2,0x00A3,narrow }, { 0x00A4,0x00A4,ambiguous }, /*CURRENCY SIGN*/ { 0x00A5,0x00A6,narrow }, { 0x00A7,0x00A8,ambiguous }, { 0x00AA,0x00AA,ambiguous }, /*FEMININE ORDINAL INDICATOR*/ { 0x00AC,0x00AC,narrow }, /*NOT SIGN*/ { 0x00AD,0x00AD,ambiguous }, /*SOFT HYPHEN*/ { 0x00AF,0x00AF,narrow }, /*MACRON*/ { 0x00B0,0x00B4,ambiguous }, { 0x00B6,0x00BA,ambiguous }, { 0x00BC,0x00BF,ambiguous }, { 0x00C6,0x00C6,ambiguous }, /*LATIN CAPITAL LETTER AE*/ { 0x00D0,0x00D0,ambiguous }, /*LATIN CAPITAL LETTER ETH*/ { 0x00D7,0x00D8,ambiguous }, { 0x00DE,0x00E1,ambiguous }, { 0x00E6,0x00E6,ambiguous }, /*LATIN SMALL LETTER AE*/ { 0x00E8,0x00EA,ambiguous }, { 0x00EC,0x00ED,ambiguous }, { 0x00F0,0x00F0,ambiguous }, /*LATIN SMALL LETTER ETH*/ { 0x00F2,0x00F3,ambiguous }, { 0x00F7,0x00FA,ambiguous }, { 0x00FC,0x00FC,ambiguous }, /*LATIN SMALL LETTER U WITH DIAERESIS*/ { 0x00FE,0x00FE,ambiguous }, /*LATIN SMALL LETTER THORN*/ { 0x0101,0x0101,ambiguous }, /*LATIN SMALL LETTER A WITH MACRON*/ { 0x0111,0x0111,ambiguous }, /*LATIN SMALL LETTER D WITH STROKE*/ { 0x0113,0x0113,ambiguous }, /*LATIN SMALL LETTER E WITH MACRON*/ { 0x011B,0x011B,ambiguous }, /*LATIN SMALL LETTER E WITH CARON*/ { 0x0126,0x0127,ambiguous }, { 0x012B,0x012B,ambiguous }, /*LATIN SMALL LETTER I WITH MACRON*/ { 0x0131,0x0133,ambiguous }, { 0x0138,0x0138,ambiguous }, /*LATIN SMALL LETTER KRA*/ { 0x013F,0x0142,ambiguous }, { 0x0144,0x0144,ambiguous }, /*LATIN SMALL LETTER N WITH ACUTE*/ { 0x0148,0x014A,ambiguous }, { 0x014D,0x014D,ambiguous }, /*LATIN SMALL LETTER O WITH MACRON*/ { 0x0152,0x0153,ambiguous }, { 0x0166,0x0167,ambiguous }, { 0x016B,0x016B,ambiguous }, /*LATIN SMALL LETTER U WITH MACRON*/ { 0x01CE,0x01CE,ambiguous }, /*LATIN SMALL LETTER A WITH CARON*/ { 0x01D0,0x01D0,ambiguous }, /*LATIN SMALL LETTER I WITH CARON*/ { 0x01D2,0x01D2,ambiguous }, /*LATIN SMALL LETTER O WITH CARON*/ { 0x01D4,0x01D4,ambiguous }, /*LATIN SMALL LETTER U WITH CARON*/ { 0x01D6,0x01D6,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+MACRON*/ { 0x01D8,0x01D8,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+ACUTE*/ { 0x01DA,0x01DA,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+CARON*/ { 0x01DC,0x01DC,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+GRAVE*/ { 0x0251,0x0251,ambiguous }, /*LATIN SMALL LETTER ALPHA*/ { 0x0261,0x0261,ambiguous }, /*LATIN SMALL LETTER SCRIPT G*/ { 0x02C7,0x02C7,ambiguous }, /*CARON*/ { 0x02C9,0x02CB,ambiguous }, { 0x02CD,0x02CD,ambiguous }, /*MODIFIER LETTER LOW MACRON*/ { 0x02D0,0x02D0,ambiguous }, /*MODIFIER LETTER TRIANGULAR COLON*/ { 0x02D8,0x02DB,ambiguous }, { 0x02DD,0x02DD,ambiguous }, /*DOUBLE ACUTE ACCENT*/ { 0x0300,0x0362,ambiguous }, { 0x0391,0x03A9,ambiguous }, { 0x03B1,0x03C1,ambiguous }, { 0x03C3,0x03C9,ambiguous }, { 0x0401,0x0401,ambiguous }, /*CYRILLIC CAPITAL LETTER IO*/ { 0x0410,0x044F,ambiguous }, { 0x0451,0x0451,ambiguous }, /*CYRILLIC SMALL LETTER IO*/ { 0x1100,0x115F,wide }, { 0x2010,0x2010,ambiguous }, /*HYPHEN*/ { 0x2013,0x2016,ambiguous }, { 0x2018,0x2019,ambiguous }, { 0x201C,0x201D,ambiguous }, { 0x2020,0x2021,ambiguous }, { 0x2025,0x2027,ambiguous }, { 0x2030,0x2030,ambiguous }, /*PER MILLE SIGN*/ { 0x2032,0x2033,ambiguous }, { 0x2035,0x2035,ambiguous }, /*REVERSED PRIME*/ { 0x203B,0x203B,ambiguous }, /*REFERENCE MARK*/ { 0x2074,0x2074,ambiguous }, /*SUPERSCRIPT FOUR*/ { 0x207F,0x207F,ambiguous }, /*SUPERSCRIPT LATIN SMALL LETTER N*/ { 0x2081,0x2084,ambiguous }, { 0x20A9,0x20A9,half_width }, /*WON SIGN*/ { 0x20AC,0x20AC,ambiguous }, /*EURO SIGN*/ { 0x2103,0x2103,ambiguous }, /*DEGREE CELSIUS*/ { 0x2105,0x2105,ambiguous }, /*CARE OF*/ { 0x2109,0x2109,ambiguous }, /*DEGREE FAHRENHEIT*/ { 0x2113,0x2113,ambiguous }, /*SCRIPT SMALL L*/ { 0x2121,0x2122,ambiguous }, { 0x2126,0x2126,ambiguous }, /*OHM SIGN*/ { 0x212B,0x212B,ambiguous }, /*ANGSTROM SIGN*/ { 0x2154,0x2155,ambiguous }, { 0x215B,0x215B,ambiguous }, /*VULGAR FRACTION ONE EIGHTH*/ { 0x215E,0x215E,ambiguous }, /*VULGAR FRACTION SEVEN EIGHTHS*/ { 0x2160,0x216B,ambiguous }, { 0x2170,0x2179,ambiguous }, { 0x2190,0x2199,ambiguous }, { 0x21D2,0x21D2,ambiguous }, /*RIGHTWARDS DOUBLE ARROW*/ { 0x21D4,0x21D4,ambiguous }, /*LEFT RIGHT DOUBLE ARROW*/ { 0x2200,0x2200,ambiguous }, /*FOR ALL*/ { 0x2202,0x2203,ambiguous }, { 0x2207,0x2208,ambiguous }, { 0x220B,0x220B,ambiguous }, /*CONTAINS AS MEMBER*/ { 0x220F,0x220F,ambiguous }, /*N-ARY PRODUCT*/ { 0x2211,0x2211,ambiguous }, /*N-ARY SUMMATION*/ { 0x2215,0x2215,ambiguous }, /*DIVISION SLASH*/ { 0x221A,0x221A,ambiguous }, /*SQUARE ROOT*/ { 0x221D,0x2220,ambiguous }, { 0x2223,0x2223,ambiguous }, /*DIVIDES*/ { 0x2225,0x2225,ambiguous }, /*PARALLEL TO*/ { 0x2227,0x222C,ambiguous }, { 0x222E,0x222E,ambiguous }, /*CONTOUR INTEGRAL*/ { 0x2234,0x2237,ambiguous }, { 0x223C,0x223D,ambiguous }, { 0x2248,0x2248,ambiguous }, /*ALMOST EQUAL TO*/ { 0x224C,0x224C,ambiguous }, /*ALL EQUAL TO*/ { 0x2252,0x2252,ambiguous }, /*APPROXIMATELY EQUAL TO OR THE IMAGE OF*/ { 0x2260,0x2261,ambiguous }, { 0x2264,0x2267,ambiguous }, { 0x226A,0x226B,ambiguous }, { 0x226E,0x226F,ambiguous }, { 0x2282,0x2283,ambiguous }, { 0x2286,0x2287,ambiguous }, { 0x2295,0x2295,ambiguous }, /*CIRCLED PLUS*/ { 0x2299,0x2299,ambiguous }, /*CIRCLED DOT OPERATOR*/ { 0x22A5,0x22A5,ambiguous }, /*UP TACK*/ { 0x22BF,0x22BF,ambiguous }, /*RIGHT TRIANGLE*/ { 0x2312,0x2312,ambiguous }, /*ARC*/ { 0x2460,0x24BF,ambiguous }, { 0x24D0,0x24E9,ambiguous }, { 0x2500,0x254B,ambiguous }, { 0x2550,0x2574,ambiguous }, { 0x2580,0x258F,ambiguous }, { 0x2592,0x25A1,ambiguous }, { 0x25A3,0x25A9,ambiguous }, { 0x25B2,0x25B3,ambiguous }, { 0x25B6,0x25B7,ambiguous }, { 0x25BC,0x25BD,ambiguous }, { 0x25C0,0x25C1,ambiguous }, { 0x25C6,0x25C8,ambiguous }, { 0x25CB,0x25CB,ambiguous }, /*WHITE CIRCLE*/ { 0x25CE,0x25D1,ambiguous }, { 0x25E2,0x25E5,ambiguous }, { 0x25EF,0x25EF,ambiguous }, /*LARGE CIRCLE*/ { 0x2605,0x2606,ambiguous }, { 0x2609,0x2609,ambiguous }, /*SUN*/ { 0x260E,0x260F,ambiguous }, { 0x261C,0x261C,ambiguous }, /*WHITE LEFT POINTING INDEX*/ { 0x261E,0x261E,ambiguous }, /*WHITE RIGHT POINTING INDEX*/ { 0x2640,0x2640,ambiguous }, /*FEMALE SIGN*/ { 0x2642,0x2642,ambiguous }, /*MALE SIGN*/ { 0x2660,0x2661,ambiguous }, { 0x2663,0x2665,ambiguous }, { 0x2667,0x266A,ambiguous }, { 0x266C,0x266D,ambiguous }, { 0x266F,0x266F,ambiguous }, /*MUSIC SHARP SIGN*/ { 0x2E80,0x3009,wide }, { 0x300A,0x300B,ambiguous }, { 0x300C,0x3019,wide }, { 0x301A,0x301B,ambiguous }, { 0x301C,0x303E,wide }, { 0x3041,0xD7A3,wide }, { 0xE000,0xF8FF,ambiguous }, { 0xF900,0xFA2D,wide }, { 0xFE30,0xFE6B,wide }, { 0xFF01,0xFF5E,full_width }, { 0xFF61,0xFFDC,half_width }, { 0xFFE0,0xFFE6,full_width }, { 0xFFE8,0xFFEE,half_width }, }; static int eaw_db_cmp (const void *ck, const void *ce) { const eaw_db_type *key = ck, *element = ce; assert(key != NULL); assert(element != NULL); if (key->start < element->start) return -1; else if (key->end > element->end) return 1; return 0; } static int is_cjk_locale (const char *locale_name) { static const char c[] = "zh"; /* Chinese */ static const char j[] = "ja"; /* Japanese */ static const char k[] = "ko"; /* Korean */ if (NULL == locale_name) return 0; if (strncmp(locale_name, c, sizeof(c)) == 0) return 1; if (strncmp(locale_name, j, sizeof(j)) == 0) return 1; if (strncmp(locale_name, k, sizeof(k)) == 0) return 1; return 0; } east_asia_type get_east_asia_type (wchar_t unicode) { assert(0xFFFF != unicode && 0xFFFE != unicode); if (unicode > 0xFFFF) { /* * Plane 2 is intended for CJK ideographs */ if (unicode >= 0x20000 && unicode <= 0x2FFFD) return wide; return ambiguous; } else { eaw_db_type *pos, key; size_t n; n = sizeof(eaw_db) / sizeof(eaw_db_type); key.start = key.end = (unsigned short) unicode; pos = bsearch(&key, eaw_db, n, sizeof(eaw_db_type), eaw_db_cmp); if (NULL != pos) return pos->type; } return neutral; } int east_asia_mblen (const char *locale_name, const char *s, size_t n, int x) { wchar_t *wcs, *p; int width = 0; if (NULL == s) s = ""; /* * Getting the locale name via setlocale() is expensive, so we prefer * to have it passed to us. */ if (NULL == locale_name) { locale_name = setlocale(LC_CTYPE, NULL); if (NULL == locale_name) return INT_MAX; } wcs = (wchar_t *) calloc(n, sizeof(wchar_t)); if (NULL == wcs) return INT_MAX; #if defined __GLIBC__ && !__GLIBC_PREREQ(2,2) #warning wide character support is broken. Glibc 2.2 or better needed. #endif if ((size_t) -1 == mbstowcs(wcs, s, n)) return INT_MAX; switch (get_east_asia_type(*wcs)) { case neutral: /* * Put characters that print nothing here. * * XXX: Yes, I know there are a lot more than this in ISO-10646, but * this function is intended to calculate the width of strings for * fixed width terminals displaying legacy CJK character sets. * State-of-the-art Unicode handling terminals probably won't need * this function anyway. */ if (0x0000 == *wcs) break; /* NULL */ if (0x0007 == *wcs) break; /* BELL */ /* FIXME: there will probably be ASCII chars after the escape * code, which will be counted as part of the width even though they * aren't displayed. */ if (0x001B == *wcs) break; /* ESC */ if (0xFEFF == *wcs) break; /* ZWNBSP aka BOM (magic, signature) */ /* * Special characters go here */ if (0x0008 == *wcs) { /* BACKSPACE */ width = -1; break; } if (0x0009 == *wcs) { /* TAB */ if (tab_width < 0) width = x % abs(tab_width); else width = tab_width; break; } /*FALLTHRU*/ case narrow: case half_width: width = 1; break; case wide: case full_width: width = 2; break; case ambiguous: width = is_cjk_locale(locale_name) ? 2 : 1; break; default: width = INT_MAX; } free(wcs); return width; } int get_east_asia_str_n_width (const char *locale_name, const char *s, size_t n, int x) { int total_width = 0; wchar_t *wcs, *p; if (NULL == s) s = ""; /* * Getting the locale name via setlocale() is expensive, so we prefer * to have it passed to us. */ if (NULL == locale_name) { locale_name = setlocale(LC_CTYPE, NULL); if (NULL == locale_name) return INT_MAX; } wcs = (wchar_t *) calloc(n, sizeof(wchar_t)); if (NULL == wcs) return INT_MAX; #if defined __GLIBC__ && !__GLIBC_PREREQ(2,2) #warning wide character support is broken. Glibc 2.2 or better needed. #endif if ((size_t) -1 == mbstowcs(wcs, s, n)) return INT_MAX; for (p = wcs; L'\0' != *p; p++) { int width = 0; switch (get_east_asia_type(*p)) { case neutral: /* * Put characters that print nothing here. * * XXX: Yes, I know there are a lot more than this in ISO-10646, but * this function is intended to calculate the width of strings for * fixed width terminals displaying legacy CJK character sets. * State-of-the-art Unicode handling terminals probably won't need * this function anyway. */ if (0x0000 == *p) break; /* NULL */ if (0x0007 == *p) break; /* BELL */ /* FIXME: there will probably be ASCII chars after the escape * code, which will be counted as part of the width even though they * aren't displayed. */ if (0x001B == *p) break; /* ESC */ if (0xFEFF == *p) break; /* ZWNBSP aka BOM (magic, signature) */ /* * Special characters go here */ if (0x0008 == *p) { /* BACKSPACE */ width = -1; break; } if (0x0009 == *p) { /* TAB */ if (tab_width < 0) width = x % abs(tab_width); else width = tab_width; break; } /*FALLTHRU*/ case narrow: case half_width: width = 1; break; case wide: case full_width: width = 2; break; case ambiguous: width = is_cjk_locale(locale_name) ? 2 : 1; break; default: abort(); /* Doh! */ } x += width; total_width += width; } free(wcs); return total_width; } int get_east_asia_str_width (const char *locale_name, const char *s, int x) { size_t n; int rc; n = strlen(s) + 1; rc = get_east_asia_str_n_width (locale_name, s, n, x); if (rc == INT_MAX) return strlen (s); return rc; } #if TEST_GET_EAST_ASIA_STR_WIDTH #include int main (int argc, char *argv[]) { int i; char *lc; const char *fmt = "word #%d ('%s') length is %zu, width is %u\n"; lc = setlocale(LC_CTYPE, ""); if (NULL == lc) { fputs("couldn't set the default locale for LC_CTYPE\n", stderr); exit(EXIT_FAILURE); } if (printf("character type locale is '%s'\n", lc) < 0) { perror(NULL); exit(EXIT_FAILURE); } for (i = 1; argc < 2 || i < argc; i++) { char *s; size_t length; unsigned width; if (argc < 2) { if (scanf("%as", &s) < 1 && ferror(stdin)) { perror(NULL); exit(EXIT_FAILURE); } else if (feof(stdin)) break; } else s = strdup(argv[(size_t) i]); if (NULL == s) { perror(NULL); exit(EXIT_FAILURE); } length = strlen(s); width = get_east_asia_str_width(lc, s, 0); if (printf(fmt, i, s, length, width) < 0) { perror(NULL); exit(EXIT_FAILURE); } free(s); } return 0; } #endif