diff options
author | Andrew Tridgell <tridge@samba.org> | 2001-07-22 07:38:32 +0000 |
---|---|---|
committer | Andrew Tridgell <tridge@samba.org> | 2001-07-22 07:38:32 +0000 |
commit | 49514266131aa55b4a97f02b39b2d5e5c1f625a2 (patch) | |
tree | 81cc9bf796345a882305af58bd18caa705d4e774 /source3/lib/iconv.c | |
parent | 512351db92b4e13c2944f740090ec58730d8fa06 (diff) | |
download | samba-49514266131aa55b4a97f02b39b2d5e5c1f625a2.tar.gz samba-49514266131aa55b4a97f02b39b2d5e5c1f625a2.tar.xz samba-49514266131aa55b4a97f02b39b2d5e5c1f625a2.zip |
changed the iconv interface to go via ucs2 for all conversions. This
fixes some problems wih some character sets and allows for using
internal charsets in conjunction with ionv charsets
this makes us slower but more correct. speed will come later.
(This used to be commit 594f84b4e39182dcf344c02dc0185376a2726395)
Diffstat (limited to 'source3/lib/iconv.c')
-rw-r--r-- | source3/lib/iconv.c | 257 |
1 files changed, 186 insertions, 71 deletions
diff --git a/source3/lib/iconv.c b/source3/lib/iconv.c index b73ff6ff395..2285d8debfe 100644 --- a/source3/lib/iconv.c +++ b/source3/lib/iconv.c @@ -21,13 +21,15 @@ #include "includes.h" -static size_t ascii_pull(char **, size_t *, char **, size_t *); -static size_t ascii_push(char **, size_t *, char **, size_t *); -static size_t utf8_pull(char **, size_t *, char **, size_t *); -static size_t utf8_push(char **, size_t *, char **, size_t *); -static size_t weird_pull(char **, size_t *, char **, size_t *); -static size_t weird_push(char **, size_t *, char **, size_t *); -static size_t iconv_copy(char **, size_t *, char **, size_t *); +static size_t ascii_pull(void *,char **, size_t *, char **, size_t *); +static size_t ascii_push(void *,char **, size_t *, char **, size_t *); +static size_t utf8_pull(void *,char **, size_t *, char **, size_t *); +static size_t utf8_push(void *,char **, size_t *, char **, size_t *); +static size_t weird_pull(void *,char **, size_t *, char **, size_t *); +static size_t weird_push(void *,char **, size_t *, char **, size_t *); +static size_t ucs2hex_pull(void *,char **, size_t *, char **, size_t *); +static size_t ucs2hex_push(void *,char **, size_t *, char **, size_t *); +static size_t iconv_copy(void *,char **, size_t *, char **, size_t *); /* for each charset we have a function that pulls from that charset to @@ -35,18 +37,39 @@ static size_t iconv_copy(char **, size_t *, char **, size_t *); */ static struct { char *name; - size_t (*pull)(char **inbuf, size_t *inbytesleft, + size_t (*pull)(void *, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); - size_t (*push)(char **inbuf, size_t *inbytesleft, + size_t (*push)(void *, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); } charsets[] = { {"UCS-2LE", iconv_copy, iconv_copy}, {"UTF8", utf8_pull, utf8_push}, {"ASCII", ascii_pull, ascii_push}, {"WEIRD", weird_pull, weird_push}, + {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}, {NULL, NULL, NULL} }; + +/* if there was an error then reset the internal state, + this ensures that we don't have a shift state remaining for + character sets like SJIS */ +static size_t sys_iconv(void *cd, + char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft) +{ +#ifdef HAVE_NATIVE_ICONV + size_t ret = iconv((iconv_t)cd, + inbuf, inbytesleft, + outbuf, outbytesleft); + if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL); + return ret; +#else + errno = EINVAL; + return -1; +#endif +} + /* this is a simple portable iconv() implementaion. It only knows about a very small number of character sets - just enough that Samba works @@ -60,36 +83,28 @@ size_t smb_iconv(smb_iconv_t cd, char *bufp = cvtbuf; size_t bufsize; -#ifdef HAVE_NATIVE_ICONV - if (cd->cd) { - size_t ret; - ret = iconv(cd->cd, inbuf, inbytesleft, outbuf, outbytesleft); - - /* if there was an error then reset the internal state, - this ensures that we don't have a shift state remaining for - character sets like SJIS */ - if (ret == (size_t)-1) { - iconv(cd->cd, NULL, NULL, NULL, NULL); - } - return ret; - } -#endif - - /* in most cases we can go direct */ + /* in many cases we can go direct */ if (cd->direct) { - return cd->direct(inbuf, inbytesleft, outbuf, outbytesleft); + return cd->direct(cd->cd_direct, + inbuf, inbytesleft, outbuf, outbytesleft); } + /* otherwise we have to do it chunks at a time */ while (*inbytesleft > 0) { bufp = cvtbuf; bufsize = sizeof(cvtbuf); - if (cd->pull(inbuf, inbytesleft, &bufp, &bufsize) == -1 && - errno != E2BIG) return -1; + + if (cd->pull(cd->cd_pull, + inbuf, inbytesleft, &bufp, &bufsize) == -1 + && errno != E2BIG) return -1; bufp = cvtbuf; bufsize = sizeof(cvtbuf) - bufsize; - if (cd->push(&bufp, &bufsize, outbuf, outbytesleft) == -1) return -1; + + if (cd->push(cd->cd_push, + &bufp, &bufsize, + outbuf, outbytesleft) == -1) return -1; } return 0; @@ -102,9 +117,19 @@ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode) { smb_iconv_t ret; int from, to; -#ifdef HAVE_NATIVE_ICONV - iconv_t cd = NULL; -#endif + + ret = (smb_iconv_t)malloc(sizeof(*ret)); + if (!ret) { + errno = ENOMEM; + return (smb_iconv_t)-1; + } + memset(ret, 0, sizeof(*ret)); + + /* check for the simplest null conversion */ + if (strcmp(fromcode, tocode) == 0) { + ret->direct = iconv_copy; + return ret; + } for (from=0; charsets[from].name; from++) { if (strcasecmp(charsets[from].name, fromcode) == 0) break; @@ -113,53 +138,57 @@ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode) if (strcasecmp(charsets[to].name, tocode) == 0) break; } - if (!charsets[from].name || !charsets[to].name) { #ifdef HAVE_NATIVE_ICONV - /* its not builtin - see if iconv() has it */ - cd = iconv_open(tocode, fromcode); - if (!cd) -#endif - { - errno = EINVAL; - return (smb_iconv_t)-1; - } + if (!charsets[from].name) { + ret->pull = sys_iconv; + ret->cd_pull = iconv_open("UCS-2LE", fromcode); + if (!ret->cd_pull) goto failed; } - - ret = (smb_iconv_t)malloc(sizeof(*ret)); - if (!ret) { - errno = ENOMEM; - return (smb_iconv_t)-1; + if (!charsets[to].name) { + ret->push = sys_iconv; + ret->cd_push = iconv_open(tocode, "UCS-2LE"); + if (!ret->cd_push) goto failed; } - memset(ret, 0, sizeof(*ret)); - -#ifdef HAVE_NATIVE_ICONV - /* see if we will be using the native iconv */ - if (cd) { - ret->cd = cd; - return ret; +#else + if (!charsets[from].name || !charsets[to].name) { + goto failed; } #endif - /* check for the simplest null conversion */ - if (from == to) { - ret->direct = iconv_copy; + /* check for conversion to/from ucs2 */ + if (from == 0 && charsets[to].name) { + ret->direct = charsets[to].push; + return ret; + } + if (to == 0 && charsets[from].name) { + ret->direct = charsets[from].pull; return ret; } - /* check for conversion to/from ucs2 */ +#ifdef HAVE_NATIVE_ICONV if (from == 0) { - ret->direct = charsets[to].push; + ret->direct = sys_iconv; + ret->cd_direct = ret->cd_push; + ret->cd_push = NULL; return ret; } if (to == 0) { - ret->direct = charsets[from].pull; + ret->direct = sys_iconv; + ret->cd_direct = ret->cd_pull; + ret->cd_pull = NULL; return ret; } +#endif /* the general case has to go via a buffer */ - ret->pull = charsets[from].pull; - ret->push = charsets[to].push; + if (!ret->pull) ret->pull = charsets[from].pull; + if (!ret->push) ret->push = charsets[to].push; return ret; + +failed: + free(ret); + errno = EINVAL; + return (smb_iconv_t)-1; } /* @@ -168,10 +197,11 @@ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode) int smb_iconv_close (smb_iconv_t cd) { #ifdef HAVE_NATIVE_ICONV - if (cd->cd) { - iconv_close(cd->cd); - } + if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct); + if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull); + if (cd->cd_push) iconv_close((iconv_t)cd->cd_push); #endif + memset(cd, 0, sizeof(*cd)); free(cd); return 0; @@ -184,7 +214,7 @@ int smb_iconv_close (smb_iconv_t cd) multi-byte character set support for english users ***********************************************************************/ -static size_t ascii_pull(char **inbuf, size_t *inbytesleft, +static size_t ascii_pull(void *cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { while (*inbytesleft >= 1 && *outbytesleft >= 2) { @@ -204,7 +234,7 @@ static size_t ascii_pull(char **inbuf, size_t *inbytesleft, return 0; } -static size_t ascii_push(char **inbuf, size_t *inbytesleft, +static size_t ascii_push(void *cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { int ir_count=0; @@ -232,6 +262,91 @@ static size_t ascii_push(char **inbuf, size_t *inbytesleft, } +static size_t ucs2hex_pull(void *cd, char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft) +{ + while (*inbytesleft >= 1 && *outbytesleft >= 2) { + unsigned v; + + if ((*inbuf)[0] != '@') { + /* seven bit ascii case */ + (*outbuf)[0] = (*inbuf)[0]; + (*outbuf)[1] = 0; + (*inbytesleft) -= 1; + (*outbytesleft) -= 2; + (*inbuf) += 1; + (*outbuf) += 2; + continue; + } + /* it's a hex character */ + if (*inbytesleft < 5) { + errno = EINVAL; + return -1; + } + + if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) { + errno = EILSEQ; + return -1; + } + + (*outbuf)[0] = v&0xff; + (*outbuf)[1] = v>>8; + (*inbytesleft) -= 5; + (*outbytesleft) -= 2; + (*inbuf) += 5; + (*outbuf) += 2; + } + + if (*inbytesleft > 0) { + errno = E2BIG; + return -1; + } + + return 0; +} + +static size_t ucs2hex_push(void *cd, char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft) +{ + while (*inbytesleft >= 2 && *outbytesleft >= 1) { + char buf[6]; + + if ((*inbuf)[1] == 0 && + ((*inbuf)[0] & 0x80) == 0 && + (*inbuf)[0] != '@') { + (*outbuf)[0] = (*inbuf)[0]; + (*inbytesleft) -= 2; + (*outbytesleft) -= 1; + (*inbuf) += 2; + (*outbuf) += 1; + continue; + } + if (*outbytesleft < 5) { + errno = E2BIG; + return -1; + } + snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0)); + memcpy(*outbuf, buf, 5); + (*inbytesleft) -= 2; + (*outbytesleft) -= 5; + (*inbuf) += 2; + (*outbuf) += 5; + } + + if (*inbytesleft == 1) { + errno = EINVAL; + return -1; + } + + if (*inbytesleft > 1) { + errno = E2BIG; + return -1; + } + + return 0; +} + + /* the "weird" character set is very useful for testing multi-byte support and finding bugs. Don't use on a production system! */ @@ -245,7 +360,7 @@ static struct { {0, NULL} }; -static size_t weird_pull(char **inbuf, size_t *inbytesleft, +static size_t weird_pull(void *cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { while (*inbytesleft >= 1 && *outbytesleft >= 2) { @@ -288,7 +403,7 @@ static size_t weird_pull(char **inbuf, size_t *inbytesleft, return 0; } -static size_t weird_push(char **inbuf, size_t *inbytesleft, +static size_t weird_push(void *cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { int ir_count=0; @@ -337,7 +452,7 @@ static size_t weird_push(char **inbuf, size_t *inbytesleft, return ir_count; } -static size_t iconv_copy(char **inbuf, size_t *inbytesleft, +static size_t iconv_copy(void *cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { int n; @@ -359,7 +474,7 @@ static size_t iconv_copy(char **inbuf, size_t *inbytesleft, return 0; } -static size_t utf8_pull(char **inbuf, size_t *inbytesleft, +static size_t utf8_pull(void *cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { while (*inbytesleft >= 1 && *outbytesleft >= 2) { @@ -406,7 +521,7 @@ badseq: return -1; } -static size_t utf8_push(char **inbuf, size_t *inbytesleft, +static size_t utf8_push(void *cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { while (*inbytesleft >= 2 && *outbytesleft >= 1) { |