diff options
-rw-r--r-- | ChangeLog | 33 | ||||
-rw-r--r-- | enc/euc_jp.c | 77 | ||||
-rw-r--r-- | enc/sjis.c | 54 | ||||
-rw-r--r-- | enc/utf8.c | 180 | ||||
-rw-r--r-- | encoding.c | 6 | ||||
-rw-r--r-- | include/ruby/encoding.h | 8 | ||||
-rw-r--r-- | include/ruby/oniguruma.h | 29 | ||||
-rw-r--r-- | io.c | 32 | ||||
-rw-r--r-- | string.c | 54 | ||||
-rw-r--r-- | test/ruby/test_m17n.rb | 258 |
10 files changed, 588 insertions, 143 deletions
@@ -1,3 +1,36 @@ +Thu Dec 6 18:22:11 2007 Tanaka Akira <akr@fsij.org> + + * encoding.c (rb_enc_precise_mbclen): new function for mbclen with + validation. + + * include/ruby/encoding.h (rb_enc_precise_mbclen): declared. + (MBCLEN_CHARFOUND): new macro. + (MBCLEN_INVALID): new macro. + (MBCLEN_NEEDMORE): new macro. + + * include/ruby/oniguruma.h (OnigEncodingTypeST): replace mbc_enc_len + by precise_mbc_enc_len. + (ONIGENC_PRECISE_MBC_ENC_LEN): new macro. + (ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND): new macro. + (ONIGENC_CONSTRUCT_MBCLEN_INVALID): new macro. + (ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE): new macro. + (ONIGENC_MBCLEN_CHARFOUND): new macro. + (ONIGENC_MBCLEN_INVALID): new macro. + (ONIGENC_MBCLEN_NEEDMORE): new macro. + (ONIGENC_MBC_ENC_LEN): use ONIGENC_PRECISE_MBC_ENC_LEN. + + * enc/euc_jp.c: validation implemented. + + * enc/sjis.c: ditto. + + * enc/utf8.c: ditto. + + * string.c (rb_str_inspect): use rb_enc_precise_mbclen for invalid + encoding. + (rb_str_valid_encoding_p): new method String#valid_encoding?. + + * io.c (rb_io_getc): use rb_enc_precise_mbclen. + Thu Dec 6 01:37:23 2007 Nobuyoshi Nakada <nobu@ruby-lang.org> * regparse.c (i_apply_case_fold): fix for negative character class. a diff --git a/enc/euc_jp.c b/enc/euc_jp.c index f48c90494..ea2a8e072 100644 --- a/enc/euc_jp.c +++ b/enc/euc_jp.c @@ -50,10 +50,85 @@ static const int EncLen_EUCJP[] = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 }; +typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t; +#define A ACCEPT +#define F FAILURE +static const signed char trans[][0x100] = { + { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F + }, + { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F + }, + { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F + }, + +}; +#undef A +#undef F + static int mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc) { - return EncLen_EUCJP[*p]; + int firstbyte = *p++; + state_t s; + s = trans[0][firstbyte]; + if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) : + ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1); + s = trans[s][*p++]; + if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) : + ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2); + s = trans[s][*p++]; + return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) : + ONIGENC_CONSTRUCT_MBCLEN_INVALID(); } static OnigCodePoint diff --git a/enc/sjis.c b/enc/sjis.c index fbfddcc61..ff7da3427 100644 --- a/enc/sjis.c +++ b/enc/sjis.c @@ -70,10 +70,62 @@ static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { #define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1) #define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)] +typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t; +#define A ACCEPT +#define F FAILURE +static const signed char trans[][0x100] = { + { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F + }, + { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F, + /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F + } +}; +#undef A +#undef F + static int mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc) { - return EncLen_SJIS[*p]; + int firstbyte = *p++; + state_t s; + s = trans[0][firstbyte]; + if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) : + ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1); + s = trans[s][*p++]; + return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) : + ONIGENC_CONSTRUCT_MBCLEN_INVALID(); } static int diff --git a/enc/utf8.c b/enc/utf8.c index a2acd5eee..24a385448 100644 --- a/enc/utf8.c +++ b/enc/utf8.c @@ -56,13 +56,189 @@ static const int EncLen_UTF8[] = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 + 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; +typedef enum { + FAILURE = -2, + ACCEPT, + S0, S1, S2, S3, + S4, S5, S6, S7 +} state_t; +#define A ACCEPT +#define F FAILURE +static const signed char trans[][0x100] = { + { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, + /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, +}; +#undef A +#undef F + static int utf8_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc) { - return EncLen_UTF8[*p]; + int firstbyte = *p++; + state_t s; + s = trans[0][firstbyte]; + if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) : + ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + + if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1); + s = trans[s][*p++]; + if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) : + ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + + if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2); + s = trans[s][*p++]; + if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) : + ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + + if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3); + s = trans[s][*p++]; + return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) : + ONIGENC_CONSTRUCT_MBCLEN_INVALID(); } static int diff --git a/encoding.c b/encoding.c index ec7405f96..53ceac851 100644 --- a/encoding.c +++ b/encoding.c @@ -495,6 +495,12 @@ rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) } int +rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) +{ + return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); +} + +int rb_enc_codelen(int c, rb_encoding *enc) { int n = ONIGENC_CODE_TO_MBCLEN(enc,c); diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index bdef19133..a800f5427 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -68,9 +68,15 @@ rb_encoding * rb_enc_find(const char *name); #define rb_enc_mbminlen(enc) (enc)->min_enc_len #define rb_enc_mbmaxlen(enc) (enc)->max_enc_len -/* ptr,encoding -> mbclen */ +/* ptr,endptr,encoding -> mbclen */ int rb_enc_mbclen(const char*, const char *, rb_encoding*); +/* ptr,endptr,encoding -> chlen, invalid or needmore */ +int rb_enc_precise_mbclen(const char*, const char *, rb_encoding*); +#define MBCLEN_CHARFOUND(ret) ONIGENC_MBCLEN_CHARFOUND(ret) +#define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret) +#define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret) + /* code,encoding -> codelen */ int rb_enc_codelen(int, rb_encoding*); diff --git a/include/ruby/oniguruma.h b/include/ruby/oniguruma.h index 0a1f614bd..df4d07b11 100644 --- a/include/ruby/oniguruma.h +++ b/include/ruby/oniguruma.h @@ -144,7 +144,7 @@ typedef struct { typedef int (*OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint* to, int to_len, void* arg); typedef struct OnigEncodingTypeST { - int (*mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc); + int (*precise_mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc); const char* name; int max_enc_len; int min_enc_len; @@ -282,7 +282,32 @@ ONIG_EXTERN OnigEncodingType OnigEncodingGB18030; #define ONIGENC_STEP_BACK(enc,start,s,n) \ onigenc_step_back((enc),(start),(s),(n)) -#define ONIGENC_MBC_ENC_LEN(enc,p,e) (enc)->mbc_enc_len(p,e,enc) + +#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) (n) +#define ONIGENC_CONSTRUCT_MBCLEN_INVALID() (-1) +#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n) (-1-n) + +static inline int onigenc_mbclen_charfound(int r) { return 0 < r ? r : 0; } +static inline int onigenc_mbclen_needmore(int r) { return r < -1 ? -1 - r : 0; } +#define ONIGENC_MBCLEN_CHARFOUND(r) onigenc_mbclen_charfound(r) +#define ONIGENC_MBCLEN_INVALID(r) ((r) == -1) +#define ONIGENC_MBCLEN_NEEDMORE(r) onigenc_mbclen_needmore(r) + +#define ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e) (enc)->precise_mbc_enc_len(p,e,enc) + +static inline int onigenc_mbclen_recover(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc) +{ + int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e); + int r; + if (ONIGENC_MBCLEN_INVALID(ret)) + return 1; + else if ((r = ONIGENC_MBCLEN_NEEDMORE(ret))) + return e-p+r; + else + return ONIGENC_MBCLEN_CHARFOUND(ret); +} + +#define ONIGENC_MBC_ENC_LEN(enc,p,e) onigenc_mbclen_recover(p,e,enc) #define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len) #define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) #define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len) @@ -2127,7 +2127,7 @@ rb_io_getc(VALUE io) { rb_encoding *enc; rb_io_t *fptr; - int n, left; + int r, n; VALUE str; GetOpenFile(io, fptr); @@ -2138,22 +2138,30 @@ rb_io_getc(VALUE io) if (io_fillbuf(fptr) < 0) { return Qnil; } - n = rb_enc_mbclen(fptr->rbuf+fptr->rbuf_off, fptr->rbuf+fptr->rbuf_len, enc); - if (n < fptr->rbuf_len) { + r = rb_enc_precise_mbclen(fptr->rbuf+fptr->rbuf_off, fptr->rbuf+fptr->rbuf_off+fptr->rbuf_len, enc); + if ((n = MBCLEN_CHARFOUND(r)) != 0 && n <= fptr->rbuf_len) { str = rb_str_new(fptr->rbuf+fptr->rbuf_off, n); fptr->rbuf_off += n; fptr->rbuf_len -= n; } + else if (MBCLEN_NEEDMORE(r)) { + str = rb_str_new(fptr->rbuf+fptr->rbuf_off, fptr->rbuf_len); + fptr->rbuf_len = 0; +getc_needmore: + if (io_fillbuf(fptr) != -1) { + rb_str_cat(str, fptr->rbuf+fptr->rbuf_off, 1); + fptr->rbuf_off++; + fptr->rbuf_len--; + r = rb_enc_precise_mbclen(RSTRING_PTR(str), RSTRING_PTR(str)+RSTRING_LEN(str), enc); + if (MBCLEN_NEEDMORE(r)) { + goto getc_needmore; + } + } + } else { - str = rb_str_new(0, n); - left = fptr->rbuf_len; - MEMCPY(RSTRING_PTR(str), fptr->rbuf+fptr->rbuf_off, char, left); - if (io_fillbuf(fptr) < 0) { - return Qnil; - } - MEMCPY(RSTRING_PTR(str)+left, fptr->rbuf, char, n-left); - fptr->rbuf_off += left; - fptr->rbuf_len -= left; + str = rb_str_new(fptr->rbuf+fptr->rbuf_off, 1); + fptr->rbuf_off++; + fptr->rbuf_len--; } rb_enc_associate(str, enc); @@ -2919,10 +2919,20 @@ rb_str_inspect(VALUE str) str_cat_char(result, '"', enc); p = RSTRING_PTR(str); pend = RSTRING_END(str); while (p < pend) { - int c = rb_enc_codepoint(p, pend, enc); - int n = rb_enc_codelen(c, enc); + int c; + int n; int cc; + n = rb_enc_precise_mbclen(p, pend, enc); + if (!MBCLEN_CHARFOUND(n)) { + p++; + n = 1; + goto escape_codepoint; + } + + c = rb_enc_codepoint(p, pend, enc); + n = rb_enc_codelen(c, enc); + p += n; if (c == '"'|| c == '\\' || (c == '#' && (cc = rb_enc_codepoint(p,pend,enc), @@ -2954,19 +2964,21 @@ rb_str_inspect(VALUE str) prefix_escape(result, 'e', enc); } else if (rb_enc_isprint(c, enc)) { - char buf[5]; - - rb_enc_mbcput(c, buf, enc); - rb_str_buf_cat(result, buf, n); + rb_str_buf_cat(result, p-n, n); } else { char buf[5]; - char *s = buf; + char *s; + char *q; - sprintf(buf, "\\%03o", c & 0377); - while (*s) { - str_cat_char(result, *s++, enc); - } +escape_codepoint: + for (q = p-n; q < p; q++) { + s = buf; + sprintf(buf, "\\%03o", *q & 0377); + while (*s) { + str_cat_char(result, *s++, enc); + } + } } } str_cat_char(result, '"', enc); @@ -5232,6 +5244,25 @@ rb_str_force_encoding(VALUE str, VALUE enc) return str; } +static VALUE +rb_str_valid_encoding_p(VALUE str) +{ + char *p = RSTRING_PTR(str); + char *pend = RSTRING_END(str); + rb_encoding *enc = rb_enc_get(str); + + while (p < pend) { + int n; + + n = rb_enc_precise_mbclen(p, pend, enc); + if (!MBCLEN_CHARFOUND(n)) { + return Qfalse; + } + p += n; + } + return Qtrue; +} + /********************************************************************** * Document-class: Symbol * @@ -5644,6 +5675,7 @@ Init_String(void) rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */ rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1); + rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0); id_to_s = rb_intern("to_s"); diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index dc932d7cb..02c8dca4d 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -26,14 +26,46 @@ class TestM17N < Test::Unit::TestCase end def test_string_mixed_unicode - assert_raise(SyntaxError) { eval(a(%{"\xc0\xa0\\u{6666}"})) } - assert_raise(SyntaxError) { eval(e(%{"\xc0\xa0\\u{6666}"})) } - assert_raise(SyntaxError) { eval(s(%{"\xc0\xa0\\u{6666}"})) } - assert_nothing_raised { eval(u(%{"\xc0\xa0\\u{6666}"})) } - assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc0\xa0"})) } - assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc0\xa0"})) } - assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc0\xa0"})) } - assert_nothing_raised { eval(u(%{"\\u{6666}\xc0\xa0"})) } + assert_raise(SyntaxError) { eval(a(%{"\xc2\xa0\\u{6666}"})) } + assert_raise(SyntaxError) { eval(e(%{"\xc2\xa0\\u{6666}"})) } + assert_raise(SyntaxError) { eval(s(%{"\xc2\xa0\\u{6666}"})) } + assert_nothing_raised { eval(u(%{"\xc2\xa0\\u{6666}"})) } + assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc2\xa0"})) } + assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc2\xa0"})) } + assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc2\xa0"})) } + assert_nothing_raised { eval(u(%{"\\u{6666}\xc2\xa0"})) } + end + + def test_string_inspect + assert_equal('"\376"', e("\xfe").inspect) + assert_equal('"\216"', e("\x8e").inspect) + assert_equal('"\217"', e("\x8f").inspect) + assert_equal('"\217\241"', e("\x8f\xa1").inspect) + assert_equal('"\357"', s("\xef").inspect) + assert_equal('"\302"', u("\xc2").inspect) + assert_equal('"\340\200"', u("\xe0\x80").inspect) + assert_equal('"\360\200\200"', u("\xf0\x80\x80").inspect) + assert_equal('"\370\200\200\200"', u("\xf8\x80\x80\x80").inspect) + assert_equal('"\374\200\200\200\200"', u("\xfc\x80\x80\x80\x80").inspect) + + assert_equal('"\376 "', e("\xfe ").inspect) + assert_equal('"\216 "', e("\x8e ").inspect) + assert_equal('"\217 "', e("\x8f ").inspect) + assert_equal('"\217\241 "', e("\x8f\xa1 ").inspect) + assert_equal('"\357 "', s("\xef ").inspect) + assert_equal('"\302 "', u("\xc2 ").inspect) + assert_equal('"\340\200 "', u("\xe0\x80 ").inspect) + assert_equal('"\360\200\200 "', u("\xf0\x80\x80 ").inspect) + assert_equal('"\370\200\200\200 "', u("\xf8\x80\x80\x80 ").inspect) + assert_equal('"\374\200\200\200\200 "', u("\xfc\x80\x80\x80\x80 ").inspect) + + + assert_equal(e("\"\\241\x8f\xa1\xa1\""), e("\xa1\x8f\xa1\xa1").inspect) + + assert_equal('"\201."', s("\x81.").inspect) + assert_equal(s("\"\x81@\""), s("\x81@").inspect) + + assert_equal('"\374"', u("\xfc").inspect) end def test_regexp_too_short_multibyte_character @@ -42,27 +74,27 @@ class TestM17N < Test::Unit::TestCase assert_raise(SyntaxError) { eval('/\x8f/e') } assert_raise(SyntaxError) { eval('/\x8f\xa1/e') } assert_raise(SyntaxError) { eval('/\xef/s') } - assert_raise(SyntaxError) { eval('/\xc0/u') } + assert_raise(SyntaxError) { eval('/\xc2/u') } assert_raise(SyntaxError) { eval('/\xe0\x80/u') } assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') } - assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } - assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } + #assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } + #assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } # raw 8bit assert_raise(SyntaxError) { eval("/\xfe/e") } - assert_raise(SyntaxError) { eval("/\xc0/u") } + assert_raise(SyntaxError) { eval("/\xc2/u") } # invalid suffix - assert_raise(SyntaxError) { eval('/\xc0\xff/u') } - assert_raise(SyntaxError) { eval('/\xc0 /u') } - #assert_raise(SyntaxError) { eval('/\xc0\x20/u') } + assert_raise(SyntaxError) { eval('/\xc2\xff/u') } + assert_raise(SyntaxError) { eval('/\xc2 /u') } + #assert_raise(SyntaxError) { eval('/\xc2\x20/u') } end def assert_regexp_generic_encoding(r) assert(!r.fixed_encoding?) %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename| - # "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8. - assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(ename) } + # "\xc2\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8. + assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(ename) } } end @@ -71,9 +103,9 @@ class TestM17N < Test::Unit::TestCase %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename| enc = Encoding.find(ename) if enc == r.encoding - assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(enc) } + assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(enc) } else - assert_raise(ArgumentError) { r =~ "\xc0\xa1".force_encoding(enc) } + assert_raise(ArgumentError) { r =~ "\xc2\xa1".force_encoding(enc) } end } end @@ -115,77 +147,77 @@ class TestM17N < Test::Unit::TestCase assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) assert_equal(0, r =~ u("a")) - assert_equal(nil, r =~ a("\xc0\xa1")) - assert_equal(nil, r =~ e("\xc0\xa1")) - assert_equal(nil, r =~ s("\xc0\xa1")) - assert_equal(nil, r =~ u("\xc0\xa1")) + assert_equal(nil, r =~ a("\xc2\xa1")) + assert_equal(nil, r =~ e("\xc2\xa1")) + assert_equal(nil, r =~ s("\xc2\xa1")) + assert_equal(nil, r =~ u("\xc2\xa1")) } end def test_regexp_ascii assert_regexp_fixed_ascii8bit(/a/n) - assert_regexp_fixed_ascii8bit(/\xc0\xa1/n) - assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/}))) - assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/n}))) - assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/}))) + assert_regexp_fixed_ascii8bit(/\xc2\xa1/n) + assert_regexp_fixed_ascii8bit(eval(a(%{/\xc2\xa1/}))) + assert_regexp_fixed_ascii8bit(eval(a(%{/\xc2\xa1/n}))) + assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc2\xa1/}))) [/a/n].each {|r| assert_equal(0, r =~ a("a")) assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) assert_equal(0, r =~ u("a")) - assert_equal(nil, r =~ a("\xc0\xa1")) - assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + assert_equal(nil, r =~ a("\xc2\xa1")) + assert_raise(ArgumentError) { r =~ e("\xc2\xa1") } + assert_raise(ArgumentError) { r =~ s("\xc2\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc2\xa1") } } - [/\xc0\xa1/n, eval(a(%{/\xc0\xa1/})), eval(a(%{/\xc0\xa1/n}))].each {|r| + [/\xc2\xa1/n, eval(a(%{/\xc2\xa1/})), eval(a(%{/\xc2\xa1/n}))].each {|r| assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) assert_equal(nil, r =~ u("a")) - assert_equal(0, r =~ a("\xc0\xa1")) - assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + assert_equal(0, r =~ a("\xc2\xa1")) + assert_raise(ArgumentError) { r =~ e("\xc2\xa1") } + assert_raise(ArgumentError) { r =~ s("\xc2\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc2\xa1") } } end def test_regexp_euc assert_regexp_fixed_eucjp(/a/e) - assert_regexp_fixed_eucjp(/\xc0\xa1/e) - assert_regexp_fixed_eucjp(eval(e(%{/\xc0\xa1/}))) - assert_regexp_fixed_eucjp(eval(e(%q{/\xc0\xa1/}))) + assert_regexp_fixed_eucjp(/\xc2\xa1/e) + assert_regexp_fixed_eucjp(eval(e(%{/\xc2\xa1/}))) + assert_regexp_fixed_eucjp(eval(e(%q{/\xc2\xa1/}))) [/a/e].each {|r| assert_equal(0, r =~ a("a")) assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) assert_equal(0, r =~ u("a")) - assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } - assert_equal(nil, r =~ e("\xc0\xa1")) - assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ a("\xc2\xa1") } + assert_equal(nil, r =~ e("\xc2\xa1")) + assert_raise(ArgumentError) { r =~ s("\xc2\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc2\xa1") } } - [/\xc0\xa1/e, eval(e(%{/\xc0\xa1/})), eval(e(%q{/\xc0\xa1/}))].each {|r| + [/\xc2\xa1/e, eval(e(%{/\xc2\xa1/})), eval(e(%q{/\xc2\xa1/}))].each {|r| assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) assert_equal(nil, r =~ u("a")) - assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } - assert_equal(0, r =~ e("\xc0\xa1")) - assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ a("\xc2\xa1") } + assert_equal(0, r =~ e("\xc2\xa1")) + assert_raise(ArgumentError) { r =~ s("\xc2\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc2\xa1") } } end def test_regexp_sjis assert_regexp_fixed_sjis(/a/s) - assert_regexp_fixed_sjis(/\xc0\xa1/s) - assert_regexp_fixed_sjis(eval(s(%{/\xc0\xa1/}))) - assert_regexp_fixed_sjis(eval(s(%q{/\xc0\xa1/}))) + assert_regexp_fixed_sjis(/\xc2\xa1/s) + assert_regexp_fixed_sjis(eval(s(%{/\xc2\xa1/}))) + assert_regexp_fixed_sjis(eval(s(%q{/\xc2\xa1/}))) end def test_begin_end_offset @@ -223,10 +255,10 @@ class TestM17N < Test::Unit::TestCase assert_encoding("ASCII-8BIT", Regexp.quote(s("a")).encoding) assert_encoding("ASCII-8BIT", Regexp.quote(u("a")).encoding) - assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc0\xa1")).encoding) - assert_encoding("EUC-JP", Regexp.quote(e("\xc0\xa1")).encoding) - assert_encoding("Shift_JIS", Regexp.quote(s("\xc0\xa1")).encoding) - assert_encoding("UTF-8", Regexp.quote(u("\xc0\xa1")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc2\xa1")).encoding) + assert_encoding("EUC-JP", Regexp.quote(e("\xc2\xa1")).encoding) + assert_encoding("Shift_JIS", Regexp.quote(s("\xc2\xa1")).encoding) + assert_encoding("UTF-8", Regexp.quote(u("\xc2\xa1")).encoding) end def test_union_0 @@ -254,10 +286,10 @@ class TestM17N < Test::Unit::TestCase end def test_union_1_nonascii_string - assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc0\xa1"))) - assert_regexp_fixed_eucjp(Regexp.union(e("\xc0\xa1"))) - assert_regexp_fixed_sjis(Regexp.union(s("\xc0\xa1"))) - assert_regexp_fixed_utf8(Regexp.union(u("\xc0\xa1"))) + assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc2\xa1"))) + assert_regexp_fixed_eucjp(Regexp.union(e("\xc2\xa1"))) + assert_regexp_fixed_sjis(Regexp.union(s("\xc2\xa1"))) + assert_regexp_fixed_utf8(Regexp.union(u("\xc2\xa1"))) end def test_union_1_regexp @@ -271,7 +303,7 @@ class TestM17N < Test::Unit::TestCase def test_union_2 ary = [ a(""), e(""), s(""), u(""), - a("\xc0\xa1"), e("\xc0\xa1"), s("\xc0\xa1"), u("\xc0\xa1") + a("\xc2\xa1"), e("\xc2\xa1"), s("\xc2\xa1"), u("\xc2\xa1") ] ary.each {|s1| ary.each {|s2| @@ -304,26 +336,26 @@ class TestM17N < Test::Unit::TestCase def test_dynamic_ascii_regexp assert_regexp_fixed_ascii8bit(/#{}/n) - assert_regexp_fixed_ascii8bit(/#{}\xc0\xa1/n) - assert_regexp_fixed_ascii8bit(/\xc0\xa1#{}/n) - #assert_raise(SyntaxError) { eval('/\xc0#{}\xa1/s') } - #assert_raise(SyntaxError) { s1, s2 = s('\xc0'), s('\xa1'); /#{s1}#{s2}/ } + assert_regexp_fixed_ascii8bit(/#{}\xc2\xa1/n) + assert_regexp_fixed_ascii8bit(/\xc2\xa1#{}/n) + #assert_raise(SyntaxError) { eval('/\xc2#{}\xa1/s') } + #assert_raise(SyntaxError) { s1, s2 = s('\xc2'), s('\xa1'); /#{s1}#{s2}/ } end def test_dynamic_eucjp_regexp assert_regexp_fixed_eucjp(/#{}/e) - assert_regexp_fixed_eucjp(/#{}\xc0\xa1/e) - assert_regexp_fixed_eucjp(/\xc0\xa1#{}/e) - assert_raise(RegexpError) { eval('/\xc0#{}/e') } - assert_raise(RegexpError) { eval('/#{}\xc0/e') } - #assert_raise(SyntaxError) { eval('/\xc0#{}\xa1/e') } - #assert_raise(SyntaxError) { s1, s2 = e('\xc0'), e('\xa1'); /#{s1}#{s2}/ } + assert_regexp_fixed_eucjp(/#{}\xc2\xa1/e) + assert_regexp_fixed_eucjp(/\xc2\xa1#{}/e) + assert_raise(RegexpError) { eval('/\xc2#{}/e') } + assert_raise(RegexpError) { eval('/#{}\xc2/e') } + #assert_raise(SyntaxError) { eval('/\xc2#{}\xa1/e') } + #assert_raise(SyntaxError) { s1, s2 = e('\xc2'), e('\xa1'); /#{s1}#{s2}/ } end def test_dynamic_sjis_regexp assert_regexp_fixed_sjis(/#{}/s) - assert_regexp_fixed_sjis(/#{}\xc0\xa1/s) - assert_regexp_fixed_sjis(/\xc0\xa1#{}/s) + assert_regexp_fixed_sjis(/#{}\xc2\xa1/s) + assert_regexp_fixed_sjis(/\xc2\xa1#{}/s) assert_raise(RegexpError) { eval('/\x81#{}/s') } assert_raise(RegexpError) { eval('/#{}\x81/s') } #assert_raise(SyntaxError) { eval('/\x81#{}\xa1/s') } @@ -332,49 +364,49 @@ class TestM17N < Test::Unit::TestCase def test_dynamic_utf8_regexp assert_regexp_fixed_utf8(/#{}/u) - assert_regexp_fixed_utf8(/#{}\xc0\xa1/u) - assert_regexp_fixed_utf8(/\xc0\xa1#{}/u) - assert_raise(RegexpError) { eval('/\xc0#{}/u') } - assert_raise(RegexpError) { eval('/#{}\xc0/u') } - #assert_raise(SyntaxError) { eval('/\xc0#{}\xa1/u') } - #assert_raise(SyntaxError) { s1, s2 = u('\xc0'), u('\xa1'); /#{s1}#{s2}/ } + assert_regexp_fixed_utf8(/#{}\xc2\xa1/u) + assert_regexp_fixed_utf8(/\xc2\xa1#{}/u) + assert_raise(RegexpError) { eval('/\xc2#{}/u') } + assert_raise(RegexpError) { eval('/#{}\xc2/u') } + #assert_raise(SyntaxError) { eval('/\xc2#{}\xa1/u') } + #assert_raise(SyntaxError) { s1, s2 = u('\xc2'), u('\xa1'); /#{s1}#{s2}/ } end def test_regexp_mixed_unicode - assert_raise(SyntaxError) { eval(a(%{/\xc0\xa0\\u{6666}/})) } - assert_raise(SyntaxError) { eval(e(%{/\xc0\xa0\\u{6666}/})) } - assert_raise(SyntaxError) { eval(s(%{/\xc0\xa0\\u{6666}/})) } - assert_nothing_raised { eval(u(%{/\xc0\xa0\\u{6666}/})) } - assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\xc0\xa0/})) } - assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\xc0\xa0/})) } - assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\xc0\xa0/})) } - assert_nothing_raised { eval(u(%{/\\u{6666}\xc0\xa0/})) } - - assert_raise(SyntaxError) { eval(a(%{/\\xc0\\xa0\\u{6666}/})) } - assert_raise(SyntaxError) { eval(e(%{/\\xc0\\xa0\\u{6666}/})) } - assert_raise(SyntaxError) { eval(s(%{/\\xc0\\xa0\\u{6666}/})) } - assert_nothing_raised { eval(u(%{/\\xc0\\xa0\\u{6666}/})) } - assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\\xc0\\xa0/})) } - assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\\xc0\\xa0/})) } - assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\\xc0\\xa0/})) } - assert_nothing_raised { eval(u(%{/\\u{6666}\\xc0\\xa0/})) } - - assert_raise(SyntaxError) { eval(a(%{/\xc0\xa0#{}\\u{6666}/})) } - assert_raise(SyntaxError) { eval(e(%{/\xc0\xa0#{}\\u{6666}/})) } - assert_raise(SyntaxError) { eval(s(%{/\xc0\xa0#{}\\u{6666}/})) } - assert_nothing_raised { eval(u(%{/\xc0\xa0#{}\\u{6666}/})) } - assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\xc0\xa0/})) } - assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\xc0\xa0/})) } - assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\xc0\xa0/})) } - assert_nothing_raised { eval(u(%{/\\u{6666}#{}\xc0\xa0/})) } - - assert_raise(SyntaxError) { eval(a(%{/\\xc0\\xa0#{}\\u{6666}/})) } - assert_raise(SyntaxError) { eval(e(%{/\\xc0\\xa0#{}\\u{6666}/})) } - assert_raise(SyntaxError) { eval(s(%{/\\xc0\\xa0#{}\\u{6666}/})) } - assert_nothing_raised { eval(u(%{/\\xc0\\xa0#{}\\u{6666}/})) } - assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\\xc0\\xa0/})) } - assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\\xc0\\xa0/})) } - assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\\xc0\\xa0/})) } - assert_nothing_raised { eval(u(%{/\\u{6666}#{}\\xc0\\xa0/})) } + assert_raise(SyntaxError) { eval(a(%{/\xc2\xa0\\u{6666}/})) } + assert_raise(SyntaxError) { eval(e(%{/\xc2\xa0\\u{6666}/})) } + assert_raise(SyntaxError) { eval(s(%{/\xc2\xa0\\u{6666}/})) } + assert_nothing_raised { eval(u(%{/\xc2\xa0\\u{6666}/})) } + assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\xc2\xa0/})) } + assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\xc2\xa0/})) } + assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\xc2\xa0/})) } + assert_nothing_raised { eval(u(%{/\\u{6666}\xc2\xa0/})) } + + assert_raise(SyntaxError) { eval(a(%{/\\xc2\\xa0\\u{6666}/})) } + assert_raise(SyntaxError) { eval(e(%{/\\xc2\\xa0\\u{6666}/})) } + assert_raise(SyntaxError) { eval(s(%{/\\xc2\\xa0\\u{6666}/})) } + assert_nothing_raised { eval(u(%{/\\xc2\\xa0\\u{6666}/})) } + assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\\xc2\\xa0/})) } + assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\\xc2\\xa0/})) } + assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\\xc2\\xa0/})) } + assert_nothing_raised { eval(u(%{/\\u{6666}\\xc2\\xa0/})) } + + assert_raise(SyntaxError) { eval(a(%{/\xc2\xa0#{}\\u{6666}/})) } + assert_raise(SyntaxError) { eval(e(%{/\xc2\xa0#{}\\u{6666}/})) } + assert_raise(SyntaxError) { eval(s(%{/\xc2\xa0#{}\\u{6666}/})) } + assert_nothing_raised { eval(u(%{/\xc2\xa0#{}\\u{6666}/})) } + assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\xc2\xa0/})) } + assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\xc2\xa0/})) } + assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\xc2\xa0/})) } + assert_nothing_raised { eval(u(%{/\\u{6666}#{}\xc2\xa0/})) } + + assert_raise(SyntaxError) { eval(a(%{/\\xc2\\xa0#{}\\u{6666}/})) } + assert_raise(SyntaxError) { eval(e(%{/\\xc2\\xa0#{}\\u{6666}/})) } + assert_raise(SyntaxError) { eval(s(%{/\\xc2\\xa0#{}\\u{6666}/})) } + assert_nothing_raised { eval(u(%{/\\xc2\\xa0#{}\\u{6666}/})) } + assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\\xc2\\xa0/})) } + assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\\xc2\\xa0/})) } + assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\\xc2\\xa0/})) } + assert_nothing_raised { eval(u(%{/\\u{6666}#{}\\xc2\\xa0/})) } end end |