summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog33
-rw-r--r--enc/euc_jp.c77
-rw-r--r--enc/sjis.c54
-rw-r--r--enc/utf8.c180
-rw-r--r--encoding.c6
-rw-r--r--include/ruby/encoding.h8
-rw-r--r--include/ruby/oniguruma.h29
-rw-r--r--io.c32
-rw-r--r--string.c54
-rw-r--r--test/ruby/test_m17n.rb258
10 files changed, 588 insertions, 143 deletions
diff --git a/ChangeLog b/ChangeLog
index 79f59dc0e..d2d08c84c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,36 @@
+Thu Dec 6 18:22:11 2007 Tanaka Akira <akr@fsij.org>
+
+ * encoding.c (rb_enc_precise_mbclen): new function for mbclen with
+ validation.
+
+ * include/ruby/encoding.h (rb_enc_precise_mbclen): declared.
+ (MBCLEN_CHARFOUND): new macro.
+ (MBCLEN_INVALID): new macro.
+ (MBCLEN_NEEDMORE): new macro.
+
+ * include/ruby/oniguruma.h (OnigEncodingTypeST): replace mbc_enc_len
+ by precise_mbc_enc_len.
+ (ONIGENC_PRECISE_MBC_ENC_LEN): new macro.
+ (ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND): new macro.
+ (ONIGENC_CONSTRUCT_MBCLEN_INVALID): new macro.
+ (ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE): new macro.
+ (ONIGENC_MBCLEN_CHARFOUND): new macro.
+ (ONIGENC_MBCLEN_INVALID): new macro.
+ (ONIGENC_MBCLEN_NEEDMORE): new macro.
+ (ONIGENC_MBC_ENC_LEN): use ONIGENC_PRECISE_MBC_ENC_LEN.
+
+ * enc/euc_jp.c: validation implemented.
+
+ * enc/sjis.c: ditto.
+
+ * enc/utf8.c: ditto.
+
+ * string.c (rb_str_inspect): use rb_enc_precise_mbclen for invalid
+ encoding.
+ (rb_str_valid_encoding_p): new method String#valid_encoding?.
+
+ * io.c (rb_io_getc): use rb_enc_precise_mbclen.
+
Thu Dec 6 01:37:23 2007 Nobuyoshi Nakada <nobu@ruby-lang.org>
* regparse.c (i_apply_case_fold): fix for negative character class. a
diff --git a/enc/euc_jp.c b/enc/euc_jp.c
index f48c90494..ea2a8e072 100644
--- a/enc/euc_jp.c
+++ b/enc/euc_jp.c
@@ -50,10 +50,85 @@ static const int EncLen_EUCJP[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {
+ { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
+ /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
+ },
+ { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
+ },
+ { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
+ },
+
+};
+#undef A
+#undef F
+
static int
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{
- return EncLen_EUCJP[*p];
+ int firstbyte = *p++;
+ state_t s;
+ s = trans[0][firstbyte];
+ if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
+ ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+ if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
+ s = trans[s][*p++];
+ if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
+ ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+ if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
+ s = trans[s][*p++];
+ return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
+ ONIGENC_CONSTRUCT_MBCLEN_INVALID();
}
static OnigCodePoint
diff --git a/enc/sjis.c b/enc/sjis.c
index fbfddcc61..ff7da3427 100644
--- a/enc/sjis.c
+++ b/enc/sjis.c
@@ -70,10 +70,62 @@ static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
#define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1)
#define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)]
+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {
+ { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
+ },
+ { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
+ /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
+ }
+};
+#undef A
+#undef F
+
static int
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{
- return EncLen_SJIS[*p];
+ int firstbyte = *p++;
+ state_t s;
+ s = trans[0][firstbyte];
+ if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
+ ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+ if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
+ s = trans[s][*p++];
+ return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
+ ONIGENC_CONSTRUCT_MBCLEN_INVALID();
}
static int
diff --git a/enc/utf8.c b/enc/utf8.c
index a2acd5eee..24a385448 100644
--- a/enc/utf8.c
+++ b/enc/utf8.c
@@ -56,13 +56,189 @@ static const int EncLen_UTF8[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
+ 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
+typedef enum {
+ FAILURE = -2,
+ ACCEPT,
+ S0, S1, S2, S3,
+ S4, S5, S6, S7
+} state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {
+ { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
+ /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
+ },
+ { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+ /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
+ },
+ { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
+ },
+ { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
+ },
+ { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
+ },
+ { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
+ },
+ { /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
+ },
+ { /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+ /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
+ },
+};
+#undef A
+#undef F
+
static int
utf8_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{
- return EncLen_UTF8[*p];
+ int firstbyte = *p++;
+ state_t s;
+ s = trans[0][firstbyte];
+ if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
+ ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+
+ if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
+ s = trans[s][*p++];
+ if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
+ ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+
+ if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
+ s = trans[s][*p++];
+ if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
+ ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+
+ if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
+ s = trans[s][*p++];
+ return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
+ ONIGENC_CONSTRUCT_MBCLEN_INVALID();
}
static int
diff --git a/encoding.c b/encoding.c
index ec7405f96..53ceac851 100644
--- a/encoding.c
+++ b/encoding.c
@@ -495,6 +495,12 @@ rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
}
int
+rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
+{
+ return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
+}
+
+int
rb_enc_codelen(int c, rb_encoding *enc)
{
int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h
index bdef19133..a800f5427 100644
--- a/include/ruby/encoding.h
+++ b/include/ruby/encoding.h
@@ -68,9 +68,15 @@ rb_encoding * rb_enc_find(const char *name);
#define rb_enc_mbminlen(enc) (enc)->min_enc_len
#define rb_enc_mbmaxlen(enc) (enc)->max_enc_len
-/* ptr,encoding -> mbclen */
+/* ptr,endptr,encoding -> mbclen */
int rb_enc_mbclen(const char*, const char *, rb_encoding*);
+/* ptr,endptr,encoding -> chlen, invalid or needmore */
+int rb_enc_precise_mbclen(const char*, const char *, rb_encoding*);
+#define MBCLEN_CHARFOUND(ret) ONIGENC_MBCLEN_CHARFOUND(ret)
+#define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret)
+#define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret)
+
/* code,encoding -> codelen */
int rb_enc_codelen(int, rb_encoding*);
diff --git a/include/ruby/oniguruma.h b/include/ruby/oniguruma.h
index 0a1f614bd..df4d07b11 100644
--- a/include/ruby/oniguruma.h
+++ b/include/ruby/oniguruma.h
@@ -144,7 +144,7 @@ typedef struct {
typedef int (*OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint* to, int to_len, void* arg);
typedef struct OnigEncodingTypeST {
- int (*mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc);
+ int (*precise_mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc);
const char* name;
int max_enc_len;
int min_enc_len;
@@ -282,7 +282,32 @@ ONIG_EXTERN OnigEncodingType OnigEncodingGB18030;
#define ONIGENC_STEP_BACK(enc,start,s,n) \
onigenc_step_back((enc),(start),(s),(n))
-#define ONIGENC_MBC_ENC_LEN(enc,p,e) (enc)->mbc_enc_len(p,e,enc)
+
+#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) (n) /* positive result: a character was found; the value is its length in bytes */
+#define ONIGENC_CONSTRUCT_MBCLEN_INVALID() (-1) /* exactly -1: the byte sequence is invalid */
+#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n) (-1-(n)) /* below -1: n more bytes are needed; (n) is parenthesized so expression arguments such as len-1 are subtracted as a whole */
+
+static inline int onigenc_mbclen_charfound(int r) { return 0 < r ? r : 0; }
+static inline int onigenc_mbclen_needmore(int r) { return r < -1 ? -1 - r : 0; }
+#define ONIGENC_MBCLEN_CHARFOUND(r) onigenc_mbclen_charfound(r)
+#define ONIGENC_MBCLEN_INVALID(r) ((r) == -1)
+#define ONIGENC_MBCLEN_NEEDMORE(r) onigenc_mbclen_needmore(r)
+
+#define ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e) (enc)->precise_mbc_enc_len(p,e,enc)
+
+static inline int onigenc_mbclen_recover(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc)
+{
+ int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e);
+ int r;
+ if (ONIGENC_MBCLEN_INVALID(ret))
+ return 1;
+ else if ((r = ONIGENC_MBCLEN_NEEDMORE(ret)))
+ return e-p+r;
+ else
+ return ONIGENC_MBCLEN_CHARFOUND(ret);
+}
+
+#define ONIGENC_MBC_ENC_LEN(enc,p,e) onigenc_mbclen_recover(p,e,enc)
#define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len)
#define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc)
#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len)
diff --git a/io.c b/io.c
index 0dd586678..12d4e01c5 100644
--- a/io.c
+++ b/io.c
@@ -2127,7 +2127,7 @@ rb_io_getc(VALUE io)
{
rb_encoding *enc;
rb_io_t *fptr;
- int n, left;
+ int r, n;
VALUE str;
GetOpenFile(io, fptr);
@@ -2138,22 +2138,30 @@ rb_io_getc(VALUE io)
if (io_fillbuf(fptr) < 0) {
return Qnil;
}
- n = rb_enc_mbclen(fptr->rbuf+fptr->rbuf_off, fptr->rbuf+fptr->rbuf_len, enc);
- if (n < fptr->rbuf_len) {
+ r = rb_enc_precise_mbclen(fptr->rbuf+fptr->rbuf_off, fptr->rbuf+fptr->rbuf_off+fptr->rbuf_len, enc);
+ if ((n = MBCLEN_CHARFOUND(r)) != 0 && n <= fptr->rbuf_len) {
str = rb_str_new(fptr->rbuf+fptr->rbuf_off, n);
fptr->rbuf_off += n;
fptr->rbuf_len -= n;
}
+ else if (MBCLEN_NEEDMORE(r)) {
+ str = rb_str_new(fptr->rbuf+fptr->rbuf_off, fptr->rbuf_len);
+ fptr->rbuf_len = 0;
+getc_needmore:
+ if (io_fillbuf(fptr) != -1) {
+ rb_str_cat(str, fptr->rbuf+fptr->rbuf_off, 1);
+ fptr->rbuf_off++;
+ fptr->rbuf_len--;
+ r = rb_enc_precise_mbclen(RSTRING_PTR(str), RSTRING_PTR(str)+RSTRING_LEN(str), enc);
+ if (MBCLEN_NEEDMORE(r)) {
+ goto getc_needmore;
+ }
+ }
+ }
else {
- str = rb_str_new(0, n);
- left = fptr->rbuf_len;
- MEMCPY(RSTRING_PTR(str), fptr->rbuf+fptr->rbuf_off, char, left);
- if (io_fillbuf(fptr) < 0) {
- return Qnil;
- }
- MEMCPY(RSTRING_PTR(str)+left, fptr->rbuf, char, n-left);
- fptr->rbuf_off += left;
- fptr->rbuf_len -= left;
+ str = rb_str_new(fptr->rbuf+fptr->rbuf_off, 1);
+ fptr->rbuf_off++;
+ fptr->rbuf_len--;
}
rb_enc_associate(str, enc);
diff --git a/string.c b/string.c
index 5ca4dc425..20f2e3823 100644
--- a/string.c
+++ b/string.c
@@ -2919,10 +2919,20 @@ rb_str_inspect(VALUE str)
str_cat_char(result, '"', enc);
p = RSTRING_PTR(str); pend = RSTRING_END(str);
while (p < pend) {
- int c = rb_enc_codepoint(p, pend, enc);
- int n = rb_enc_codelen(c, enc);
+ int c;
+ int n;
int cc;
+ n = rb_enc_precise_mbclen(p, pend, enc);
+ if (!MBCLEN_CHARFOUND(n)) {
+ p++;
+ n = 1;
+ goto escape_codepoint;
+ }
+
+ c = rb_enc_codepoint(p, pend, enc);
+ n = rb_enc_codelen(c, enc);
+
p += n;
if (c == '"'|| c == '\\' ||
(c == '#' && (cc = rb_enc_codepoint(p,pend,enc),
@@ -2954,19 +2964,21 @@ rb_str_inspect(VALUE str)
prefix_escape(result, 'e', enc);
}
else if (rb_enc_isprint(c, enc)) {
- char buf[5];
-
- rb_enc_mbcput(c, buf, enc);
- rb_str_buf_cat(result, buf, n);
+ rb_str_buf_cat(result, p-n, n);
}
else {
char buf[5];
- char *s = buf;
+ char *s;
+ char *q;
- sprintf(buf, "\\%03o", c & 0377);
- while (*s) {
- str_cat_char(result, *s++, enc);
- }
+escape_codepoint:
+ for (q = p-n; q < p; q++) {
+ s = buf;
+ sprintf(buf, "\\%03o", *q & 0377);
+ while (*s) {
+ str_cat_char(result, *s++, enc);
+ }
+ }
}
}
str_cat_char(result, '"', enc);
@@ -5232,6 +5244,25 @@ rb_str_force_encoding(VALUE str, VALUE enc)
return str;
}
+static VALUE
+rb_str_valid_encoding_p(VALUE str)
+{
+ char *p = RSTRING_PTR(str);
+ char *pend = RSTRING_END(str);
+ rb_encoding *enc = rb_enc_get(str);
+
+ while (p < pend) {
+ int n;
+
+ n = rb_enc_precise_mbclen(p, pend, enc);
+ if (!MBCLEN_CHARFOUND(n)) {
+ return Qfalse;
+ }
+ p += n;
+ }
+ return Qtrue;
+}
+
/**********************************************************************
* Document-class: Symbol
*
@@ -5644,6 +5675,7 @@ Init_String(void)
rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
+ rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
id_to_s = rb_intern("to_s");
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index dc932d7cb..02c8dca4d 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -26,14 +26,46 @@ class TestM17N < Test::Unit::TestCase
end
def test_string_mixed_unicode
- assert_raise(SyntaxError) { eval(a(%{"\xc0\xa0\\u{6666}"})) }
- assert_raise(SyntaxError) { eval(e(%{"\xc0\xa0\\u{6666}"})) }
- assert_raise(SyntaxError) { eval(s(%{"\xc0\xa0\\u{6666}"})) }
- assert_nothing_raised { eval(u(%{"\xc0\xa0\\u{6666}"})) }
- assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc0\xa0"})) }
- assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc0\xa0"})) }
- assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc0\xa0"})) }
- assert_nothing_raised { eval(u(%{"\\u{6666}\xc0\xa0"})) }
+ assert_raise(SyntaxError) { eval(a(%{"\xc2\xa0\\u{6666}"})) }
+ assert_raise(SyntaxError) { eval(e(%{"\xc2\xa0\\u{6666}"})) }
+ assert_raise(SyntaxError) { eval(s(%{"\xc2\xa0\\u{6666}"})) }
+ assert_nothing_raised { eval(u(%{"\xc2\xa0\\u{6666}"})) }
+ assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc2\xa0"})) }
+ assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc2\xa0"})) }
+ assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc2\xa0"})) }
+ assert_nothing_raised { eval(u(%{"\\u{6666}\xc2\xa0"})) }
+ end
+
+ def test_string_inspect
+ assert_equal('"\376"', e("\xfe").inspect)
+ assert_equal('"\216"', e("\x8e").inspect)
+ assert_equal('"\217"', e("\x8f").inspect)
+ assert_equal('"\217\241"', e("\x8f\xa1").inspect)
+ assert_equal('"\357"', s("\xef").inspect)
+ assert_equal('"\302"', u("\xc2").inspect)
+ assert_equal('"\340\200"', u("\xe0\x80").inspect)
+ assert_equal('"\360\200\200"', u("\xf0\x80\x80").inspect)
+ assert_equal('"\370\200\200\200"', u("\xf8\x80\x80\x80").inspect)
+ assert_equal('"\374\200\200\200\200"', u("\xfc\x80\x80\x80\x80").inspect)
+
+ assert_equal('"\376 "', e("\xfe ").inspect)
+ assert_equal('"\216 "', e("\x8e ").inspect)
+ assert_equal('"\217 "', e("\x8f ").inspect)
+ assert_equal('"\217\241 "', e("\x8f\xa1 ").inspect)
+ assert_equal('"\357 "', s("\xef ").inspect)
+ assert_equal('"\302 "', u("\xc2 ").inspect)
+ assert_equal('"\340\200 "', u("\xe0\x80 ").inspect)
+ assert_equal('"\360\200\200 "', u("\xf0\x80\x80 ").inspect)
+ assert_equal('"\370\200\200\200 "', u("\xf8\x80\x80\x80 ").inspect)
+ assert_equal('"\374\200\200\200\200 "', u("\xfc\x80\x80\x80\x80 ").inspect)
+
+
+ assert_equal(e("\"\\241\x8f\xa1\xa1\""), e("\xa1\x8f\xa1\xa1").inspect)
+
+ assert_equal('"\201."', s("\x81.").inspect)
+ assert_equal(s("\"\x81@\""), s("\x81@").inspect)
+
+ assert_equal('"\374"', u("\xfc").inspect)
end
def test_regexp_too_short_multibyte_character
@@ -42,27 +74,27 @@ class TestM17N < Test::Unit::TestCase
assert_raise(SyntaxError) { eval('/\x8f/e') }
assert_raise(SyntaxError) { eval('/\x8f\xa1/e') }
assert_raise(SyntaxError) { eval('/\xef/s') }
- assert_raise(SyntaxError) { eval('/\xc0/u') }
+ assert_raise(SyntaxError) { eval('/\xc2/u') }
assert_raise(SyntaxError) { eval('/\xe0\x80/u') }
assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') }
- assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') }
- assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
+ #assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') }
+ #assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
# raw 8bit
assert_raise(SyntaxError) { eval("/\xfe/e") }
- assert_raise(SyntaxError) { eval("/\xc0/u") }
+ assert_raise(SyntaxError) { eval("/\xc2/u") }
# invalid suffix
- assert_raise(SyntaxError) { eval('/\xc0\xff/u') }
- assert_raise(SyntaxError) { eval('/\xc0 /u') }
- #assert_raise(SyntaxError) { eval('/\xc0\x20/u') }
+ assert_raise(SyntaxError) { eval('/\xc2\xff/u') }
+ assert_raise(SyntaxError) { eval('/\xc2 /u') }
+ #assert_raise(SyntaxError) { eval('/\xc2\x20/u') }
end
def assert_regexp_generic_encoding(r)
assert(!r.fixed_encoding?)
%w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename|
- # "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8.
- assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(ename) }
+ # "\xc2\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8.
+ assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(ename) }
}
end
@@ -71,9 +103,9 @@ class TestM17N < Test::Unit::TestCase
%w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename|
enc = Encoding.find(ename)
if enc == r.encoding
- assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(enc) }
+ assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(enc) }
else
- assert_raise(ArgumentError) { r =~ "\xc0\xa1".force_encoding(enc) }
+ assert_raise(ArgumentError) { r =~ "\xc2\xa1".force_encoding(enc) }
end
}
end
@@ -115,77 +147,77 @@ class TestM17N < Test::Unit::TestCase
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
assert_equal(0, r =~ u("a"))
- assert_equal(nil, r =~ a("\xc0\xa1"))
- assert_equal(nil, r =~ e("\xc0\xa1"))
- assert_equal(nil, r =~ s("\xc0\xa1"))
- assert_equal(nil, r =~ u("\xc0\xa1"))
+ assert_equal(nil, r =~ a("\xc2\xa1"))
+ assert_equal(nil, r =~ e("\xc2\xa1"))
+ assert_equal(nil, r =~ s("\xc2\xa1"))
+ assert_equal(nil, r =~ u("\xc2\xa1"))
}
end
def test_regexp_ascii
assert_regexp_fixed_ascii8bit(/a/n)
- assert_regexp_fixed_ascii8bit(/\xc0\xa1/n)
- assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/})))
- assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/n})))
- assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/})))
+ assert_regexp_fixed_ascii8bit(/\xc2\xa1/n)
+ assert_regexp_fixed_ascii8bit(eval(a(%{/\xc2\xa1/})))
+ assert_regexp_fixed_ascii8bit(eval(a(%{/\xc2\xa1/n})))
+ assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc2\xa1/})))
[/a/n].each {|r|
assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
assert_equal(0, r =~ u("a"))
- assert_equal(nil, r =~ a("\xc0\xa1"))
- assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
- assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
- assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+ assert_equal(nil, r =~ a("\xc2\xa1"))
+ assert_raise(ArgumentError) { r =~ e("\xc2\xa1") }
+ assert_raise(ArgumentError) { r =~ s("\xc2\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc2\xa1") }
}
- [/\xc0\xa1/n, eval(a(%{/\xc0\xa1/})), eval(a(%{/\xc0\xa1/n}))].each {|r|
+ [/\xc2\xa1/n, eval(a(%{/\xc2\xa1/})), eval(a(%{/\xc2\xa1/n}))].each {|r|
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
assert_equal(nil, r =~ u("a"))
- assert_equal(0, r =~ a("\xc0\xa1"))
- assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
- assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
- assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+ assert_equal(0, r =~ a("\xc2\xa1"))
+ assert_raise(ArgumentError) { r =~ e("\xc2\xa1") }
+ assert_raise(ArgumentError) { r =~ s("\xc2\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc2\xa1") }
}
end
def test_regexp_euc
assert_regexp_fixed_eucjp(/a/e)
- assert_regexp_fixed_eucjp(/\xc0\xa1/e)
- assert_regexp_fixed_eucjp(eval(e(%{/\xc0\xa1/})))
- assert_regexp_fixed_eucjp(eval(e(%q{/\xc0\xa1/})))
+ assert_regexp_fixed_eucjp(/\xc2\xa1/e)
+ assert_regexp_fixed_eucjp(eval(e(%{/\xc2\xa1/})))
+ assert_regexp_fixed_eucjp(eval(e(%q{/\xc2\xa1/})))
[/a/e].each {|r|
assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
assert_equal(0, r =~ u("a"))
- assert_raise(ArgumentError) { r =~ a("\xc0\xa1") }
- assert_equal(nil, r =~ e("\xc0\xa1"))
- assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
- assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ a("\xc2\xa1") }
+ assert_equal(nil, r =~ e("\xc2\xa1"))
+ assert_raise(ArgumentError) { r =~ s("\xc2\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc2\xa1") }
}
- [/\xc0\xa1/e, eval(e(%{/\xc0\xa1/})), eval(e(%q{/\xc0\xa1/}))].each {|r|
+ [/\xc2\xa1/e, eval(e(%{/\xc2\xa1/})), eval(e(%q{/\xc2\xa1/}))].each {|r|
assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a"))
assert_equal(nil, r =~ u("a"))
- assert_raise(ArgumentError) { r =~ a("\xc0\xa1") }
- assert_equal(0, r =~ e("\xc0\xa1"))
- assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
- assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ a("\xc2\xa1") }
+ assert_equal(0, r =~ e("\xc2\xa1"))
+ assert_raise(ArgumentError) { r =~ s("\xc2\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc2\xa1") }
}
end
def test_regexp_sjis
assert_regexp_fixed_sjis(/a/s)
- assert_regexp_fixed_sjis(/\xc0\xa1/s)
- assert_regexp_fixed_sjis(eval(s(%{/\xc0\xa1/})))
- assert_regexp_fixed_sjis(eval(s(%q{/\xc0\xa1/})))
+ assert_regexp_fixed_sjis(/\xc2\xa1/s)
+ assert_regexp_fixed_sjis(eval(s(%{/\xc2\xa1/})))
+ assert_regexp_fixed_sjis(eval(s(%q{/\xc2\xa1/})))
end
def test_begin_end_offset
@@ -223,10 +255,10 @@ class TestM17N < Test::Unit::TestCase
assert_encoding("ASCII-8BIT", Regexp.quote(s("a")).encoding)
assert_encoding("ASCII-8BIT", Regexp.quote(u("a")).encoding)
- assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc0\xa1")).encoding)
- assert_encoding("EUC-JP", Regexp.quote(e("\xc0\xa1")).encoding)
- assert_encoding("Shift_JIS", Regexp.quote(s("\xc0\xa1")).encoding)
- assert_encoding("UTF-8", Regexp.quote(u("\xc0\xa1")).encoding)
+ assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc2\xa1")).encoding)
+ assert_encoding("EUC-JP", Regexp.quote(e("\xc2\xa1")).encoding)
+ assert_encoding("Shift_JIS", Regexp.quote(s("\xc2\xa1")).encoding)
+ assert_encoding("UTF-8", Regexp.quote(u("\xc2\xa1")).encoding)
end
def test_union_0
@@ -254,10 +286,10 @@ class TestM17N < Test::Unit::TestCase
end
def test_union_1_nonascii_string
- assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc0\xa1")))
- assert_regexp_fixed_eucjp(Regexp.union(e("\xc0\xa1")))
- assert_regexp_fixed_sjis(Regexp.union(s("\xc0\xa1")))
- assert_regexp_fixed_utf8(Regexp.union(u("\xc0\xa1")))
+ assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc2\xa1")))
+ assert_regexp_fixed_eucjp(Regexp.union(e("\xc2\xa1")))
+ assert_regexp_fixed_sjis(Regexp.union(s("\xc2\xa1")))
+ assert_regexp_fixed_utf8(Regexp.union(u("\xc2\xa1")))
end
def test_union_1_regexp
@@ -271,7 +303,7 @@ class TestM17N < Test::Unit::TestCase
def test_union_2
ary = [
a(""), e(""), s(""), u(""),
- a("\xc0\xa1"), e("\xc0\xa1"), s("\xc0\xa1"), u("\xc0\xa1")
+ a("\xc2\xa1"), e("\xc2\xa1"), s("\xc2\xa1"), u("\xc2\xa1")
]
ary.each {|s1|
ary.each {|s2|
@@ -304,26 +336,26 @@ class TestM17N < Test::Unit::TestCase
def test_dynamic_ascii_regexp
assert_regexp_fixed_ascii8bit(/#{}/n)
- assert_regexp_fixed_ascii8bit(/#{}\xc0\xa1/n)
- assert_regexp_fixed_ascii8bit(/\xc0\xa1#{}/n)
- #assert_raise(SyntaxError) { eval('/\xc0#{}\xa1/s') }
- #assert_raise(SyntaxError) { s1, s2 = s('\xc0'), s('\xa1'); /#{s1}#{s2}/ }
+ assert_regexp_fixed_ascii8bit(/#{}\xc2\xa1/n)
+ assert_regexp_fixed_ascii8bit(/\xc2\xa1#{}/n)
+ #assert_raise(SyntaxError) { eval('/\xc2#{}\xa1/s') }
+ #assert_raise(SyntaxError) { s1, s2 = s('\xc2'), s('\xa1'); /#{s1}#{s2}/ }
end
def test_dynamic_eucjp_regexp
assert_regexp_fixed_eucjp(/#{}/e)
- assert_regexp_fixed_eucjp(/#{}\xc0\xa1/e)
- assert_regexp_fixed_eucjp(/\xc0\xa1#{}/e)
- assert_raise(RegexpError) { eval('/\xc0#{}/e') }
- assert_raise(RegexpError) { eval('/#{}\xc0/e') }
- #assert_raise(SyntaxError) { eval('/\xc0#{}\xa1/e') }
- #assert_raise(SyntaxError) { s1, s2 = e('\xc0'), e('\xa1'); /#{s1}#{s2}/ }
+ assert_regexp_fixed_eucjp(/#{}\xc2\xa1/e)
+ assert_regexp_fixed_eucjp(/\xc2\xa1#{}/e)
+ assert_raise(RegexpError) { eval('/\xc2#{}/e') }
+ assert_raise(RegexpError) { eval('/#{}\xc2/e') }
+ #assert_raise(SyntaxError) { eval('/\xc2#{}\xa1/e') }
+ #assert_raise(SyntaxError) { s1, s2 = e('\xc2'), e('\xa1'); /#{s1}#{s2}/ }
end
def test_dynamic_sjis_regexp
assert_regexp_fixed_sjis(/#{}/s)
- assert_regexp_fixed_sjis(/#{}\xc0\xa1/s)
- assert_regexp_fixed_sjis(/\xc0\xa1#{}/s)
+ assert_regexp_fixed_sjis(/#{}\xc2\xa1/s)
+ assert_regexp_fixed_sjis(/\xc2\xa1#{}/s)
assert_raise(RegexpError) { eval('/\x81#{}/s') }
assert_raise(RegexpError) { eval('/#{}\x81/s') }
#assert_raise(SyntaxError) { eval('/\x81#{}\xa1/s') }
@@ -332,49 +364,49 @@ class TestM17N < Test::Unit::TestCase
def test_dynamic_utf8_regexp
assert_regexp_fixed_utf8(/#{}/u)
- assert_regexp_fixed_utf8(/#{}\xc0\xa1/u)
- assert_regexp_fixed_utf8(/\xc0\xa1#{}/u)
- assert_raise(RegexpError) { eval('/\xc0#{}/u') }
- assert_raise(RegexpError) { eval('/#{}\xc0/u') }
- #assert_raise(SyntaxError) { eval('/\xc0#{}\xa1/u') }
- #assert_raise(SyntaxError) { s1, s2 = u('\xc0'), u('\xa1'); /#{s1}#{s2}/ }
+ assert_regexp_fixed_utf8(/#{}\xc2\xa1/u)
+ assert_regexp_fixed_utf8(/\xc2\xa1#{}/u)
+ assert_raise(RegexpError) { eval('/\xc2#{}/u') }
+ assert_raise(RegexpError) { eval('/#{}\xc2/u') }
+ #assert_raise(SyntaxError) { eval('/\xc2#{}\xa1/u') }
+ #assert_raise(SyntaxError) { s1, s2 = u('\xc2'), u('\xa1'); /#{s1}#{s2}/ }
end
def test_regexp_mixed_unicode
- assert_raise(SyntaxError) { eval(a(%{/\xc0\xa0\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(e(%{/\xc0\xa0\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(s(%{/\xc0\xa0\\u{6666}/})) }
- assert_nothing_raised { eval(u(%{/\xc0\xa0\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\xc0\xa0/})) }
- assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\xc0\xa0/})) }
- assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\xc0\xa0/})) }
- assert_nothing_raised { eval(u(%{/\\u{6666}\xc0\xa0/})) }
-
- assert_raise(SyntaxError) { eval(a(%{/\\xc0\\xa0\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(e(%{/\\xc0\\xa0\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(s(%{/\\xc0\\xa0\\u{6666}/})) }
- assert_nothing_raised { eval(u(%{/\\xc0\\xa0\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\\xc0\\xa0/})) }
- assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\\xc0\\xa0/})) }
- assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\\xc0\\xa0/})) }
- assert_nothing_raised { eval(u(%{/\\u{6666}\\xc0\\xa0/})) }
-
- assert_raise(SyntaxError) { eval(a(%{/\xc0\xa0#{}\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(e(%{/\xc0\xa0#{}\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(s(%{/\xc0\xa0#{}\\u{6666}/})) }
- assert_nothing_raised { eval(u(%{/\xc0\xa0#{}\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\xc0\xa0/})) }
- assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\xc0\xa0/})) }
- assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\xc0\xa0/})) }
- assert_nothing_raised { eval(u(%{/\\u{6666}#{}\xc0\xa0/})) }
-
- assert_raise(SyntaxError) { eval(a(%{/\\xc0\\xa0#{}\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(e(%{/\\xc0\\xa0#{}\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(s(%{/\\xc0\\xa0#{}\\u{6666}/})) }
- assert_nothing_raised { eval(u(%{/\\xc0\\xa0#{}\\u{6666}/})) }
- assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\\xc0\\xa0/})) }
- assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\\xc0\\xa0/})) }
- assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\\xc0\\xa0/})) }
- assert_nothing_raised { eval(u(%{/\\u{6666}#{}\\xc0\\xa0/})) }
+ assert_raise(SyntaxError) { eval(a(%{/\xc2\xa0\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(e(%{/\xc2\xa0\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(s(%{/\xc2\xa0\\u{6666}/})) }
+ assert_nothing_raised { eval(u(%{/\xc2\xa0\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\xc2\xa0/})) }
+ assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\xc2\xa0/})) }
+ assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\xc2\xa0/})) }
+ assert_nothing_raised { eval(u(%{/\\u{6666}\xc2\xa0/})) }
+
+ assert_raise(SyntaxError) { eval(a(%{/\\xc2\\xa0\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(e(%{/\\xc2\\xa0\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(s(%{/\\xc2\\xa0\\u{6666}/})) }
+ assert_nothing_raised { eval(u(%{/\\xc2\\xa0\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\\xc2\\xa0/})) }
+ assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\\xc2\\xa0/})) }
+ assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\\xc2\\xa0/})) }
+ assert_nothing_raised { eval(u(%{/\\u{6666}\\xc2\\xa0/})) }
+
+ assert_raise(SyntaxError) { eval(a(%{/\xc2\xa0#{}\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(e(%{/\xc2\xa0#{}\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(s(%{/\xc2\xa0#{}\\u{6666}/})) }
+ assert_nothing_raised { eval(u(%{/\xc2\xa0#{}\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\xc2\xa0/})) }
+ assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\xc2\xa0/})) }
+ assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\xc2\xa0/})) }
+ assert_nothing_raised { eval(u(%{/\\u{6666}#{}\xc2\xa0/})) }
+
+ assert_raise(SyntaxError) { eval(a(%{/\\xc2\\xa0#{}\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(e(%{/\\xc2\\xa0#{}\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(s(%{/\\xc2\\xa0#{}\\u{6666}/})) }
+ assert_nothing_raised { eval(u(%{/\\xc2\\xa0#{}\\u{6666}/})) }
+ assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\\xc2\\xa0/})) }
+ assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\\xc2\\xa0/})) }
+ assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\\xc2\\xa0/})) }
+ assert_nothing_raised { eval(u(%{/\\u{6666}#{}\\xc2\\xa0/})) }
end
end