diff options
-rw-r--r-- | ChangeLog | 39 | ||||
-rw-r--r-- | encoding.c | 32 | ||||
-rw-r--r-- | ext/tk/sample/tkextlib/vu/canvSticker2.rb | 1 | ||||
-rw-r--r-- | include/ruby/encoding.h | 3 | ||||
-rw-r--r-- | include/ruby/regex.h | 1 | ||||
-rw-r--r-- | parse.y | 22 | ||||
-rw-r--r-- | re.c | 54 | ||||
-rw-r--r-- | string.c | 8 | ||||
-rw-r--r-- | test/ruby/test_m17n.rb | 6 | ||||
-rw-r--r-- | test/ruby/test_regexp.rb | 6 |
10 files changed, 119 insertions, 53 deletions
@@ -1,3 +1,42 @@ +Sat Dec 8 11:06:29 2007 Tanaka Akira <akr@fsij.org> + + * encoding.c (rb_enc_mbclen): make it never fail. + (rb_enc_nth): don't check the return value of rb_enc_mbclen. + (rb_enc_strlen): ditto. + (rb_enc_precise_mbclen): return needmore(1) if e <= p. + (rb_enc_get_ascii): new function for extracting ASCII character. + + * include/ruby/encoding.h (rb_enc_get_ascii): declared. + + * include/ruby/regex.h (ismbchar): removed. + + * re.c (rb_reg_expr_str): use rb_enc_get_ascii. + (unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine + the termination of escaped non-ASCII character. + (unescape_nonascii): use rb_enc_precise_mbclen. + (rb_reg_quote): use rb_enc_get_ascii. + (rb_reg_regsub): use rb_enc_get_ascii. + + * string.c (rb_str_reverse): don't check the return value of + rb_enc_mbclen. + (rb_str_split_m): don't call rb_enc_mbclen with e <= p. + + * parse.y (is_identchar): use ISASCII. + (parser_ismbchar): removed. + (parser_precise_mbclen): new macro. + (parser_isascii): new macro. + (parser_tokadd_mbchar): use parser_precise_mbclen to check invalid + character precisely. + (parser_tokadd_string): use parser_isascii. + (parser_yylex): ditto. + (is_special_global_name): don't call is_identchar with e <= p. + (rb_enc_symname_p): ditto. + + [ruby-dev:32455] + + * ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie + because the encoding is not UTF-8. 
[ruby-dev:32475] + Fri Dec 7 20:21:35 2007 GOTOU Yuuzou <gotoyuzo@notwork.org> * ext/openssl/lib/net/ftptls.rb, ext/openssl/lib/net/telnets.rb: diff --git a/encoding.c b/encoding.c index 53ceac851..540aa8870 100644 --- a/encoding.c +++ b/encoding.c @@ -459,7 +459,6 @@ rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc) for (c=0; p<e && nth--; c++) { int n = rb_enc_mbclen(p, e, enc); - if (n == 0) return 0; p += n; } } @@ -478,7 +477,6 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) for (c=0; p<e; c++) { int n = rb_enc_mbclen(p, e, enc); - if (n == 0) return -1; p += n; } return c; @@ -487,19 +485,39 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) { - int n = ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); - if (n == 0) { - rb_raise(rb_eArgError, "invalid mbstring sequence"); - } - return n; + int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); + if (MBCLEN_CHARFOUND(n)) + return n; + else + return 1; } int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) { + if (e <= p) + return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); } +int rb_enc_get_ascii(const char *p, const char *e, rb_encoding *enc) +{ + int c, l; + if (e <= p) + return -1; + if (rb_enc_asciicompat(enc)) { + c = (unsigned char)*p; + return ISASCII(c) ? 
c : -1; + } + l = rb_enc_precise_mbclen(p, e, enc); + if (!MBCLEN_CHARFOUND(l)) + return -1; + c = rb_enc_codepoint(p, e, enc); + if (rb_enc_isascii(c, enc)) + return c; + return -1; +} + int rb_enc_codelen(int c, rb_encoding *enc) { diff --git a/ext/tk/sample/tkextlib/vu/canvSticker2.rb b/ext/tk/sample/tkextlib/vu/canvSticker2.rb index 21f098a19..f54e74866 100644 --- a/ext/tk/sample/tkextlib/vu/canvSticker2.rb +++ b/ext/tk/sample/tkextlib/vu/canvSticker2.rb @@ -1,5 +1,4 @@ #!/usr/bin/env ruby -# -*- coding: utf-8 -*- require 'tk' require 'tkextlib/vu/charts' diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index a800f5427..6924e955a 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -77,6 +77,9 @@ int rb_enc_precise_mbclen(const char*, const char *, rb_encoding*); #define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret) #define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret) +/* ptr,endptr,encoding -> 0x00..0x7f, -1 */ +int rb_enc_get_ascii(const char*, const char *, rb_encoding*); + /* code,encoding -> codelen */ int rb_enc_codelen(int, rb_encoding*); diff --git a/include/ruby/regex.h b/include/ruby/regex.h index b214c63d3..d0a670f28 100644 --- a/include/ruby/regex.h +++ b/include/ruby/regex.h @@ -29,7 +29,6 @@ extern "C" { ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; -#define ismbchar(p, e, enc) (mbclen((p),(e),(enc)) != 1) #define mbclen(p,e,enc) rb_enc_mbclen((p),(e),(enc)) #endif /* ifndef ONIG_RUBY_M17N */ @@ -4583,10 +4583,12 @@ ripper_dispatch_delayed_token(struct parser_params *parser, int t) #endif #define parser_mbclen() mbclen((lex_p-1),lex_pend,parser->enc) -#define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || ismbchar(p,e,enc)) -#define parser_ismbchar() ismbchar((lex_p-1), lex_pend, parser->enc) +#define parser_precise_mbclen() rb_enc_precise_mbclen((lex_p-1),lex_pend,parser->enc) +#define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || !ISASCII(*p)) #define 
parser_is_identchar() (!parser->eofp && is_identchar((lex_p-1),lex_pend,parser->enc)) +#define parser_isascii() ISASCII(*(lex_p-1)) + static int parser_yyerror(struct parser_params *parser, const char *msg) { @@ -5305,8 +5307,8 @@ dispose_string(VALUE str) static int parser_tokadd_mbchar(struct parser_params *parser, int c) { - int len = parser_mbclen(); - if (len <= 0 || lex_p + len - 1 > lex_pend) { + int len = parser_precise_mbclen(); + if (!MBCLEN_CHARFOUND(len)) { compile_error(PARSER_ARG "illegal multibyte char"); return -1; } @@ -5414,7 +5416,7 @@ parser_tokadd_string(struct parser_params *parser, } } } - else if (parser_ismbchar()) { + else if (!parser_isascii()) { has_nonascii = 1; if (enc != *encp) { mixed_error(enc, *encp); @@ -6306,7 +6308,7 @@ parser_yylex(struct parser_params *parser) } newtok(); enc = parser->enc; - if (parser_ismbchar()) { + if (!parser_isascii()) { if (tokadd_mbchar(c) == -1) return 0; } else if ((rb_enc_isalnum(c, parser->enc) || c == '_') && @@ -6889,7 +6891,7 @@ parser_yylex(struct parser_params *parser) } else { term = nextc(); - if (rb_enc_isalnum(term, parser->enc) || parser_ismbchar()) { + if (rb_enc_isalnum(term, parser->enc) || !parser_isascii()) { yyerror("unknown type of %string"); return 0; } @@ -8693,7 +8695,7 @@ is_special_global_name(const char *m, const char *e, rb_encoding *enc) break; case '-': ++m; - if (is_identchar(m, e, enc)) { + if (m < e && is_identchar(m, e, enc)) { if (!ISASCII(*m)) mb = 1; m += rb_enc_mbclen(m, e, enc); } @@ -8776,9 +8778,9 @@ rb_enc_symname_p(const char *name, rb_encoding *enc) default: localid = !rb_enc_isupper(*m, enc); id: - if (*m != '_' && !rb_enc_isalpha(*m, enc) && !ismbchar(m, e, enc)) + if (m >= e || (*m != '_' && !rb_enc_isalpha(*m, enc) && ISASCII(*m))) return Qfalse; - while (is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc); + while (m < e && is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc); if (localid) { switch (*m) { case '!': case '?': case '=': ++m; @@ 
-218,10 +218,12 @@ rb_reg_expr_str(VALUE str, const char *s, long len) rb_encoding *enc = rb_enc_get(str); const char *p, *pend; int need_escape = 0; + int c; p = s; pend = p + len; while (p<pend) { - if (*p == '/' || (!rb_enc_isprint(*p, enc) && !ismbchar(p, pend, enc))) { + c = rb_enc_get_ascii(p, pend, enc); + if (c == '/' || (c != -1 && !rb_enc_isprint(c, enc))) { need_escape = 1; break; } @@ -233,29 +235,31 @@ rb_reg_expr_str(VALUE str, const char *s, long len) else { p = s; while (p<pend) { - if (*p == '\\') { + c = rb_enc_get_ascii(p, pend, enc); + if (c == '\\') { int n = mbclen(p+1, pend, enc) + 1; rb_str_buf_cat(str, p, n); p += n; continue; } - else if (*p == '/') { + else if (c == '/') { char c = '\\'; rb_str_buf_cat(str, &c, 1); rb_str_buf_cat(str, p, 1); } - else if (ismbchar(p, pend, enc)) { - rb_str_buf_cat(str, p, mbclen(p, pend, enc)); - p += mbclen(p, pend, enc); + else if (c == -1) { + int l = mbclen(p, pend, enc); + rb_str_buf_cat(str, p, l); + p += l; continue; } - else if (rb_enc_isprint(*p, enc)) { + else if (rb_enc_isprint(c, enc)) { rb_str_buf_cat(str, p, 1); } - else if (!rb_enc_isspace(*p, enc)) { + else if (!rb_enc_isspace(c, enc)) { char b[8]; - sprintf(b, "\\%03o", *p & 0377); + sprintf(b, "\\%03o", c); rb_str_buf_cat(str, b, 4); } else { @@ -1377,6 +1381,7 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, char *chbuf = ALLOCA_N(char, chmaxlen); int chlen = 0; int byte; + int l; memset(chbuf, 0, chmaxlen); @@ -1386,7 +1391,8 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, } chbuf[chlen++] = byte; - while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chlen, enc)) { + while (chlen < chmaxlen && + MBCLEN_NEEDMORE(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) { byte = read_escaped_byte(&p, end, err); if (byte == -1) { return -1; @@ -1394,11 +1400,11 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, chbuf[chlen++] = byte; } - if (chlen != 
mbclen(chbuf, chbuf+chlen, enc)) { + l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc); + if (MBCLEN_INVALID(l)) { strcpy(err, "invalid multibyte escape"); return -1; } - if (1 < chlen || (chbuf[0] & 0x80)) { rb_str_buf_cat(buf, chbuf, chlen); @@ -1515,13 +1521,12 @@ unescape_nonascii(const char *p, const char *end, rb_encoding *enc, char smallbuf[2]; while (p < end) { - int chlen = mbclen(p, end, enc); + int chlen = rb_enc_precise_mbclen(p, end, enc); + if (!MBCLEN_CHARFOUND(chlen)) { + strcpy(err, "invalid multibyte character"); + return -1; + } if (1 < chlen || (*p & 0x80)) { - if (end < p + chlen) { - strcpy(err, "too short multibyte character"); - return -1; - } - /* xxx: validate the non-ascii character */ rb_str_buf_cat(buf, p, chlen); p += chlen; if (*encp == 0) @@ -2093,8 +2098,8 @@ rb_reg_quote(VALUE str) s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); for (; s < send; s++) { - c = *s; - if (ismbchar(s, send, enc)) { + c = rb_enc_get_ascii(s, send, enc); + if (c == -1) { int n = mbclen(s, send, enc); while (n-- && s < send) @@ -2129,8 +2134,8 @@ rb_reg_quote(VALUE str) t += s - RSTRING_PTR(str); for (; s < send; s++) { - c = *s; - if (ismbchar(s, send, enc)) { + c = rb_enc_get_ascii(s, send, enc); + if (c == -1) { int n = mbclen(s, send, enc); while (n-- && s < send) @@ -2397,13 +2402,14 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) e = s + RSTRING_LEN(str); while (s < e) { + int c = rb_enc_get_ascii(s, e, enc); char *ss = s++; - if (ismbchar(ss, e, enc)) { + if (c == -1) { s += mbclen(ss, e, enc) - 1; continue; } - if (*ss != '\\' || s == e) continue; + if (c != '\\' || s == e) continue; if (!val) { val = rb_str_buf_new(ss-p); @@ -2725,9 +2725,6 @@ rb_str_reverse(VALUE str) while (s < e) { int clen = rb_enc_mbclen(s, e, enc); - if (clen == 0) { - rb_raise(rb_eArgError, "invalid mbstring sequence"); - } p -= clen; memcpy(p, s, clen); s += clen; @@ -4079,7 +4076,10 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) 
beg = start; } else { - start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc); + if (RSTRING_PTR(str)+start == RSTRING_END(str)) + start++; + else + start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc); last_null = 1; continue; } diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index 02c8dca4d..df59c8877 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -77,8 +77,8 @@ class TestM17N < Test::Unit::TestCase assert_raise(SyntaxError) { eval('/\xc2/u') } assert_raise(SyntaxError) { eval('/\xe0\x80/u') } assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') } - #assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } - #assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } + assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } + assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } # raw 8bit assert_raise(SyntaxError) { eval("/\xfe/e") } @@ -87,7 +87,7 @@ class TestM17N < Test::Unit::TestCase # invalid suffix assert_raise(SyntaxError) { eval('/\xc2\xff/u') } assert_raise(SyntaxError) { eval('/\xc2 /u') } - #assert_raise(SyntaxError) { eval('/\xc2\x20/u') } + assert_raise(SyntaxError) { eval('/\xc2\x20/u') } end def assert_regexp_generic_encoding(r) diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb index 9cb8c4ac1..84710bb44 100644 --- a/test/ruby/test_regexp.rb +++ b/test/ruby/test_regexp.rb @@ -20,7 +20,7 @@ class TestRegexp < Test::Unit::TestCase def test_yoshidam_net_20041111_2 assert_raise(RegexpError) do - s = "[\xFF-\xFF]" + s = "[\xFF-\xFF]".force_encoding("utf-8") Regexp.new(s, nil, "u") end end @@ -42,8 +42,8 @@ class TestRegexp < Test::Unit::TestCase assert_equal :ok, begin Regexp.union( "a", - Regexp.new("\x80".force_encoding("euc-jp")), - Regexp.new("\x80".force_encoding("utf-8"))) + Regexp.new("\xc2\xa1".force_encoding("euc-jp")), + Regexp.new("\xc2\xa1".force_encoding("utf-8"))) :ng rescue ArgumentError :ok |