summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 39
-rw-r--r-- encoding.c 32
-rw-r--r-- ext/tk/sample/tkextlib/vu/canvSticker2.rb 1
-rw-r--r-- include/ruby/encoding.h 3
-rw-r--r-- include/ruby/regex.h 1
-rw-r--r-- parse.y 22
-rw-r--r-- re.c 54
-rw-r--r-- string.c 8
-rw-r--r-- test/ruby/test_m17n.rb 6
-rw-r--r-- test/ruby/test_regexp.rb 6
10 files changed, 119 insertions, 53 deletions
diff --git a/ChangeLog b/ChangeLog
index 349fc9c6d..8ce4954ba 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,42 @@
+Sat Dec 8 11:06:29 2007 Tanaka Akira <akr@fsij.org>
+
+ * encoding.c (rb_enc_mbclen): make it never fail.
+ (rb_enc_nth): don't check the return value of rb_enc_mbclen.
+ (rb_enc_strlen): ditto.
+ (rb_enc_precise_mbclen): return needmore(1) if e <= p.
+ (rb_enc_get_ascii): new function for extracting ASCII character.
+
+ * include/ruby/encoding.h (rb_enc_get_ascii): declared.
+
+ * include/ruby/regex.h (ismbchar): removed.
+
+ * re.c (rb_reg_expr_str): use rb_enc_get_ascii.
+ (unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine
+ the termination of escaped non-ASCII character.
+ (unescape_nonascii): use rb_enc_precise_mbclen.
+ (rb_reg_quote): use rb_enc_get_ascii.
+ (rb_reg_regsub): use rb_enc_get_ascii.
+
+ * string.c (rb_str_reverse): don't check the return value of
+ rb_enc_mbclen.
+ (rb_str_split_m): don't call rb_enc_mbclen with e <= p.
+
+ * parse.y (is_identchar): use ISASCII.
+ (parser_ismbchar): removed.
+ (parser_precise_mbclen): new macro.
+ (parser_isascii): new macro.
+ (parser_tokadd_mbchar): use parser_precise_mbclen to check invalid
+ character precisely.
+ (parser_tokadd_string): use parser_isascii.
+ (parser_yylex): ditto.
+ (is_special_global_name): don't call is_identchar with e <= p.
+ (rb_enc_symname_p): ditto.
+
+ [ruby-dev:32455]
+
+ * ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie
+ because the encoding is not UTF-8. [ruby-dev:32475]
+
Fri Dec 7 20:21:35 2007 GOTOU Yuuzou <gotoyuzo@notwork.org>
* ext/openssl/lib/net/ftptls.rb, ext/openssl/lib/net/telnets.rb:
diff --git a/encoding.c b/encoding.c
index 53ceac851..540aa8870 100644
--- a/encoding.c
+++ b/encoding.c
@@ -459,7 +459,6 @@ rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc)
for (c=0; p<e && nth--; c++) {
int n = rb_enc_mbclen(p, e, enc);
- if (n == 0) return 0;
p += n;
}
}
@@ -478,7 +477,6 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
for (c=0; p<e; c++) {
int n = rb_enc_mbclen(p, e, enc);
- if (n == 0) return -1;
p += n;
}
return c;
@@ -487,19 +485,39 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
int
rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
{
- int n = ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
- if (n == 0) {
- rb_raise(rb_eArgError, "invalid mbstring sequence");
- }
- return n;
+ int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
+ if (MBCLEN_CHARFOUND(n))
+ return n;
+ else
+ return 1;
}
int
rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
{
+ if (e <= p)
+ return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
}
+int rb_enc_get_ascii(const char *p, const char *e, rb_encoding *enc)
+{
+ int c, l;
+ if (e <= p)
+ return -1;
+ if (rb_enc_asciicompat(enc)) {
+ c = (unsigned char)*p;
+ return ISASCII(c) ? c : -1;
+ }
+ l = rb_enc_precise_mbclen(p, e, enc);
+ if (!MBCLEN_CHARFOUND(l))
+ return -1;
+ c = rb_enc_codepoint(p, e, enc);
+ if (rb_enc_isascii(c, enc))
+ return c;
+ return -1;
+}
+
int
rb_enc_codelen(int c, rb_encoding *enc)
{
diff --git a/ext/tk/sample/tkextlib/vu/canvSticker2.rb b/ext/tk/sample/tkextlib/vu/canvSticker2.rb
index 21f098a19..f54e74866 100644
--- a/ext/tk/sample/tkextlib/vu/canvSticker2.rb
+++ b/ext/tk/sample/tkextlib/vu/canvSticker2.rb
@@ -1,5 +1,4 @@
#!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
require 'tk'
require 'tkextlib/vu/charts'
diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h
index a800f5427..6924e955a 100644
--- a/include/ruby/encoding.h
+++ b/include/ruby/encoding.h
@@ -77,6 +77,9 @@ int rb_enc_precise_mbclen(const char*, const char *, rb_encoding*);
#define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret)
#define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret)
+/* ptr,endptr,encoding -> 0x00..0x7f, -1 */
+int rb_enc_get_ascii(const char*, const char *, rb_encoding*);
+
/* code,encoding -> codelen */
int rb_enc_codelen(int, rb_encoding*);
diff --git a/include/ruby/regex.h b/include/ruby/regex.h
index b214c63d3..d0a670f28 100644
--- a/include/ruby/regex.h
+++ b/include/ruby/regex.h
@@ -29,7 +29,6 @@ extern "C" {
ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding;
-#define ismbchar(p, e, enc) (mbclen((p),(e),(enc)) != 1)
#define mbclen(p,e,enc) rb_enc_mbclen((p),(e),(enc))
#endif /* ifndef ONIG_RUBY_M17N */
diff --git a/parse.y b/parse.y
index 78cae5a36..0becb9280 100644
--- a/parse.y
+++ b/parse.y
@@ -4583,10 +4583,12 @@ ripper_dispatch_delayed_token(struct parser_params *parser, int t)
#endif
#define parser_mbclen() mbclen((lex_p-1),lex_pend,parser->enc)
-#define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || ismbchar(p,e,enc))
-#define parser_ismbchar() ismbchar((lex_p-1), lex_pend, parser->enc)
+#define parser_precise_mbclen() rb_enc_precise_mbclen((lex_p-1),lex_pend,parser->enc)
+#define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || !ISASCII(*p))
#define parser_is_identchar() (!parser->eofp && is_identchar((lex_p-1),lex_pend,parser->enc))
+#define parser_isascii() ISASCII(*(lex_p-1))
+
static int
parser_yyerror(struct parser_params *parser, const char *msg)
{
@@ -5305,8 +5307,8 @@ dispose_string(VALUE str)
static int
parser_tokadd_mbchar(struct parser_params *parser, int c)
{
- int len = parser_mbclen();
- if (len <= 0 || lex_p + len - 1 > lex_pend) {
+ int len = parser_precise_mbclen();
+ if (!MBCLEN_CHARFOUND(len)) {
compile_error(PARSER_ARG "illegal multibyte char");
return -1;
}
@@ -5414,7 +5416,7 @@ parser_tokadd_string(struct parser_params *parser,
}
}
}
- else if (parser_ismbchar()) {
+ else if (!parser_isascii()) {
has_nonascii = 1;
if (enc != *encp) {
mixed_error(enc, *encp);
@@ -6306,7 +6308,7 @@ parser_yylex(struct parser_params *parser)
}
newtok();
enc = parser->enc;
- if (parser_ismbchar()) {
+ if (!parser_isascii()) {
if (tokadd_mbchar(c) == -1) return 0;
}
else if ((rb_enc_isalnum(c, parser->enc) || c == '_') &&
@@ -6889,7 +6891,7 @@ parser_yylex(struct parser_params *parser)
}
else {
term = nextc();
- if (rb_enc_isalnum(term, parser->enc) || parser_ismbchar()) {
+ if (rb_enc_isalnum(term, parser->enc) || !parser_isascii()) {
yyerror("unknown type of %string");
return 0;
}
@@ -8693,7 +8695,7 @@ is_special_global_name(const char *m, const char *e, rb_encoding *enc)
break;
case '-':
++m;
- if (is_identchar(m, e, enc)) {
+ if (m < e && is_identchar(m, e, enc)) {
if (!ISASCII(*m)) mb = 1;
m += rb_enc_mbclen(m, e, enc);
}
@@ -8776,9 +8778,9 @@ rb_enc_symname_p(const char *name, rb_encoding *enc)
default:
localid = !rb_enc_isupper(*m, enc);
id:
- if (*m != '_' && !rb_enc_isalpha(*m, enc) && !ismbchar(m, e, enc))
+ if (m >= e || (*m != '_' && !rb_enc_isalpha(*m, enc) && ISASCII(*m)))
return Qfalse;
- while (is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc);
+ while (m < e && is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc);
if (localid) {
switch (*m) {
case '!': case '?': case '=': ++m;
diff --git a/re.c b/re.c
index fca7f3a79..c30453591 100644
--- a/re.c
+++ b/re.c
@@ -218,10 +218,12 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
rb_encoding *enc = rb_enc_get(str);
const char *p, *pend;
int need_escape = 0;
+ int c;
p = s; pend = p + len;
while (p<pend) {
- if (*p == '/' || (!rb_enc_isprint(*p, enc) && !ismbchar(p, pend, enc))) {
+ c = rb_enc_get_ascii(p, pend, enc);
+ if (c == '/' || (c != -1 && !rb_enc_isprint(c, enc))) {
need_escape = 1;
break;
}
@@ -233,29 +235,31 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
else {
p = s;
while (p<pend) {
- if (*p == '\\') {
+ c = rb_enc_get_ascii(p, pend, enc);
+ if (c == '\\') {
int n = mbclen(p+1, pend, enc) + 1;
rb_str_buf_cat(str, p, n);
p += n;
continue;
}
- else if (*p == '/') {
+ else if (c == '/') {
char c = '\\';
rb_str_buf_cat(str, &c, 1);
rb_str_buf_cat(str, p, 1);
}
- else if (ismbchar(p, pend, enc)) {
- rb_str_buf_cat(str, p, mbclen(p, pend, enc));
- p += mbclen(p, pend, enc);
+ else if (c == -1) {
+ int l = mbclen(p, pend, enc);
+ rb_str_buf_cat(str, p, l);
+ p += l;
continue;
}
- else if (rb_enc_isprint(*p, enc)) {
+ else if (rb_enc_isprint(c, enc)) {
rb_str_buf_cat(str, p, 1);
}
- else if (!rb_enc_isspace(*p, enc)) {
+ else if (!rb_enc_isspace(c, enc)) {
char b[8];
- sprintf(b, "\\%03o", *p & 0377);
+ sprintf(b, "\\%03o", c);
rb_str_buf_cat(str, b, 4);
}
else {
@@ -1377,6 +1381,7 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
char *chbuf = ALLOCA_N(char, chmaxlen);
int chlen = 0;
int byte;
+ int l;
memset(chbuf, 0, chmaxlen);
@@ -1386,7 +1391,8 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
}
chbuf[chlen++] = byte;
- while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chlen, enc)) {
+ while (chlen < chmaxlen &&
+ MBCLEN_NEEDMORE(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
byte = read_escaped_byte(&p, end, err);
if (byte == -1) {
return -1;
@@ -1394,11 +1400,11 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
chbuf[chlen++] = byte;
}
- if (chlen != mbclen(chbuf, chbuf+chlen, enc)) {
+ l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
+ if (MBCLEN_INVALID(l)) {
strcpy(err, "invalid multibyte escape");
return -1;
}
-
if (1 < chlen || (chbuf[0] & 0x80)) {
rb_str_buf_cat(buf, chbuf, chlen);
@@ -1515,13 +1521,12 @@ unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
char smallbuf[2];
while (p < end) {
- int chlen = mbclen(p, end, enc);
+ int chlen = rb_enc_precise_mbclen(p, end, enc);
+ if (!MBCLEN_CHARFOUND(chlen)) {
+ strcpy(err, "invalid multibyte character");
+ return -1;
+ }
if (1 < chlen || (*p & 0x80)) {
- if (end < p + chlen) {
- strcpy(err, "too short multibyte character");
- return -1;
- }
- /* xxx: validate the non-ascii character */
rb_str_buf_cat(buf, p, chlen);
p += chlen;
if (*encp == 0)
@@ -2093,8 +2098,8 @@ rb_reg_quote(VALUE str)
s = RSTRING_PTR(str);
send = s + RSTRING_LEN(str);
for (; s < send; s++) {
- c = *s;
- if (ismbchar(s, send, enc)) {
+ c = rb_enc_get_ascii(s, send, enc);
+ if (c == -1) {
int n = mbclen(s, send, enc);
while (n-- && s < send)
@@ -2129,8 +2134,8 @@ rb_reg_quote(VALUE str)
t += s - RSTRING_PTR(str);
for (; s < send; s++) {
- c = *s;
- if (ismbchar(s, send, enc)) {
+ c = rb_enc_get_ascii(s, send, enc);
+ if (c == -1) {
int n = mbclen(s, send, enc);
while (n-- && s < send)
@@ -2397,13 +2402,14 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
e = s + RSTRING_LEN(str);
while (s < e) {
+ int c = rb_enc_get_ascii(s, e, enc);
char *ss = s++;
- if (ismbchar(ss, e, enc)) {
+ if (c == -1) {
s += mbclen(ss, e, enc) - 1;
continue;
}
- if (*ss != '\\' || s == e) continue;
+ if (c != '\\' || s == e) continue;
if (!val) {
val = rb_str_buf_new(ss-p);
diff --git a/string.c b/string.c
index 20f2e3823..3811edc89 100644
--- a/string.c
+++ b/string.c
@@ -2725,9 +2725,6 @@ rb_str_reverse(VALUE str)
while (s < e) {
int clen = rb_enc_mbclen(s, e, enc);
- if (clen == 0) {
- rb_raise(rb_eArgError, "invalid mbstring sequence");
- }
p -= clen;
memcpy(p, s, clen);
s += clen;
@@ -4079,7 +4076,10 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
beg = start;
}
else {
- start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
+ if (RSTRING_PTR(str)+start == RSTRING_END(str))
+ start++;
+ else
+ start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
last_null = 1;
continue;
}
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index 02c8dca4d..df59c8877 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -77,8 +77,8 @@ class TestM17N < Test::Unit::TestCase
assert_raise(SyntaxError) { eval('/\xc2/u') }
assert_raise(SyntaxError) { eval('/\xe0\x80/u') }
assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') }
- #assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') }
- #assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
+ assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') }
+ assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
# raw 8bit
assert_raise(SyntaxError) { eval("/\xfe/e") }
@@ -87,7 +87,7 @@ class TestM17N < Test::Unit::TestCase
# invalid suffix
assert_raise(SyntaxError) { eval('/\xc2\xff/u') }
assert_raise(SyntaxError) { eval('/\xc2 /u') }
- #assert_raise(SyntaxError) { eval('/\xc2\x20/u') }
+ assert_raise(SyntaxError) { eval('/\xc2\x20/u') }
end
def assert_regexp_generic_encoding(r)
diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb
index 9cb8c4ac1..84710bb44 100644
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -20,7 +20,7 @@ class TestRegexp < Test::Unit::TestCase
def test_yoshidam_net_20041111_2
assert_raise(RegexpError) do
- s = "[\xFF-\xFF]"
+ s = "[\xFF-\xFF]".force_encoding("utf-8")
Regexp.new(s, nil, "u")
end
end
@@ -42,8 +42,8 @@ class TestRegexp < Test::Unit::TestCase
assert_equal :ok, begin
Regexp.union(
"a",
- Regexp.new("\x80".force_encoding("euc-jp")),
- Regexp.new("\x80".force_encoding("utf-8")))
+ Regexp.new("\xc2\xa1".force_encoding("euc-jp")),
+ Regexp.new("\xc2\xa1".force_encoding("utf-8")))
:ng
rescue ArgumentError
:ok