diff options
author | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-12-11 03:08:50 +0000 |
---|---|---|
committer | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-12-11 03:08:50 +0000 |
commit | 257d79ae2d955b7fc2153a5efcb9cd5c2505cb2a (patch) | |
tree | 04a037e87e0e86eac5a46d8582c77836767f53d2 | |
parent | 10d13f89c7302cdb77349f6bdfa1d8c2576b6179 (diff) | |
download | ruby-257d79ae2d955b7fc2153a5efcb9cd5c2505cb2a.tar.gz ruby-257d79ae2d955b7fc2153a5efcb9cd5c2505cb2a.tar.xz ruby-257d79ae2d955b7fc2153a5efcb9cd5c2505cb2a.zip |
* encoding.c (rb_enc_get_ascii): add an argument to provide the
  length of the returned character.
* include/ruby/encoding.h (rb_enc_get_ascii): add the argument.
* re.c (rb_reg_expr_str): modify rb_enc_get_ascii call.
  (rb_reg_quote): ditto.
  (rb_reg_regsub): ditto.

git-svn-id: http://svn.ruby-lang.org/repos/ruby/trunk@14190 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ChangeLog | 11 |
-rw-r--r-- | encoding.c | 14 |
-rw-r--r-- | include/ruby/encoding.h | 8 |
-rw-r--r-- | re.c | 126 |
4 files changed, 90 insertions, 69 deletions
@@ -1,3 +1,14 @@ +Tue Dec 11 12:05:51 2007 Tanaka Akira <akr@fsij.org> + + * encoding.c (rb_enc_get_ascii): add an argument to provide the + length of the returned character. + + * include/ruby/encoding.h (rb_enc_get_ascii): add the argument. + + * re.c (rb_reg_expr_str): modify rb_enc_get_ascii call. + (rb_reg_quote): ditto. + (rb_reg_regsub): ditto. + Tue Dec 11 09:40:21 2007 Tanaka Akira <akr@fsij.org> * include/ruby/oniguruma.h (ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE): diff --git a/encoding.c b/encoding.c index 21b56f263..5949b17bf 100644 --- a/encoding.c +++ b/encoding.c @@ -505,22 +505,26 @@ rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) return n; } -int rb_enc_get_ascii(const char *p, const char *e, rb_encoding *enc) +int rb_enc_get_ascii(const char *p, const char *e, int *len, rb_encoding *enc) { int c, l; if (e <= p) return -1; if (rb_enc_asciicompat(enc)) { c = (unsigned char)*p; - return ISASCII(c) ? c : -1; + if (!ISASCII(c)) + return -1; + if (len) *len = 1; + return c; } l = rb_enc_precise_mbclen(p, e, enc); if (!MBCLEN_CHARFOUND(l)) return -1; c = rb_enc_codepoint(p, e, enc); - if (rb_enc_isascii(c, enc)) - return c; - return -1; + if (!rb_enc_isascii(c, enc)) + return -1; + if (len) *len = l; + return c; } int diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 38ba031e6..dd30e66df 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -72,14 +72,14 @@ rb_encoding * rb_enc_find(const char *name); /* ptr,endptr,encoding -> mbclen */ int rb_enc_mbclen(const char*, const char *, rb_encoding*); -/* ptr,endptr,encoding -> chlen, invalid or needmore */ -int rb_enc_precise_mbclen(const char*, const char *, rb_encoding*); +/* -> chlen, invalid or needmore */ +int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc); #define MBCLEN_CHARFOUND(ret) ONIGENC_MBCLEN_CHARFOUND(ret) #define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret) #define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret) -/* 
ptr,endptr,encoding -> 0x00..0x7f, -1 */ -int rb_enc_get_ascii(const char*, const char *, rb_encoding*); +/* -> 0x00..0x7f, -1 */ +int rb_enc_get_ascii(const char *p, const char *e, int *len, rb_encoding *enc); /* code,encoding -> codelen */ int rb_enc_codelen(int, rb_encoding*); @@ -218,16 +218,21 @@ rb_reg_expr_str(VALUE str, const char *s, long len) rb_encoding *enc = rb_enc_get(str); const char *p, *pend; int need_escape = 0; - int c; + int c, clen; p = s; pend = p + len; while (p<pend) { - c = rb_enc_get_ascii(p, pend, enc); - if (c == '/' || (c != -1 && !rb_enc_isprint(c, enc))) { + c = rb_enc_get_ascii(p, pend, &clen, enc); + if (c == -1) { + p += mbclen(p, pend, enc); + } + else if (c != '/' && rb_enc_isprint(c, enc)) { + p += clen; + } + else { need_escape = 1; break; - } - p += mbclen(p, pend, enc); + } } if (!need_escape) { rb_str_buf_cat(str, s, len); @@ -235,9 +240,9 @@ rb_reg_expr_str(VALUE str, const char *s, long len) else { p = s; while (p<pend) { - c = rb_enc_get_ascii(p, pend, enc); - if (c == '\\') { - int n = mbclen(p+1, pend, enc) + 1; + c = rb_enc_get_ascii(p, pend, &clen, enc); + if (c == '\\' && p+clen < pend) { + int n = clen + mbclen(p+clen, pend, enc); rb_str_buf_cat(str, p, n); p += n; continue; @@ -245,7 +250,7 @@ rb_reg_expr_str(VALUE str, const char *s, long len) else if (c == '/') { char c = '\\'; rb_str_buf_cat(str, &c, 1); - rb_str_buf_cat(str, p, 1); + rb_str_buf_cat(str, p, clen); } else if (c == -1) { int l = mbclen(p, pend, enc); @@ -254,7 +259,7 @@ rb_reg_expr_str(VALUE str, const char *s, long len) continue; } else if (rb_enc_isprint(c, enc)) { - rb_str_buf_cat(str, p, 1); + rb_str_buf_cat(str, p, clen); } else if (!rb_enc_isspace(c, enc)) { char b[8]; @@ -263,9 +268,9 @@ rb_reg_expr_str(VALUE str, const char *s, long len) rb_str_buf_cat(str, b, 4); } else { - rb_str_buf_cat(str, p, 1); + rb_str_buf_cat(str, p, clen); } - p++; + p += clen; } } } @@ -2376,19 +2381,15 @@ rb_reg_quote(VALUE str) rb_encoding *enc = 
rb_enc_get(str); char *s, *send, *t; VALUE tmp; - int c; + int c, clen; int ascii_only = rb_enc_str_asciionly_p(str); s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); - for (; s < send; s++) { - c = rb_enc_get_ascii(s, send, enc); + while (s < send) { + c = rb_enc_get_ascii(s, send, &clen, enc); if (c == -1) { - int n = mbclen(s, send, enc); - - while (n-- && s < send) - s++; - s--; + s += mbclen(s, send, enc); continue; } switch (c) { @@ -2400,6 +2401,7 @@ rb_reg_quote(VALUE str) case '\t': case '\f': case '\v': case '\n': case '\r': goto meta_found; } + s += clen; } if (ascii_only && rb_enc_get_index(str) != 0) { str = rb_str_new3(str); @@ -2417,16 +2419,16 @@ rb_reg_quote(VALUE str) memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str)); t += s - RSTRING_PTR(str); - for (; s < send; s++) { - c = rb_enc_get_ascii(s, send, enc); + while (s < send) { + c = rb_enc_get_ascii(s, send, &clen, enc); if (c == -1) { int n = mbclen(s, send, enc); - while (n-- && s < send) + while (n--) *t++ = *s++; - s--; continue; } + s += clen; switch (c) { case '[': case ']': case '{': case '}': case '(': case ')': case '|': case '-': @@ -2684,8 +2686,7 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) { VALUE val = 0; char *p, *s, *e; - unsigned char uc; - int no; + int no, clen; rb_encoding *enc = rb_enc_check(str, src); rb_enc_check(str, regexp); @@ -2693,56 +2694,64 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) e = s + RSTRING_LEN(str); while (s < e) { - int c = rb_enc_get_ascii(s, e, enc); - char *ss = s++; + int c = rb_enc_get_ascii(s, e, &clen, enc); + char *ss; if (c == -1) { - s += mbclen(ss, e, enc) - 1; + s += mbclen(s, e, enc); continue; } + ss = s; + s += clen; + if (c != '\\' || s == e) continue; if (!val) { val = rb_str_buf_new(ss-p); - rb_str_buf_cat(val, p, ss-p); - } - else { - rb_str_buf_cat(val, p, ss-p); } + rb_str_buf_cat(val, p, ss-p); + + c = rb_enc_get_ascii(s, e, &clen, enc); + if (c == -1) { + s += 
mbclen(s, e, enc); + rb_str_buf_cat(val, ss, s-ss); + continue; + } + s += clen; - uc = (unsigned char)*s++; p = s; - switch (uc) { + switch (c) { case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) { - no = uc - '0'; + no = c - '0'; } else { - continue; + continue; } break; case 'k': - if (s < e && *s == '<') { - char *name, *name_end; - - name_end = name = s + 1; - while (name_end < e) { - if (*name_end == '>') break; - name_end += mbclen(name_end, e, enc); - } - if (name_end < e) { - no = name_to_backref_number(regs, regexp, name, name_end); - p = s = name_end + 1; - break; - } - else { - rb_raise(rb_eRuntimeError, "invalid group name reference format"); - } + if (s < e && rb_enc_get_ascii(s, e, &clen, enc) == '<') { + char *name, *name_end; + + name_end = name = s + clen; + while (name_end < e) { + c = rb_enc_get_ascii(name_end, e, &clen, enc); + if (c == '>') break; + name_end += c == -1 ? mbclen(name_end, e, enc) : clen; + } + if (name_end < e) { + no = name_to_backref_number(regs, regexp, name, name_end); + p = s = name_end + clen; + break; + } + else { + rb_raise(rb_eRuntimeError, "invalid group name reference format"); + } } - rb_str_buf_cat(val, s-2, 2); + rb_str_buf_cat(val, ss, s-ss); continue; case '0': @@ -2765,11 +2774,11 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) break; case '\\': - rb_str_buf_cat(val, s-1, 1); + rb_str_buf_cat(val, s-clen, clen); continue; default: - rb_str_buf_cat(val, s-2, 2); + rb_str_buf_cat(val, ss, s-ss); continue; } @@ -2783,11 +2792,8 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) if (p < e) { if (!val) { val = rb_str_buf_new(e-p); - rb_str_buf_cat(val, p, e-p); - } - else { - rb_str_buf_cat(val, p, e-p); } + rb_str_buf_cat(val, p, e-p); } if (!val) return str; |