summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: yugui <yugui@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2009-11-27 02:54:10 +0000
committer: yugui <yugui@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2009-11-27 02:54:10 +0000
commit: c6a33f0841a8aec6503241b88aa15001b1ad9b67 (patch)
tree: b17e74ddeec3165305798e9657938aaf13714f6c
parent: 2f1d6e714a7478e9c4af4452e35a27b9a3f0c333 (diff)
download: ruby-c6a33f0841a8aec6503241b88aa15001b1ad9b67.tar.gz
          ruby-c6a33f0841a8aec6503241b88aa15001b1ad9b67.tar.xz
          ruby-c6a33f0841a8aec6503241b88aa15001b1ad9b67.zip
merges r24544 from trunk into ruby_1_9_1.
--
\d, \s and \w are now non Unicode class. [ruby-dev:39026]

* include/ruby/oniguruma.h
  (ONIGENC_CTYPE_SPECIAL_MASK): added.
  (ONIGENC_CTYPE_D): ditto.
  (ONIGENC_CTYPE_S): ditto.
  (ONIGENC_CTYPE_W): ditto.

* regparse.c: \d, \s and \w are now non Unicode class. [ruby-dev:39026]
  (fetch_token_in_cc): use ONIGENC_CTYPE_[DSW] for \d/\s/\w.
  (fetch_token): ditto.
  (add_ctype_to_cc): add routines for ONIGENC_CTYPE_[DSW].
  (parse_exp): ditto.

* test/ruby/test_regexp.rb (TestRegexp#test_char_class): add tests for above.

git-svn-id: http://svn.ruby-lang.org/repos/ruby/branches/ruby_1_9_1@25941 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r--  ChangeLog                 18
-rw-r--r--  include/ruby/oniguruma.h   8
-rw-r--r--  regparse.c                59
-rw-r--r--  test/ruby/test_regexp.rb   7
-rw-r--r--  version.h                  2
5 files changed, 81 insertions, 13 deletions
diff --git a/ChangeLog b/ChangeLog
index 1bcb1a0f4..9db3b449a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+Sun Aug 16 00:30:33 2009 NARUSE, Yui <naruse@ruby-lang.org>
+
+ * include/ruby/oniguruma.h
+ (ONIGENC_CTYPE_SPECIAL_MASK): added.
+ (ONIGENC_CTYPE_D): ditto.
+ (ONIGENC_CTYPE_S): ditto.
+ (ONIGENC_CTYPE_W): ditto.
+
+ * regparse.c: \d, \s and \w are now non Unicode class.
+ [ruby-dev:39026]
+ (fetch_token_in_cc): use ONIGENC_CTYPE_[DSW] for \d/\s/\w.
+ (fetch_token): ditto.
+ (add_ctype_to_cc): add routines for ONIGENC_CTYPE_[DSW].
+ (parse_exp): ditto.
+
+ * test/ruby/test_regexp.rb (TestRegexp#test_char_class):
+ add tests for above.
+
Fri Sep 18 16:15:04 2009 Nobuyoshi Nakada <nobu@ruby-lang.org>
* compile.c (iseq_compile_each), parse.y (stmt, arg): arg_concat()
diff --git a/include/ruby/oniguruma.h b/include/ruby/oniguruma.h
index bc6905eda..6f8ce23c5 100644
--- a/include/ruby/oniguruma.h
+++ b/include/ruby/oniguruma.h
@@ -200,6 +200,14 @@ ONIG_EXTERN OnigEncodingType OnigEncodingASCII;
#define ONIGENC_CTYPE_ALNUM 13 /* alpha || digit */
#define ONIGENC_CTYPE_ASCII 14
#define ONIGENC_MAX_STD_CTYPE ONIGENC_CTYPE_ASCII
+#define ONIGENC_CTYPE_SPECIAL_MASK 128
+#define ONIGENC_CTYPE_S /* [\t\n\v\f\r\s] */ \
+ ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_SPACE
+#define ONIGENC_CTYPE_D /* [0-9] */ \
+ ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_DIGIT
+#define ONIGENC_CTYPE_W /* [0-9A-Za-z_] */ \
+ ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_WORD
+#define ONIGENC_CTYPE_SPECIAL_P(ctype) ((ctype) & ONIGENC_CTYPE_SPECIAL_MASK)
#define onig_enc_len(enc,p,e) ONIGENC_MBC_ENC_LEN(enc, p, e)
diff --git a/regparse.c b/regparse.c
index 1fc0459fa..75ad24fed 100644
--- a/regparse.c
+++ b/regparse.c
@@ -2946,32 +2946,32 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
switch (c) {
case 'w':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
+ tok->u.prop.ctype = ONIGENC_CTYPE_W;
tok->u.prop.not = 0;
break;
case 'W':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
+ tok->u.prop.ctype = ONIGENC_CTYPE_W;
tok->u.prop.not = 1;
break;
case 'd':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
+ tok->u.prop.ctype = ONIGENC_CTYPE_D;
tok->u.prop.not = 0;
break;
case 'D':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
+ tok->u.prop.ctype = ONIGENC_CTYPE_D;
tok->u.prop.not = 1;
break;
case 's':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
+ tok->u.prop.ctype = ONIGENC_CTYPE_S;
tok->u.prop.not = 0;
break;
case 'S':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
+ tok->u.prop.ctype = ONIGENC_CTYPE_S;
tok->u.prop.not = 1;
break;
case 'h':
@@ -3233,14 +3233,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 'w':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
+ tok->u.prop.ctype = ONIGENC_CTYPE_W;
tok->u.prop.not = 0;
break;
case 'W':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
+ tok->u.prop.ctype = ONIGENC_CTYPE_W;
tok->u.prop.not = 1;
break;
@@ -3273,28 +3273,28 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 's':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
+ tok->u.prop.ctype = ONIGENC_CTYPE_S;
tok->u.prop.not = 0;
break;
case 'S':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
+ tok->u.prop.ctype = ONIGENC_CTYPE_S;
tok->u.prop.not = 1;
break;
case 'd':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
+ tok->u.prop.ctype = ONIGENC_CTYPE_D;
tok->u.prop.not = 0;
break;
case 'D':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
+ tok->u.prop.ctype = ONIGENC_CTYPE_D;
tok->u.prop.not = 1;
break;
@@ -3835,6 +3835,28 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
OnigCodePoint sb_out;
OnigEncoding enc = env->enc;
+ switch (ctype) {
+ case ONIGENC_CTYPE_D:
+ case ONIGENC_CTYPE_S:
+ case ONIGENC_CTYPE_W:
+ ctype ^= ONIGENC_CTYPE_SPECIAL_MASK;
+ if (not != 0) {
+ for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
+ if (! ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
+ BITSET_SET_BIT(cc->bs, c);
+ }
+ ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
+ }
+ else {
+ for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
+ if (ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
+ BITSET_SET_BIT(cc->bs, c);
+ }
+ }
+ return 0;
+ break;
+ }
+
r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
if (r == 0) {
return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
@@ -5186,6 +5208,19 @@ parse_exp(Node** np, OnigToken* tok, int term,
case TK_CHAR_TYPE:
{
switch (tok->u.prop.ctype) {
+ case ONIGENC_CTYPE_D:
+ case ONIGENC_CTYPE_S:
+ case ONIGENC_CTYPE_W:
+ {
+ CClassNode* cc;
+ *np = node_new_cclass();
+ CHECK_NULL_RETURN_MEMERR(*np);
+ cc = NCCLASS(*np);
+ add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
+ if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
+ }
+ break;
+
case ONIGENC_CTYPE_WORD:
*np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
CHECK_NULL_RETURN_MEMERR(*np);
diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb
index 82e18f21e..113246fef 100644
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -659,6 +659,13 @@ class TestRegexp < Test::Unit::TestCase
check(/\A[[^b-c]&&[^e]&&a-f]\z/, %w(a d f), %w(b c e g 0))
check(/\A[\n\r\t]\z/, ["\n", "\r", "\t"])
failcheck('[9-1]')
+
+ assert_match(/\A\d+\z/, "0123456789")
+ assert_no_match(/\d/, "\uff10\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19")
+ assert_match(/\A\w+\z/, "09azAZ_")
+ assert_no_match(/\w/, "\uff10\uff19\uff41\uff5a\uff21\uff3a")
+ assert_match(/\A\s+\z/, "\r\n\v\f\r\s")
+ assert_no_match(/\s/, "\u0085")
end
def test_posix_bracket
diff --git a/version.h b/version.h
index 288ad26dd..1642f930f 100644
--- a/version.h
+++ b/version.h
@@ -1,5 +1,5 @@
#define RUBY_VERSION "1.9.1"
-#define RUBY_PATCHLEVEL 342
+#define RUBY_PATCHLEVEL 343
#define RUBY_VERSION_MAJOR 1
#define RUBY_VERSION_MINOR 9
#define RUBY_VERSION_TEENY 1