diff options
-rw-r--r-- | ChangeLog | 18 | ||||
-rw-r--r-- | encoding.c | 12 | ||||
-rw-r--r-- | include/ruby/encoding.h | 12 | ||||
-rw-r--r-- | parse.y | 9 | ||||
-rw-r--r-- | string.c | 89 |
5 files changed, 120 insertions, 20 deletions
@@ -1,3 +1,21 @@ +Thu Sep 27 04:46:31 2007 Nobuyoshi Nakada <nobu@ruby-lang.org> + + * encoding.c (rb_enc_associate_index): deal with ASCII compatible + flags. + + * encoding.c (rb_enc_check): allow ASCII compatible strings. + + * parse.y (rb_intern_str): use ASCII encoding for ASCII string. + + * string.c (rb_enc_str_coderange): check for code-range. + + * string.c (rb_str_modify): clear code-range flags. + + * string.c (rb_str_hash, rb_str_eql): ASCII compatible strings are + comparable. + + * include/ruby/encoding.h: added code-range flags. + Thu Sep 27 04:40:47 2007 Nobuyoshi Nakada <nobu@ruby-lang.org> * gc.c (rb_mark_set): new function to mark keys. diff --git a/encoding.c b/encoding.c index a6ee890ce..bb9e77b5b 100644 --- a/encoding.c +++ b/encoding.c @@ -122,6 +122,10 @@ void rb_enc_associate_index(VALUE obj, int idx) { enc_check_capable(obj); + if (!ENC_CODERANGE_ASCIIONLY(obj) || + !rb_enc_asciicompat(rb_enc_from_index(idx))) { + ENC_CODERANGE_CLEAR(obj); + } if (idx < ENCODING_INLINE_MAX) { ENCODING_SET(obj, idx); return; @@ -204,6 +208,14 @@ rb_enc_check(VALUE str1, VALUE str2) return enc; } } + if (BUILTIN_TYPE(str1) == T_STRING && + BUILTIN_TYPE(str2) == T_STRING && + rb_enc_asciicompat(rb_enc_from_index(idx1)) && + rb_enc_asciicompat(rb_enc_from_index(idx2)) && + rb_enc_str_coderange(str1) == ENC_CODERANGE_SINGLE && + rb_enc_str_coderange(str2) == ENC_CODERANGE_SINGLE) { + return ONIG_ENCODING_ASCII; + } rb_raise(rb_eArgError, "character encodings differ"); } diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 1b97b6ab6..562f4b14c 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -24,6 +24,17 @@ } while (0) #define ENCODING_GET(obj) ((RBASIC(obj)->flags & ENCODING_MASK)>>ENCODING_SHIFT) +#define ENC_CODERANGE_MASK (FL_USER12|FL_USER13) +#define ENC_CODERANGE_UNKNOWN 0 +#define ENC_CODERANGE_SINGLE FL_USER12 +#define ENC_CODERANGE_MULTI FL_USER13 +#define ENC_CODERANGE_BROKEN (FL_USER12|FL_USER13) +#define 
ENC_CODERANGE(obj) (RBASIC(obj)->flags & ENC_CODERANGE_MASK) +#define ENC_CODERANGE_ASCIIONLY(obj) (ENC_CODERANGE(obj) == ENC_CODERANGE_SINGLE) +#define ENC_CODERANGE_SET(obj,cr) (RBASIC(obj)->flags &= ~ENC_CODERANGE_MASK | (cr)) +#define ENC_CODERANGE_CLEAR(obj) ENC_CODERANGE_SET(obj,0) + + typedef OnigEncodingType rb_encoding; int rb_enc_to_index(rb_encoding*); @@ -80,5 +91,6 @@ int rb_enc_toupper(int c, rb_encoding *enc); int rb_enc_tolower(int c, rb_encoding *enc); ID rb_intern3(const char*, long, rb_encoding*); int rb_enc_symname_p(const char*, rb_encoding*); +int rb_enc_str_coderange(VALUE); #endif /* RUBY_ENCODING_H */ @@ -8548,7 +8548,14 @@ rb_intern(const char *name) ID rb_intern_str(VALUE str) { - ID id = rb_intern3(RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str)); + int idx = 0; + ID id; + + if (rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) { + idx = rb_enc_get_index(str); + } + id = rb_intern3(RSTRING_PTR(str), RSTRING_LEN(str), + rb_enc_from_index(idx)); RB_GC_GUARD(str); return id; } @@ -92,8 +92,37 @@ VALUE rb_cSymbol; }\ } while (0) +#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_SINGLE) + VALUE rb_fs; +int +rb_enc_str_coderange(VALUE str) +{ + long i; + int cr = ENC_CODERANGE(str); + + if (cr == ENC_CODERANGE_UNKNOWN) { + cr = ENC_CODERANGE_SINGLE; + for (i = 0; i < RSTRING_LEN(str); ++i) { + const char *p = &RSTRING_PTR(str)[i]; + int c = (unsigned char)*p; + + if (!ISASCII(c)) { + c = rb_enc_codepoint(p, RSTRING_END(str), rb_enc_get(str)); + if (c == -1) { + cr = ENC_CODERANGE_BROKEN; + } + else { + cr = ENC_CODERANGE_MULTI; + } + } + } + ENC_CODERANGE_SET(str, cr); + } + return cr; +} + static inline void str_mod_check(VALUE s, char *p, long len) { @@ -553,8 +582,8 @@ rb_str_format_m(VALUE str, VALUE arg) return rb_str_format(1, &arg, str); } -static int -str_independent(VALUE str) +static void +str_modifiable(VALUE str) { if (FL_TEST(str, STR_TMPLOCK)) { rb_raise(rb_eRuntimeError, "can't modify string; 
temporarily locked"); @@ -562,6 +591,12 @@ str_independent(VALUE str) if (OBJ_FROZEN(str)) rb_error_frozen("string"); if (!OBJ_TAINTED(str) && rb_safe_level() >= 4) rb_raise(rb_eSecurityError, "Insecure: can't modify string"); +} + +static int +str_independent(VALUE str) +{ + str_modifiable(str); if (!STR_SHARED_P(str)) return 1; if (STR_EMBED_P(str)) return 1; return 0; @@ -589,6 +624,7 @@ rb_str_modify(VALUE str) { if (!str_independent(str)) str_make_independent(str); + ENC_CODERANGE_CLEAR(str); } void @@ -1129,8 +1165,12 @@ rb_memhash(const void *ptr, long len) int rb_str_hash(VALUE str) { + int e = rb_enc_get_index(str); + if (e && is_ascii_string(str)) { + e = 0; + } return hash((const void *)RSTRING_PTR(str), RSTRING_LEN(str), - rb_enc_get_index(str)); + e); } /* @@ -1149,18 +1189,6 @@ rb_str_hash_m(VALUE str) #define lesser(a,b) (((a)>(b))?(b):(a)) -static int -is_ascii_string(VALUE str) -{ - long i; - - for (i = 0; i < RSTRING_LEN(str); ++i) { - int c = (unsigned char)RSTRING_PTR(str)[i]; - if (!ISASCII(c)) return Qfalse; - } - return Qtrue; -} - int rb_str_comparable(VALUE str1, VALUE str2) { @@ -1234,8 +1262,7 @@ rb_str_eql(VALUE str1, VALUE str2) if (TYPE(str2) != T_STRING || RSTRING_LEN(str1) != RSTRING_LEN(str2)) return Qfalse; - if (rb_enc_get_index(str1) != rb_enc_get_index(str2)) - return Qfalse; + if (!rb_str_comparable(str1, str2)) return Qfalse; if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), lesser(RSTRING_LEN(str1), RSTRING_LEN(str2))) == 0) @@ -3529,7 +3556,7 @@ tr_setup_table(VALUE str, VALUE *tablep, VALUE *ctablep, rb_encoding *enc) static VALUE rb_str_delete_bang(int argc, VALUE *argv, VALUE str) { - rb_encoding *enc; + rb_encoding *enc = 0; char *s, *send, *t; VALUE del = 0, nodel = 0; int modify = 0; @@ -3736,7 +3763,7 @@ rb_str_tr_s(VALUE str, VALUE src, VALUE repl) static VALUE rb_str_count(int argc, VALUE *argv, VALUE str) { - rb_encoding *enc; + rb_encoding *enc = 0; VALUE del = 0, nodel = 0; char *s, *send; int i; @@ -5065,12 
+5092,35 @@ rb_str_setter(VALUE val, ID id, VALUE *var) } +/* + * call-seq: + * str.encoding => str + * + * Returns the encoding name. + */ + static VALUE str_encoding(VALUE str) { return rb_str_new2(rb_enc_name(rb_enc_get(str))); } + +/* + * call-seq: + * str.associate_encoding(encoding) => str + * + * Changes the encoding to +encoding+ and returns self. + */ + +static VALUE +rb_str_associate_encoding(VALUE str, VALUE encname) +{ + str_modifiable(str); + rb_enc_associate(str, rb_enc_find(StringValueCStr(encname))); + return str; +} + /********************************************************************** * Document-class: Symbol * @@ -5482,6 +5532,7 @@ Init_String(void) rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1); rb_define_method(rb_cString, "encoding", str_encoding, 0); + rb_define_method(rb_cString, "associate_encoding", rb_str_associate_encoding, 1); id_to_s = rb_intern("to_s"); |