diff options
author | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-09-05 20:24:18 +0000 |
---|---|---|
committer | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-09-05 20:24:18 +0000 |
commit | c31da5f9bbe27057ef04947e8b59f0557e1ab78d (patch) | |
tree | aac107a1766118db9503c3c9ee7324580e3912f0 | |
parent | 615b20dee898e404ef551d1965fd216e98d73177 (diff) | |
download | ruby-c31da5f9bbe27057ef04947e8b59f0557e1ab78d.tar.gz ruby-c31da5f9bbe27057ef04947e8b59f0557e1ab78d.tar.xz ruby-c31da5f9bbe27057ef04947e8b59f0557e1ab78d.zip |
* include/ruby/encoding.h (ECONV_UNDEF_HEX_CHARREF): defined.
* transcode.c (output_hex_charref): new function.
(rb_econv_convert): call output_hex_charref if
ECONV_UNDEF_HEX_CHARREF.
(Init_transcode): Encoding::Converter::UNDEF_HEX_CHARREF added.
git-svn-id: http://svn.ruby-lang.org/repos/ruby/trunk@19162 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ChangeLog | 9 | ||||
-rw-r--r-- | include/ruby/encoding.h | 1 | ||||
-rw-r--r-- | test/ruby/test_econv.rb | 15 | ||||
-rw-r--r-- | transcode.c | 69 |
4 files changed, 94 insertions, 0 deletions
@@ -1,3 +1,12 @@ +Sat Sep 6 05:22:29 2008 Tanaka Akira <akr@fsij.org> + + * include/ruby/encoding.h (ECONV_UNDEF_HEX_CHARREF): defined. + + * transcode.c (output_hex_charref): new function. + (rb_econv_convert): call output_hex_charref if + ECONV_UNDEF_HEX_CHARREF. + (Init_transcode): Encoding::Converter::UNDEF_HEX_CHARREF added. + Sat Sep 6 03:52:47 2008 Tanaka Akira <akr@fsij.org> * transcode.c (rb_econv_convert): use ECONV_INVALID_MASK and diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index ae1ec13e7..ecc00aff5 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -255,6 +255,7 @@ void rb_econv_binmode(rb_econv_t *ec); #define ECONV_UNDEF_MASK 0x00f0 #define ECONV_UNDEF_REPLACE 0x0020 +#define ECONV_UNDEF_HEX_CHARREF 0x0030 /* effective only if output is ascii compatible */ #define ECONV_UNIVERSAL_NEWLINE_DECODER 0x0100 diff --git a/test/ruby/test_econv.rb b/test/ruby/test_econv.rb index c7f270c52..29ae65b5a 100644 --- a/test/ruby/test_econv.rb +++ b/test/ruby/test_econv.rb @@ -670,4 +670,19 @@ class TestEncodingConverter < Test::Unit::TestCase ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace, :replace => "X") assert_equal("a X b", ec.convert("a \u3042 b")) end + + def test_hex_charref + ec = Encoding::Converter.new("UTF-8", "US-ASCII", Encoding::Converter::UNDEF_HEX_CHARREF) + assert_equal("&#x3042;", ec.convert("\u3042")) + + ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNDEF_HEX_CHARREF) + assert_equal("\xa4\xcf\xa4\xa1\xa4\xa4&#x2665;\xa1\xa3".force_encoding("euc-jp"), + ec.convert("\u{306f 3041 3044 2665 3002}")) + + ec = Encoding::Converter.new("UTF-8", "ISO-2022-JP", Encoding::Converter::UNDEF_HEX_CHARREF) + assert_equal("\e$B$O$!$$\e(B&#x2665;\e$B!#".force_encoding("ISO-2022-JP"), + ec.convert("\u{306f 3041 3044 2665 3002}")) + assert_equal("\e(B".force_encoding("ISO-2022-JP"), + ec.finish) + end end diff --git a/transcode.c b/transcode.c index 766425125..5318c564e 100644 --- a/transcode.c +++ 
b/transcode.c @@ -34,6 +34,12 @@ static VALUE sym_finished; static VALUE sym_output_followed_by_input; static VALUE sym_incomplete_input; +static unsigned char * +allocate_converted_string(const char *sname, const char *dname, + const unsigned char *str, size_t len, + unsigned char *caller_dst_buf, size_t caller_dst_bufsize, + size_t *dst_len_ptr); + /* dynamic structure, one per conversion (similar to iconv_t) */ /* may carry conversion state (e.g. for iso-2022-jp) */ typedef struct rb_transcoding { @@ -1261,6 +1267,62 @@ rb_econv_convert0(rb_econv_t *ec, static int output_replacement_character(rb_econv_t *ec); +static int +output_hex_charref(rb_econv_t *ec) +{ + int ret; + unsigned char utfbuf[1024]; + const unsigned char *utf; + size_t utf_len; + int utf_allocated = 0; + char charef_buf[16]; + const unsigned char *p; + + if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) { + utf = ec->last_error.error_bytes_start; + utf_len = ec->last_error.error_bytes_len; + } + else { + utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE", + ec->last_error.error_bytes_start, ec->last_error.error_bytes_len, + utfbuf, sizeof(utfbuf), + &utf_len); + if (!utf) + return -1; + if (utf != utfbuf && utf != ec->last_error.error_bytes_start) + utf_allocated = 1; + } + + if (utf_len % 4 != 0) + goto fail; + + p = utf; + while (4 <= utf_len) { + unsigned int u = 0; + u += p[0] << 24; + u += p[1] << 16; + u += p[2] << 8; + u += p[3]; + snprintf(charef_buf, sizeof(charef_buf), "&#x%x;", u); + + ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII"); + if (ret == -1) + goto fail; + + p += 4; + utf_len -= 4; + } + + if (utf_allocated) + xfree((void *)utf); + return 0; + + fail: + if (utf_allocated) + xfree((void *)utf); + return -1; +} + rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, @@ -1305,6 +1367,11 @@ rb_econv_convert(rb_econv_t *ec, if 
(output_replacement_character(ec) == 0) goto resume; break; + + case ECONV_UNDEF_HEX_CHARREF: + if (output_hex_charref(ec) == 0) + goto resume; + break; } } @@ -3424,10 +3491,12 @@ Init_transcode(void) rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0); rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0); rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1); + rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK)); rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE)); rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK)); rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE)); + rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF)); rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT)); rb_define_const(rb_cEncodingConverter, "OUTPUT_FOLLOWED_BY_INPUT", INT2FIX(ECONV_OUTPUT_FOLLOWED_BY_INPUT)); rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECODER", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECODER)); |