diff options
author | duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-01-20 06:12:48 +0000 |
---|---|---|
committer | duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-01-20 06:12:48 +0000 |
commit | 55c726aa8a711fc211e9affceaaf6f993ed17d5d (patch) | |
tree | 049a03fa66fe12d5409b58e01596aa121402d390 /enc | |
parent | 0ca9c123aa42b5d7a597bbd2639001c620e3f2bb (diff) | |
download | ruby-55c726aa8a711fc211e9affceaaf6f993ed17d5d.tar.gz ruby-55c726aa8a711fc211e9affceaaf6f993ed17d5d.tar.xz ruby-55c726aa8a711fc211e9affceaaf6f993ed17d5d.zip |
Sun Jan 20 15:08:08 2008 Martin Duerst <duerst@it.aoyama.ac.jp>
* enc/trans/utf_16_32.c: new file, currently implementing
UTF-16BE conversions only.
* test/ruby/test_transcode.rb: Added tests for UTF-16BE;
made check_both_ways() use force_encoding differently.
* transcode_data.h, transcode.c: Support for more conversion
functions.
git-svn-id: http://svn.ruby-lang.org/repos/ruby/trunk@15142 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'enc')
-rw-r--r-- | enc/trans/utf_16_32.c | 332 |
1 files changed, 332 insertions, 0 deletions
diff --git a/enc/trans/utf_16_32.c b/enc/trans/utf_16_32.c new file mode 100644 index 000000000..c0f5a7ab7 --- /dev/null +++ b/enc/trans/utf_16_32.c @@ -0,0 +1,332 @@ +/* Autogenerated, do not change */ +/* Report bugs to Martin Duerst (duerst@it.aoyama.ac.jp) */ + +#include "transcode_data.h" + +static int +fun_so_from_utf_16be(const unsigned char* s, unsigned char* o) +{ + if (!s[0] && s[1]<0x80) { + o[0] = s[1]; + return 1; + } + else if (s[0]<0x08) { + o[0] = 0xC0 | (s[0]<<2) | (s[1]>>6); + o[1] = 0x80 | s[1]&0x3F; + return 2; + } + else if ((s[0]&0xF8)!=0xD8) { + o[0] = 0xE0 | s[0]>>4; + o[1] = 0x80 | ((s[0]&0x0F)<<2) | (s[1]>>6); + o[2] = 0x80 | s[1]&0x3F; + return 3; + } + else { + unsigned int u = (((s[0]&0x03)<<2)|(s[1]>>6)) + 1; + o[0] = 0xF0 | u>>2; + o[1] = 0x80 | ((u&0x03)<<4) | (s[1]>>2)&0x0F; + o[2] = 0x80 | ((s[1]&0x03)<<4) | ((s[2]&0x03)<<2) | (s[3]>>6); + o[3] = 0x80 | s[3]&0x3F; + return 4; + } +} + +static int +fun_so_to_utf_16be(const unsigned char* s, unsigned char* o) +{ + if (!(s[0]&0x80)) { + o[0] = 0x00; + o[1] = s[0]; + return 2; + } + else if ((s[0]&0xE0)==0xC0) { + o[0] = (s[0]>>2)&0x07; + o[1] = ((s[0]&0x03)<<6) | s[1]&0x3F; + return 2; + } + else if ((s[0]&0xF0)==0xE0) { + o[0] = (s[0]<<4) | (s[1]>>2)^0x20; + o[1] = (s[1]<<6) | s[2]^0x80; + return 2; + } + else { + int w = (((s[0]&0x07)<<2) | (s[1]>>4)&0x03) - 1; + o[0] = 0xD8 | (w>>2); + o[1] = (w<<6) | ((s[1]&0x0F)<<2) | ((s[2]>>4)-8); + o[2] = 0xDC | ((s[2]>>2)&0x03); + o[3] = (s[2]<<6) | (s[3]&~0x80); + return 4; + } +} +static const unsigned char +from_UTF_16BE_00_offsets[256] = { + /* used by from_UTF_16BE_00 */ + /* used by from_UTF_16BE_D8 */ + /* used by from_UTF_16BE_D8_00_00 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; +static const struct byte_lookup* const +from_UTF_16BE_00_infos[1] = { + /* used by from_UTF_16BE_00 */ + /* used by to_UTF_16BE_82 */ + FUNso, +}; +static const BYTE_LOOKUP +from_UTF_16BE_00 = { + /* used as from_UTF_16BE */ + /* used as from_UTF_16BE_D8_00 */ + from_UTF_16BE_00_offsets, + from_UTF_16BE_00_infos +}; + +static const struct byte_lookup* const +from_UTF_16BE_D8_00_00_infos[1] = { + INVALID, +}; +static const BYTE_LOOKUP +from_UTF_16BE_D8_00_00 = { + /* used as from_UTF_16BE_D8_00 */ + /* used as from_UTF_16BE */ + from_UTF_16BE_00_offsets, + from_UTF_16BE_D8_00_00_infos +}; + +static const unsigned char +from_UTF_16BE_D8_00_offsets[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; +static const struct byte_lookup* const +from_UTF_16BE_D8_00_infos[2] = { + &from_UTF_16BE_D8_00_00, &from_UTF_16BE_00, +}; +static const BYTE_LOOKUP +from_UTF_16BE_D8_00 = { + from_UTF_16BE_D8_00_offsets, + from_UTF_16BE_D8_00_infos +}; + +static const struct byte_lookup* const +from_UTF_16BE_D8_infos[1] = { + &from_UTF_16BE_D8_00, +}; +static const BYTE_LOOKUP +from_UTF_16BE_D8 = { + from_UTF_16BE_00_offsets, + from_UTF_16BE_D8_infos +}; + +static const unsigned char +from_UTF_16BE_offsets[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; +static const struct byte_lookup* const +from_UTF_16BE_infos[3] = { + &from_UTF_16BE_00, &from_UTF_16BE_D8, + &from_UTF_16BE_D8_00_00, +}; +static const BYTE_LOOKUP +from_UTF_16BE = { + from_UTF_16BE_offsets, + from_UTF_16BE_infos +}; + +static rb_transcoder +rb_from_UTF_16BE = { + "UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0, + NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16be +}; + +static const unsigned char +to_UTF_16BE_82_offsets[64] = { + /* used by to_UTF_16BE_82 */ + /* used by to_UTF_16BE_E1 */ + /* used by to_UTF_16BE_F1 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; +static const BYTE_LOOKUP +to_UTF_16BE_82 = { + /* used as to_UTF_16BE */ + /* used as to_UTF_16BE_E0 */ + /* used as to_UTF_16BE_E1 */ + /* used as to_UTF_16BE_ED */ + /* used as to_UTF_16BE_F0_90 */ + /* used as to_UTF_16BE_F1_80 */ + /* used as to_UTF_16BE_F4_80 */ + to_UTF_16BE_82_offsets, + from_UTF_16BE_00_infos +}; + +static const unsigned char +to_UTF_16BE_E0_offsets[64] = { + /* used by to_UTF_16BE_E0 */ + /* used by to_UTF_16BE_ED */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; +static const struct byte_lookup* const +to_UTF_16BE_E0_infos[2] = { + INVALID, &to_UTF_16BE_82, +}; +static const BYTE_LOOKUP +to_UTF_16BE_E0 = { + to_UTF_16BE_E0_offsets, + to_UTF_16BE_E0_infos +}; + +static const struct byte_lookup* const +to_UTF_16BE_E1_infos[1] = { + &to_UTF_16BE_82, +}; +static const BYTE_LOOKUP +to_UTF_16BE_E1 = { + /* used as to_UTF_16BE */ + /* used as to_UTF_16BE_F0 */ + /* used as to_UTF_16BE_F1 */ + /* used as to_UTF_16BE_F4 */ + to_UTF_16BE_82_offsets, + to_UTF_16BE_E1_infos +}; + +static const struct byte_lookup* const +to_UTF_16BE_ED_infos[2] = { + &to_UTF_16BE_82, INVALID, +}; +static const BYTE_LOOKUP +to_UTF_16BE_ED = { + to_UTF_16BE_E0_offsets, + to_UTF_16BE_ED_infos +}; + +static const unsigned char +to_UTF_16BE_F0_offsets[64] = { + /* used by to_UTF_16BE_F0 */ + /* used by to_UTF_16BE_F4 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; +static const struct byte_lookup* const +to_UTF_16BE_F0_infos[2] = { + INVALID, &to_UTF_16BE_E1, +}; +static const BYTE_LOOKUP +to_UTF_16BE_F0 = { + to_UTF_16BE_F0_offsets, + to_UTF_16BE_F0_infos +}; + +static const struct byte_lookup* const +to_UTF_16BE_F1_infos[1] = { + &to_UTF_16BE_E1, +}; +static const BYTE_LOOKUP +to_UTF_16BE_F1 = { + to_UTF_16BE_82_offsets, + to_UTF_16BE_F1_infos +}; + +static const struct byte_lookup* const +to_UTF_16BE_F4_infos[2] = { + &to_UTF_16BE_E1, INVALID, +}; +static const BYTE_LOOKUP +to_UTF_16BE_F4 = { + to_UTF_16BE_F0_offsets, + to_UTF_16BE_F4_infos +}; + +static const unsigned char +to_UTF_16BE_offsets[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, + 6, 7, 7, 7, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; +static const struct byte_lookup* const +to_UTF_16BE_infos[9] = { + FUNso, INVALID, &to_UTF_16BE_82, &to_UTF_16BE_E0, + &to_UTF_16BE_E1, &to_UTF_16BE_ED, &to_UTF_16BE_F0, &to_UTF_16BE_F1, + &to_UTF_16BE_F4, +}; +static const BYTE_LOOKUP +to_UTF_16BE = { + to_UTF_16BE_offsets, + to_UTF_16BE_infos +}; + +static rb_transcoder +rb_to_UTF_16BE = { + "UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1, + NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16be +}; + +void +Init_utf_16_32(void) +{ + rb_register_transcoder(&rb_from_UTF_16BE); + rb_register_transcoder(&rb_to_UTF_16BE); +} +/* Footprint (bytes): gross: 3420, saved: 1992, net: 1428 */ |