diff options
-rw-r--r-- | ChangeLog | 10 | ||||
-rw-r--r-- | enc/trans/cp850-tbl.rb | 130 | ||||
-rw-r--r-- | enc/trans/cp852-tbl.rb | 130 | ||||
-rw-r--r-- | enc/trans/cp855-tbl.rb | 130 | ||||
-rw-r--r-- | enc/trans/koi8-r-tbl.rb | 130 | ||||
-rw-r--r-- | enc/trans/koi8-u-tbl.rb | 130 | ||||
-rw-r--r-- | enc/trans/single_byte.trans | 6 | ||||
-rw-r--r-- | enc/trans/tis-620-tbl.rb | 89 | ||||
-rw-r--r-- | test/ruby/test_transcode.rb | 192 |
9 files changed, 915 insertions, 32 deletions
@@ -1,3 +1,13 @@ +Tue Dec 9 18:35:35 2008 Martin Duerst <duerst@it.aoyama.ac.jp> + + * enc/trans/single_byte.trans, cp850-tbl.rb, cp852-tbl.rb, + cp855-tbl.rb, koi8-r-tbl.rb, koi8-u-tbl.rb, tis-620-tbl.rb: + new single-byte transcodings (from Yoshihiro Kambayashi) + + * test/ruby/test_transcode.rb: added tests for the above + (from Yoshihiro Kambayashi), small cosmetic fixes + + Tue Dec 9 13:33:53 2008 Nobuyoshi Nakada <nobu@ruby-lang.org> * compile.c (iseq_compile_each), gc.c (assign_heap_slot), diff --git a/enc/trans/cp850-tbl.rb b/enc/trans/cp850-tbl.rb new file mode 100644 index 000000000..615d3b259 --- /dev/null +++ b/enc/trans/cp850-tbl.rb @@ -0,0 +1,130 @@ +CP850_TO_UCS_TBL = [ + ["FF",0xA0], + ["AD",0xA1], + ["BD",0xA2], + ["9C",0xA3], + ["CF",0xA4], + ["BE",0xA5], + ["DD",0xA6], + ["F5",0xA7], + ["F9",0xA8], + ["B8",0xA9], + ["A6",0xAA], + ["AE",0xAB], + ["AA",0xAC], + ["F0",0xAD], + ["A9",0xAE], + ["EE",0xAF], + ["F8",0xB0], + ["F1",0xB1], + ["FD",0xB2], + ["FC",0xB3], + ["EF",0xB4], + ["E6",0xB5], + ["F4",0xB6], + ["FA",0xB7], + ["F7",0xB8], + ["FB",0xB9], + ["A7",0xBA], + ["AF",0xBB], + ["AC",0xBC], + ["AB",0xBD], + ["F3",0xBE], + ["A8",0xBF], + ["B7",0xC0], + ["B5",0xC1], + ["B6",0xC2], + ["C7",0xC3], + ["8E",0xC4], + ["8F",0xC5], + ["92",0xC6], + ["80",0xC7], + ["D4",0xC8], + ["90",0xC9], + ["D2",0xCA], + ["D3",0xCB], + ["DE",0xCC], + ["D6",0xCD], + ["D7",0xCE], + ["D8",0xCF], + ["D1",0xD0], + ["A5",0xD1], + ["E3",0xD2], + ["E0",0xD3], + ["E2",0xD4], + ["E5",0xD5], + ["99",0xD6], + ["9E",0xD7], + ["9D",0xD8], + ["EB",0xD9], + ["E9",0xDA], + ["EA",0xDB], + ["9A",0xDC], + ["ED",0xDD], + ["E8",0xDE], + ["E1",0xDF], + ["85",0xE0], + ["A0",0xE1], + ["83",0xE2], + ["C6",0xE3], + ["84",0xE4], + ["86",0xE5], + ["91",0xE6], + ["87",0xE7], + ["8A",0xE8], + ["82",0xE9], + ["88",0xEA], + ["89",0xEB], + ["8D",0xEC], + ["A1",0xED], + ["8C",0xEE], + ["8B",0xEF], + ["D0",0xF0], + ["A4",0xF1], + ["95",0xF2], + ["A2",0xF3], + ["93",0xF4], + ["E4",0xF5], + ["94",0xF6], + ["F6",0xF7], + ["9B",0xF8], + ["97",0xF9], + ["A3",0xFA], + ["96",0xFB], + ["81",0xFC], + ["EC",0xFD], + ["E7",0xFE], + ["98",0xFF], + ["D5",0x131], + ["9F",0x192], + ["F2",0x2017], + ["C4",0x2500], + ["B3",0x2502], + ["DA",0x250C], + ["BF",0x2510], + ["C0",0x2514], + ["D9",0x2518], + ["C3",0x251C], + ["B4",0x2524], + ["C2",0x252C], + ["C1",0x2534], + ["C5",0x253C], + ["CD",0x2550], + ["BA",0x2551], + ["C9",0x2554], + ["BB",0x2557], + ["C8",0x255A], + ["BC",0x255D], + ["CC",0x2560], + ["B9",0x2563], + ["CB",0x2566], + ["CA",0x2569], + ["CE",0x256C], + ["DF",0x2580], + ["DC",0x2584], + ["DB",0x2588], + ["B0",0x2591], + ["B1",0x2592], + ["B2",0x2593], + ["FE",0x25A0], +]
\ No newline at end of file diff --git a/enc/trans/cp852-tbl.rb b/enc/trans/cp852-tbl.rb new file mode 100644 index 000000000..6763bfa6e --- /dev/null +++ b/enc/trans/cp852-tbl.rb @@ -0,0 +1,130 @@ +CP852_TO_UCS_TBL = [ + ["FF",0xA0], + ["CF",0xA4], + ["F5",0xA7], + ["F9",0xA8], + ["AE",0xAB], + ["AA",0xAC], + ["F0",0xAD], + ["F8",0xB0], + ["EF",0xB4], + ["F7",0xB8], + ["AF",0xBB], + ["B5",0xC1], + ["B6",0xC2], + ["8E",0xC4], + ["80",0xC7], + ["90",0xC9], + ["D3",0xCB], + ["D6",0xCD], + ["D7",0xCE], + ["E0",0xD3], + ["E2",0xD4], + ["99",0xD6], + ["9E",0xD7], + ["E9",0xDA], + ["9A",0xDC], + ["ED",0xDD], + ["E1",0xDF], + ["A0",0xE1], + ["83",0xE2], + ["84",0xE4], + ["87",0xE7], + ["82",0xE9], + ["89",0xEB], + ["A1",0xED], + ["8C",0xEE], + ["A2",0xF3], + ["93",0xF4], + ["94",0xF6], + ["F6",0xF7], + ["A3",0xFA], + ["81",0xFC], + ["EC",0xFD], + ["C6",0x102], + ["C7",0x103], + ["A4",0x104], + ["A5",0x105], + ["8F",0x106], + ["86",0x107], + ["AC",0x10C], + ["9F",0x10D], + ["D2",0x10E], + ["D4",0x10F], + ["D1",0x110], + ["D0",0x111], + ["A8",0x118], + ["A9",0x119], + ["B7",0x11A], + ["D8",0x11B], + ["91",0x139], + ["92",0x13A], + ["95",0x13D], + ["96",0x13E], + ["9D",0x141], + ["88",0x142], + ["E3",0x143], + ["E4",0x144], + ["D5",0x147], + ["E5",0x148], + ["8A",0x150], + ["8B",0x151], + ["E8",0x154], + ["EA",0x155], + ["FC",0x158], + ["FD",0x159], + ["97",0x15A], + ["98",0x15B], + ["B8",0x15E], + ["AD",0x15F], + ["E6",0x160], + ["E7",0x161], + ["DD",0x162], + ["EE",0x163], + ["9B",0x164], + ["9C",0x165], + ["DE",0x16E], + ["85",0x16F], + ["EB",0x170], + ["FB",0x171], + ["8D",0x179], + ["AB",0x17A], + ["BD",0x17B], + ["BE",0x17C], + ["A6",0x17D], + ["A7",0x17E], + ["F3",0x2C7], + ["F4",0x2D8], + ["FA",0x2D9], + ["F2",0x2DB], + ["F1",0x2DD], + ["C4",0x2500], + ["B3",0x2502], + ["DA",0x250C], + ["BF",0x2510], + ["C0",0x2514], + ["D9",0x2518], + ["C3",0x251C], + ["B4",0x2524], + ["C2",0x252C], + ["C1",0x2534], + ["C5",0x253C], + ["CD",0x2550], + ["BA",0x2551], + ["C9",0x2554], + ["BB",0x2557], + ["C8",0x255A], + ["BC",0x255D], + ["CC",0x2560], + ["B9",0x2563], + ["CB",0x2566], + ["CA",0x2569], + ["CE",0x256C], + ["DF",0x2580], + ["DC",0x2584], + ["DB",0x2588], + ["B0",0x2591], + ["B1",0x2592], + ["B2",0x2593], + ["FE",0x25A0], +]
\ No newline at end of file diff --git a/enc/trans/cp855-tbl.rb b/enc/trans/cp855-tbl.rb new file mode 100644 index 000000000..72e548b9c --- /dev/null +++ b/enc/trans/cp855-tbl.rb @@ -0,0 +1,130 @@ +CP855_TO_UCS_TBL = [ + ["FF",0xA0], + ["CF",0xA4], + ["FD",0xA7], + ["AE",0xAB], + ["F0",0xAD], + ["AF",0xBB], + ["85",0x401], + ["81",0x402], + ["83",0x403], + ["87",0x404], + ["89",0x405], + ["8B",0x406], + ["8D",0x407], + ["8F",0x408], + ["91",0x409], + ["93",0x40A], + ["95",0x40B], + ["97",0x40C], + ["99",0x40E], + ["9B",0x40F], + ["A1",0x410], + ["A3",0x411], + ["EC",0x412], + ["AD",0x413], + ["A7",0x414], + ["A9",0x415], + ["EA",0x416], + ["F4",0x417], + ["B8",0x418], + ["BE",0x419], + ["C7",0x41A], + ["D1",0x41B], + ["D3",0x41C], + ["D5",0x41D], + ["D7",0x41E], + ["DD",0x41F], + ["E2",0x420], + ["E4",0x421], + ["E6",0x422], + ["E8",0x423], + ["AB",0x424], + ["B6",0x425], + ["A5",0x426], + ["FC",0x427], + ["F6",0x428], + ["FA",0x429], + ["9F",0x42A], + ["F2",0x42B], + ["EE",0x42C], + ["F8",0x42D], + ["9D",0x42E], + ["E0",0x42F], + ["A0",0x430], + ["A2",0x431], + ["EB",0x432], + ["AC",0x433], + ["A6",0x434], + ["A8",0x435], + ["E9",0x436], + ["F3",0x437], + ["B7",0x438], + ["BD",0x439], + ["C6",0x43A], + ["D0",0x43B], + ["D2",0x43C], + ["D4",0x43D], + ["D6",0x43E], + ["D8",0x43F], + ["E1",0x440], + ["E3",0x441], + ["E5",0x442], + ["E7",0x443], + ["AA",0x444], + ["B5",0x445], + ["A4",0x446], + ["FB",0x447], + ["F5",0x448], + ["F9",0x449], + ["9E",0x44A], + ["F1",0x44B], + ["ED",0x44C], + ["F7",0x44D], + ["9C",0x44E], + ["DE",0x44F], + ["84",0x451], + ["80",0x452], + ["82",0x453], + ["86",0x454], + ["88",0x455], + ["8A",0x456], + ["8C",0x457], + ["8E",0x458], + ["90",0x459], + ["92",0x45A], + ["94",0x45B], + ["96",0x45C], + ["98",0x45E], + ["9A",0x45F], + ["EF",0x2116], + ["C4",0x2500], + ["B3",0x2502], + ["DA",0x250C], + ["BF",0x2510], + ["C0",0x2514], + ["D9",0x2518], + ["C3",0x251C], + ["B4",0x2524], + ["C2",0x252C], + ["C1",0x2534], + ["C5",0x253C], + ["CD",0x2550], + ["BA",0x2551], + ["C9",0x2554], + ["BB",0x2557], + ["C8",0x255A], + ["BC",0x255D], + ["CC",0x2560], + ["B9",0x2563], + ["CB",0x2566], + ["CA",0x2569], + ["CE",0x256C], + ["DF",0x2580], + ["DC",0x2584], + ["DB",0x2588], + ["B0",0x2591], + ["B1",0x2592], + ["B2",0x2593], + ["FE",0x25A0], +]
\ No newline at end of file diff --git a/enc/trans/koi8-r-tbl.rb b/enc/trans/koi8-r-tbl.rb new file mode 100644 index 000000000..a1f55ff2e --- /dev/null +++ b/enc/trans/koi8-r-tbl.rb @@ -0,0 +1,130 @@ +KOI8_R_TO_UCS_TBL = [ + ["9A",0xA0], + ["BF",0xA9], + ["9C",0xB0], + ["9D",0xB2], + ["9E",0xB7], + ["9F",0xF7], + ["B3",0x401], + ["E1",0x410], + ["E2",0x411], + ["F7",0x412], + ["E7",0x413], + ["E4",0x414], + ["E5",0x415], + ["F6",0x416], + ["FA",0x417], + ["E9",0x418], + ["EA",0x419], + ["EB",0x41A], + ["EC",0x41B], + ["ED",0x41C], + ["EE",0x41D], + ["EF",0x41E], + ["F0",0x41F], + ["F2",0x420], + ["F3",0x421], + ["F4",0x422], + ["F5",0x423], + ["E6",0x424], + ["E8",0x425], + ["E3",0x426], + ["FE",0x427], + ["FB",0x428], + ["FD",0x429], + ["FF",0x42A], + ["F9",0x42B], + ["F8",0x42C], + ["FC",0x42D], + ["E0",0x42E], + ["F1",0x42F], + ["C1",0x430], + ["C2",0x431], + ["D7",0x432], + ["C7",0x433], + ["C4",0x434], + ["C5",0x435], + ["D6",0x436], + ["DA",0x437], + ["C9",0x438], + ["CA",0x439], + ["CB",0x43A], + ["CC",0x43B], + ["CD",0x43C], + ["CE",0x43D], + ["CF",0x43E], + ["D0",0x43F], + ["D2",0x440], + ["D3",0x441], + ["D4",0x442], + ["D5",0x443], + ["C6",0x444], + ["C8",0x445], + ["C3",0x446], + ["DE",0x447], + ["DB",0x448], + ["DD",0x449], + ["DF",0x44A], + ["D9",0x44B], + ["D8",0x44C], + ["DC",0x44D], + ["C0",0x44E], + ["D1",0x44F], + ["A3",0x451], + ["95",0x2219], + ["96",0x221A], + ["97",0x2248], + ["98",0x2264], + ["99",0x2265], + ["93",0x2320], + ["9B",0x2321], + ["80",0x2500], + ["81",0x2502], + ["82",0x250C], + ["83",0x2510], + ["84",0x2514], + ["85",0x2518], + ["86",0x251C], + ["87",0x2524], + ["88",0x252C], + ["89",0x2534], + ["8A",0x253C], + ["A0",0x2550], + ["A1",0x2551], + ["A2",0x2552], + ["A4",0x2553], + ["A5",0x2554], + ["A6",0x2555], + ["A7",0x2556], + ["A8",0x2557], + ["A9",0x2558], + ["AA",0x2559], + ["AB",0x255A], + ["AC",0x255B], + ["AD",0x255C], + ["AE",0x255D], + ["AF",0x255E], + ["B0",0x255F], + ["B1",0x2560], + ["B2",0x2561], + ["B4",0x2562], + ["B5",0x2563], + ["B6",0x2564], + ["B7",0x2565], + ["B8",0x2566], + ["B9",0x2567], + ["BA",0x2568], + ["BB",0x2569], + ["BC",0x256A], + ["BD",0x256B], + ["BE",0x256C], + ["8B",0x2580], + ["8C",0x2584], + ["8D",0x2588], + ["8E",0x258C], + ["8F",0x2590], + ["90",0x2591], + ["91",0x2592], + ["92",0x2593], + ["94",0x25A0], +]
\ No newline at end of file diff --git a/enc/trans/koi8-u-tbl.rb b/enc/trans/koi8-u-tbl.rb new file mode 100644 index 000000000..e87aa1aa3 --- /dev/null +++ b/enc/trans/koi8-u-tbl.rb @@ -0,0 +1,130 @@ +KOI8_U_TO_UCS_TBL = [ + ["9A",0xA0], + ["BF",0xA9], + ["9C",0xB0], + ["9D",0xB2], + ["9E",0xB7], + ["9F",0xF7], + ["B3",0x401], + ["B4",0x404], + ["B6",0x406], + ["B7",0x407], + ["E1",0x410], + ["E2",0x411], + ["F7",0x412], + ["E7",0x413], + ["E4",0x414], + ["E5",0x415], + ["F6",0x416], + ["FA",0x417], + ["E9",0x418], + ["EA",0x419], + ["EB",0x41A], + ["EC",0x41B], + ["ED",0x41C], + ["EE",0x41D], + ["EF",0x41E], + ["F0",0x41F], + ["F2",0x420], + ["F3",0x421], + ["F4",0x422], + ["F5",0x423], + ["E6",0x424], + ["E8",0x425], + ["E3",0x426], + ["FE",0x427], + ["FB",0x428], + ["FD",0x429], + ["FF",0x42A], + ["F9",0x42B], + ["F8",0x42C], + ["FC",0x42D], + ["E0",0x42E], + ["F1",0x42F], + ["C1",0x430], + ["C2",0x431], + ["D7",0x432], + ["C7",0x433], + ["C4",0x434], + ["C5",0x435], + ["D6",0x436], + ["DA",0x437], + ["C9",0x438], + ["CA",0x439], + ["CB",0x43A], + ["CC",0x43B], + ["CD",0x43C], + ["CE",0x43D], + ["CF",0x43E], + ["D0",0x43F], + ["D2",0x440], + ["D3",0x441], + ["D4",0x442], + ["D5",0x443], + ["C6",0x444], + ["C8",0x445], + ["C3",0x446], + ["DE",0x447], + ["DB",0x448], + ["DD",0x449], + ["DF",0x44A], + ["D9",0x44B], + ["D8",0x44C], + ["DC",0x44D], + ["C0",0x44E], + ["D1",0x44F], + ["A3",0x451], + ["A4",0x454], + ["A6",0x456], + ["A7",0x457], + ["BD",0x490], + ["AD",0x491], + ["95",0x2219], + ["96",0x221A], + ["97",0x2248], + ["98",0x2264], + ["99",0x2265], + ["93",0x2320], + ["9B",0x2321], + ["80",0x2500], + ["81",0x2502], + ["82",0x250C], + ["83",0x2510], + ["84",0x2514], + ["85",0x2518], + ["86",0x251C], + ["87",0x2524], + ["88",0x252C], + ["89",0x2534], + ["8A",0x253C], + ["A0",0x2550], + ["A1",0x2551], + ["A2",0x2552], + ["A5",0x2554], + ["A8",0x2557], + ["A9",0x2558], + ["AA",0x2559], + ["AB",0x255A], + ["AC",0x255B], + ["AE",0x255D], + ["AF",0x255E], + ["B0",0x255F], + ["B1",0x2560], + ["B2",0x2561], + ["B5",0x2563], + ["B8",0x2566], + ["B9",0x2567], + ["BA",0x2568], + ["BB",0x2569], + ["BC",0x256A], + ["BE",0x256C], + ["8B",0x2580], + ["8C",0x2584], + ["8D",0x2588], + ["8E",0x258C], + ["8F",0x2590], + ["90",0x2591], + ["91",0x2592], + ["92",0x2593], + ["94",0x25A0], +]
\ No newline at end of file diff --git a/enc/trans/single_byte.trans b/enc/trans/single_byte.trans index 80bd3f9ae..0d42740d3 100644 --- a/enc/trans/single_byte.trans +++ b/enc/trans/single_byte.trans @@ -73,6 +73,12 @@ transcode_tblgen_singlebyte "MACROMANIA" transcode_tblgen_singlebyte "MACTURKISH" transcode_tblgen_singlebyte "MACUKRAINE" + transcode_tblgen_singlebyte "KOI8-U" + transcode_tblgen_singlebyte "KOI8-R" + transcode_tblgen_singlebyte "TIS-620" + transcode_tblgen_singlebyte "CP850" + transcode_tblgen_singlebyte "CP852" + transcode_tblgen_singlebyte "CP855" %> <%= transcode_generated_code %> diff --git a/enc/trans/tis-620-tbl.rb b/enc/trans/tis-620-tbl.rb new file mode 100644 index 000000000..a2f605730 --- /dev/null +++ b/enc/trans/tis-620-tbl.rb @@ -0,0 +1,89 @@ +TIS_620_TO_UCS_TBL = [ + ["A1",0xE01], + ["A2",0xE02], + ["A3",0xE03], + ["A4",0xE04], + ["A5",0xE05], + ["A6",0xE06], + ["A7",0xE07], + ["A8",0xE08], + ["A9",0xE09], + ["AA",0xE0A], + ["AB",0xE0B], + ["AC",0xE0C], + ["AD",0xE0D], + ["AE",0xE0E], + ["AF",0xE0F], + ["B0",0xE10], + ["B1",0xE11], + ["B2",0xE12], + ["B3",0xE13], + ["B4",0xE14], + ["B5",0xE15], + ["B6",0xE16], + ["B7",0xE17], + ["B8",0xE18], + ["B9",0xE19], + ["BA",0xE1A], + ["BB",0xE1B], + ["BC",0xE1C], + ["BD",0xE1D], + ["BE",0xE1E], + ["BF",0xE1F], + ["C0",0xE20], + ["C1",0xE21], + ["C2",0xE22], + ["C3",0xE23], + ["C4",0xE24], + ["C5",0xE25], + ["C6",0xE26], + ["C7",0xE27], + ["C8",0xE28], + ["C9",0xE29], + ["CA",0xE2A], + ["CB",0xE2B], + ["CC",0xE2C], + ["CD",0xE2D], + ["CE",0xE2E], + ["CF",0xE2F], + ["D0",0xE30], + ["D1",0xE31], + ["D2",0xE32], + ["D3",0xE33], + ["D4",0xE34], + ["D5",0xE35], + ["D6",0xE36], + ["D7",0xE37], + ["D8",0xE38], + ["D9",0xE39], + ["DA",0xE3A], + ["DF",0xE3F], + ["E0",0xE40], + ["E1",0xE41], + ["E2",0xE42], + ["E3",0xE43], + ["E4",0xE44], + ["E5",0xE45], + ["E6",0xE46], + ["E7",0xE47], + ["E8",0xE48], + ["E9",0xE49], + ["EA",0xE4A], + ["EB",0xE4B], + ["EC",0xE4C], + ["ED",0xE4D], + ["EE",0xE4E], + ["EF",0xE4F], + ["F0",0xE50], + ["F1",0xE51], + ["F2",0xE52], + ["F3",0xE53], + ["F4",0xE54], + ["F5",0xE55], + ["F6",0xE56], + ["F7",0xE57], + ["F8",0xE58], + ["F9",0xE59], + ["FA",0xE5A], + ["FB",0xE5B], +] diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 1d16d9842..0d1dfa223 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -146,7 +146,7 @@ class TestTranscode < Test::Unit::TestCase assert_raise(Encoding::UndefinedConversionError) { "\xFC".encode("utf-8", 'windows-874') } assert_raise(Encoding::UndefinedConversionError) { "\xFF".encode("utf-8", 'windows-874') } end - + def test_windows_1250 check_both_ways("\u20AC", "\x80", 'windows-1250') # € assert_raise(Encoding::UndefinedConversionError) { "\x81".encode("utf-8", 'windows-1250') } @@ -175,7 +175,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u0111", "\xF0", 'windows-1250') # đ check_both_ways("\u02D9", "\xFF", 'windows-1250') # ˙ end - + def test_windows_1251 check_both_ways("\u0402", "\x80", 'windows-1251') # Ђ check_both_ways("\u20AC", "\x88", 'windows-1251') # € @@ -196,7 +196,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u0440", "\xF0", 'windows-1251') # р check_both_ways("\u044F", "\xFF", 'windows-1251') # я end - + def test_windows_1252 check_both_ways("\u20AC", "\x80", 'windows-1252') # € assert_raise(Encoding::UndefinedConversionError) { "\x81".encode("utf-8", 'windows-1252') } @@ -204,7 +204,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u0152", "\x8C", 'windows-1252') # >Œ assert_raise(Encoding::UndefinedConversionError) { "\x8D".encode("utf-8", 'windows-1252') } check_both_ways("\u017D", "\x8E", 'windows-1252') # Ž - assert_raise(Encoding::UndefinedConversionError) { "\x8F".encode("utf-8", 'windows-1252') } + assert_raise(Encoding::UndefinedConversionError) { "\x8F".encode("utf-8", 'windows-1252') } assert_raise(Encoding::UndefinedConversionError) { "\x90".encode("utf-8", 'windows-1252') } check_both_ways("\u2018", "\x91", 'windows-1252') #‘ check_both_ways("\u0153", "\x9C", 'windows-1252') # œ @@ -261,7 +261,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u03CE", "\xFE", 'windows-1253') # ώ assert_raise(Encoding::UndefinedConversionError) { "\xFF".encode("utf-8", 'windows-1253') } end - + def test_windows_1254 check_both_ways("\u20AC", "\x80", 'windows-1254') # € assert_raise(Encoding::UndefinedConversionError) { "\x81".encode("utf-8", 'windows-1254') } @@ -288,12 +288,12 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u011F", "\xF0", 'windows-1254') # ğ check_both_ways("\u00FF", "\xFF", 'windows-1254') # ÿ end - + def test_windows_1255 - check_both_ways("\u20AC", "\x80", 'windows-1255') # € - assert_raise(Encoding::UndefinedConversionError) { "\x81".encode("utf-8", 'windows-1255') } - check_both_ways("\u201A", "\x82", 'windows-1255') # ‚ - check_both_ways("\u2030", "\x89", 'windows-1255') # ‰ + check_both_ways("\u20AC", "\x80", 'windows-1255') # € + assert_raise(Encoding::UndefinedConversionError) { "\x81".encode("utf-8", 'windows-1255') } + check_both_ways("\u201A", "\x82", 'windows-1255') # ‚ + check_both_ways("\u2030", "\x89", 'windows-1255') # ‰ assert_raise(Encoding::UndefinedConversionError) { "\x8A".encode("utf-8", 'windows-1255') } check_both_ways("\u2039", "\x8B", 'windows-1255') # ‹ assert_raise(Encoding::UndefinedConversionError) { "\x8C".encode("utf-8", 'windows-1255') } @@ -333,7 +333,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u200F", "\xFE", 'windows-1255') # right-to-left mark assert_raise(Encoding::UndefinedConversionError) { "\xFF".encode("utf-8", 'windows-1255') } end - + def test_windows_1256 check_both_ways("\u20AC", "\x80", 'windows-1256') # € check_both_ways("\u0679", "\x8A", 'windows-1256') # ٹ @@ -356,7 +356,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u064B", "\xF0", 'windows-1256') # ًً check_both_ways("\u06D2", "\xFF", 'windows-1256') # ے end - + def test_windows_1257 check_both_ways("\u20AC", "\x80", 'windows-1257') # € assert_raise(Encoding::UndefinedConversionError) { "\x81".encode("utf-8", 'windows-1257') } @@ -420,7 +420,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u2261", "\xF0", 'IBM437') # ≡ check_both_ways("\u00A0", "\xFF", 'IBM437') # non-breaking space end - + def test_IBM775 check_both_ways("\u0106", "\x80", 'IBM775') # Ć check_both_ways("\u00C5", "\x8F", 'IBM775') # Å @@ -439,7 +439,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u00AD", "\xF0", 'IBM775') # osft hyphen check_both_ways("\u00A0", "\xFF", 'IBM775') # non-breaking space end - + def test_IBM852 check_both_ways("\u00C7", "\x80", 'IBM852') # Ç check_both_ways("\u0106", "\x8F", 'IBM852') # Ć @@ -458,7 +458,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u00AD", "\xF0", 'IBM852') # osft hyphen check_both_ways("\u00A0", "\xFF", 'IBM852') # non-breaking space end - + def test_IBM855 check_both_ways("\u0452", "\x80", 'IBM855') # ђ check_both_ways("\u0408", "\x8F", 'IBM855') # Ј @@ -505,7 +505,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u00BE", "\xF3", 'IBM857') # ¾ check_both_ways("\u00A0", "\xFF", 'IBM857') # non-breaking space end - + def test_IBM860 check_both_ways("\u00C7", "\x80", 'IBM860') # Ç check_both_ways("\u00C2", "\x8F", 'IBM860') #  @@ -524,7 +524,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u2261", "\xF0", 'IBM860') # ≡ check_both_ways("\u00A0", "\xFF", 'IBM860') # non-breaking space end - + def test_IBM861 check_both_ways("\u00C7", "\x80", 'IBM861') # Ç check_both_ways("\u00C5", "\x8F", 'IBM861') # Å @@ -543,7 +543,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u2261", "\xF0", 'IBM861') # ≡ check_both_ways("\u00A0", "\xFF", 'IBM861') # non-breaking space end - + def test_IBM862 check_both_ways("\u05D0", "\x80", 'IBM862') # א check_both_ways("\u05DF", "\x8F", 'IBM862') # ן @@ -562,7 +562,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u2261", "\xF0", 'IBM862') # ≡ check_both_ways("\u00A0", "\xFF", 'IBM862') # non-breaking space end - + def test_IBM863 check_both_ways("\u00C7", "\x80", 'IBM863') # Ç check_both_ways("\u00A7", "\x8F", 'IBM863') # § @@ -581,7 +581,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u2261", "\xF0", 'IBM863') # ≡ check_both_ways("\u00A0", "\xFF", 'IBM863') # non-breaking space end - + def test_IBM865 check_both_ways("\u00C7", "\x80", 'IBM865') # Ç check_both_ways("\u00C5", "\x8F", 'IBM865') # Å @@ -600,7 +600,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u2261", "\xF0", 'IBM865') # ≡ check_both_ways("\u00A0", "\xFF", 'IBM865') # non-breaking space end - + def test_IBM866 check_both_ways("\u0410", "\x80", 'IBM866') # А check_both_ways("\u041F", "\x8F", 'IBM866') # П @@ -619,7 +619,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u0401", "\xF0", 'IBM866') # Ё check_both_ways("\u00A0", "\xFF", 'IBM866') # non-breaking space end - + def test_IBM869 assert_raise(Encoding::UndefinedConversionError) { "\x80".encode("utf-8", 'IBM869') } assert_raise(Encoding::UndefinedConversionError) { "\x85".encode("utf-8", 'IBM869') } @@ -646,7 +646,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u00AD", "\xF0", 'IBM869') # soft hyphen check_both_ways("\u00A0", "\xFF", 'IBM869') # non-breaking space end - + def test_macCroatian check_both_ways("\u00C4", "\x80", 'macCroatian') # Ä check_both_ways("\u00E8", "\x8F", 'macCroatian') # è @@ -681,7 +681,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u00E6", "\xFE", 'macCroatian') # æ check_both_ways("\u02C7", "\xFF", 'macCroatian') # ˇ end - + def test_macCyrillic check_both_ways("\u0410", "\x80", 'macCyrillic') # А check_both_ways("\u041F", "\x8F", 'macCyrillic') # П @@ -700,7 +700,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u0440", "\xF0", 'macCyrillic') # р check_both_ways("\u00A4", "\xFF", 'macCyrillic') # ¤ end - + def test_macGreek check_both_ways("\u00C4", "\x80", 'macGreek') # Ä check_both_ways("\u00E8", "\x8F", 'macGreek') # è @@ -721,7 +721,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u03B0", "\xFE", 'macGreek') # ΰ assert_raise(Encoding::UndefinedConversionError) { "\xFF".encode("utf-8", 'macGreek') } end - + def test_macIceland check_both_ways("\u00C4", "\x80", 'macIceland') # Ä check_both_ways("\u00E8", "\x8F", 'macIceland') # è @@ -760,7 +760,7 @@ class TestTranscode < Test::Unit::TestCase #check_both_ways("\uF8FF", "\xF0", 'macRoman') # Apple logo check_both_ways("\u02C7", "\xFF", 'macRoman') # ˇ end - + def test_macRomania check_both_ways("\u00C4", "\x80", 'macRomania') # Ä check_both_ways("\u00E8", "\x8F", 'macRomania') # è @@ -780,7 +780,7 @@ class TestTranscode < Test::Unit::TestCase #check_both_ways("\uF8FF", "\xF0", 'macRomania') # Apple logo check_both_ways("\u02C7", "\xFF", 'macRomania') # ˇ end - + def test_macTurkish check_both_ways("\u00C4", "\x80", 'macTurkish') # Ä check_both_ways("\u00E8", "\x8F", 'macTurkish') # è @@ -802,7 +802,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u02C6", "\xF6", 'macTurkish') # ˆ check_both_ways("\u02C7", "\xFF", 'macTurkish') # ˇ end - + def test_macUkraine check_both_ways("\u0410", "\x80", 'macUkraine') # А check_both_ways("\u041F", "\x8F", 'macUkraine') # П @@ -821,7 +821,135 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u0440", "\xF0", 'macUkraine') # р check_both_ways("\u00A4", "\xFF", 'macUkraine') # ¤ end + + def test_koi8_u + check_both_ways("\u2500", "\x80", 'KOI8-U') # ─ + check_both_ways("\u2590", "\x8F", 'KOI8-U') # ▐ + check_both_ways("\u2591", "\x90", 'KOI8-U') # ░ + check_both_ways("\u00F7", "\x9F", 'KOI8-U') # ÷ + check_both_ways("\u2550", "\xA0", 'KOI8-U') # ═ + check_both_ways("\u0454", "\xA4", 'KOI8-U') # є + check_both_ways("\u0456", "\xA6", 'KOI8-U') # і + check_both_ways("\u0457", "\xA7", 'KOI8-U') # ї + check_both_ways("\u0491", "\xAD", 'KOI8-U') # ґ + check_both_ways("\u255E", "\xAF", 'KOI8-U') # ╞ + check_both_ways("\u255F", "\xB0", 'KOI8-U') # ╟ + check_both_ways("\u0404", "\xB4", 'KOI8-U') # Є + check_both_ways("\u0406", "\xB6", 'KOI8-U') # І + check_both_ways("\u0407", "\xB7", 'KOI8-U') # Ї + check_both_ways("\u0490", "\xBD", 'KOI8-U') # Ґ + check_both_ways("\u00A9", "\xBF", 'KOI8-U') # © + check_both_ways("\u044E", "\xC0", 'KOI8-U') # ю + check_both_ways("\u043E", "\xCF", 'KOI8-U') # о + check_both_ways("\u043F", "\xD0", 'KOI8-U') # п + check_both_ways("\u044A", "\xDF", 'KOI8-U') # ъ + check_both_ways("\u042E", "\xE0", 'KOI8-U') # Ю + check_both_ways("\u041E", "\xEF", 'KOI8-U') # О + check_both_ways("\u041F", "\xF0", 'KOI8-U') # П + check_both_ways("\u042A", "\xFF", 'KOI8-U') # Ъ + end + + def test_koi8_r + check_both_ways("\u2500", "\x80", 'KOI8-R') # ─ + check_both_ways("\u2590", "\x8F", 'KOI8-R') # ▐ + check_both_ways("\u2591", "\x90", 'KOI8-R') # ░ + check_both_ways("\u00F7", "\x9F", 'KOI8-R') # ÷ + check_both_ways("\u2550", "\xA0", 'KOI8-R') # ═ + check_both_ways("\u255E", "\xAF", 'KOI8-R') # ╞ + check_both_ways("\u255F", "\xB0", 'KOI8-R') # ╟ + check_both_ways("\u00A9", "\xBF", 'KOI8-R') # © + check_both_ways("\u044E", "\xC0", 'KOI8-R') # ю + check_both_ways("\u043E", "\xCF", 'KOI8-R') # о + check_both_ways("\u043F", "\xD0", 'KOI8-R') # п + check_both_ways("\u044A", "\xDF", 'KOI8-R') # ъ + check_both_ways("\u042E", "\xE0", 'KOI8-R') # Ю + check_both_ways("\u041E", "\xEF", 'KOI8-R') # О + check_both_ways("\u041F", "\xF0", 'KOI8-R') # П + check_both_ways("\u042A", "\xFF", 'KOI8-R') # Ъ + end + + def test_TIS_620 + assert_raise(Encoding::UndefinedConversionError) { "\x80".encode("utf-8", 'TIS-620') } + assert_raise(Encoding::UndefinedConversionError) { "\x8F".encode("utf-8", 'TIS-620') } + assert_raise(Encoding::UndefinedConversionError) { "\x90".encode("utf-8", 'TIS-620') } + assert_raise(Encoding::UndefinedConversionError) { "\x9F".encode("utf-8", 'TIS-620') } + assert_raise(Encoding::UndefinedConversionError) { "\xA0".encode("utf-8", 'TIS-620') } + check_both_ways("\u0E01", "\xA1", 'TIS-620') # ก + check_both_ways("\u0E0F", "\xAF", 'TIS-620') # ฏ + check_both_ways("\u0E10", "\xB0", 'TIS-620') # ฐ + check_both_ways("\u0E1F", "\xBF", 'TIS-620') # ฟ + check_both_ways("\u0E20", "\xC0", 'TIS-620') # ภ + check_both_ways("\u0E2F", "\xCF", 'TIS-620') # ฯ + check_both_ways("\u0E30", "\xD0", 'TIS-620') # ะ + check_both_ways("\u0E3A", "\xDA", 'TIS-620') # ฺ + assert_raise(Encoding::UndefinedConversionError) { "\xDB".encode("utf-8", 'TIS-620') } + assert_raise(Encoding::UndefinedConversionError) { "\xDE".encode("utf-8", 'TIS-620') } + check_both_ways("\u0E3F", "\xDF", 'TIS-620') # ฿ + check_both_ways("\u0E40", "\xE0", 'TIS-620') # เ + check_both_ways("\u0E4F", "\xEF", 'TIS-620') # ๏ + check_both_ways("\u0E50", "\xF0", 'TIS-620') # ๐ + check_both_ways("\u0E5B", "\xFB", 'TIS-620') # ๛ + assert_raise(Encoding::UndefinedConversionError) { "\xFC".encode("utf-8", 'TIS-620') } + assert_raise(Encoding::UndefinedConversionError) { "\xFF".encode("utf-8", 'TIS-620') } + end + def test_CP850 + check_both_ways("\u00C7", "\x80", 'CP850') # Ç + check_both_ways("\u00C5", "\x8F", 'CP850') # Å + check_both_ways("\u00C9", "\x90", 'CP850') # É + check_both_ways("\u0192", "\x9F", 'CP850') # ƒ + check_both_ways("\u00E1", "\xA0", 'CP850') # á + check_both_ways("\u00BB", "\xAF", 'CP850') # » + check_both_ways("\u2591", "\xB0", 'CP850') # ░ + check_both_ways("\u2510", "\xBF", 'CP850') # ┐ + check_both_ways("\u2514", "\xC0", 'CP850') # └ + check_both_ways("\u00A4", "\xCF", 'CP850') # ¤ + check_both_ways("\u00F0", "\xD0", 'CP850') # ð + check_both_ways("\u2580", "\xDF", 'CP850') # ▀ + check_both_ways("\u00D3", "\xE0", 'CP850') # Ó + check_both_ways("\u00B4", "\xEF", 'CP850') # ´ + check_both_ways("\u00AD", "\xF0", 'CP850') # soft hyphen + check_both_ways("\u00A0", "\xFF", 'CP850') # non-breaking space + end + + def test_CP852 + check_both_ways("\u00C7", "\x80", 'CP852') # Ç + check_both_ways("\u0106", "\x8F", 'CP852') # Ć + check_both_ways("\u00C9", "\x90", 'CP852') # É + check_both_ways("\u010D", "\x9F", 'CP852') # č + check_both_ways("\u00E1", "\xA0", 'CP852') # á + check_both_ways("\u00BB", "\xAF", 'CP852') # » + check_both_ways("\u2591", "\xB0", 'CP852') # ░ + check_both_ways("\u2510", "\xBF", 'CP852') # ┐ + check_both_ways("\u2514", "\xC0", 'CP852') # └ + check_both_ways("\u00A4", "\xCF", 'CP852') # ¤ + check_both_ways("\u0111", "\xD0", 'CP852') # đ + check_both_ways("\u2580", "\xDF", 'CP852') # ▀ + check_both_ways("\u00D3", "\xE0", 'CP852') # Ó + check_both_ways("\u00B4", "\xEF", 'CP852') # ´ + check_both_ways("\u00AD", "\xF0", 'CP852') # soft hyphen + check_both_ways("\u00A0", "\xFF", 'CP852') # non-breaking space + end + + def test_CP855 + check_both_ways("\u0452", "\x80", 'CP855') # ђ + check_both_ways("\u0408", "\x8F", 'CP855') # Ј + check_both_ways("\u0459", "\x90", 'CP855') # љ + check_both_ways("\u042A", "\x9F", 'CP855') # Ъ + check_both_ways("\u0430", "\xA0", 'CP855') # а + check_both_ways("\u00BB", "\xAF", 'CP855') # » + check_both_ways("\u2591", "\xB0", 'CP855') # ░ + check_both_ways("\u2510", "\xBF", 'CP855') # ┐ + check_both_ways("\u2514", "\xC0", 'CP855') # └ + check_both_ways("\u00A4", "\xCF", 'CP855') # ¤ + check_both_ways("\u043B", "\xD0", 'CP855') # л + check_both_ways("\u2580", "\xDF", 'CP855') # ▀ + check_both_ways("\u042F", "\xE0", 'CP855') # Я + check_both_ways("\u2116", "\xEF", 'CP855') # № + check_both_ways("\u00AD", "\xF0", 'CP855') # soft hyphen + check_both_ways("\u00A0", "\xFF", 'CP855') # non-breaking space + end + def check_utf_16_both_ways(utf8, raw) copy = raw.dup 0.step(copy.length-1, 2) { |i| copy[i+1], copy[i] = copy[i], copy[i+1] } @@ -933,7 +1061,7 @@ class TestTranscode < Test::Unit::TestCase check_utf_32_both_ways("\u{8FF00}", "\x00\x08\xFF\x00") check_utf_32_both_ways("\u{F00FF}", "\x00\x0F\x00\xFF") end - + def test_invalid_ignore # arguments only assert_nothing_raised { 'abc'.encode('utf-8', invalid: :replace, replace: "") } @@ -1212,11 +1340,11 @@ class TestTranscode < Test::Unit::TestCase assert_equal("\e$B!!\e(B".force_encoding("ISO-2022-JP"), "\xA1\xA1".encode("ISO-2022-JP", "EUC-JP")) end - + def test_iso_2022_jp_1 # check_both_ways("\u9299", "\x1b$(Dd!\x1b(B", "iso-2022-jp-1") # JIS X 0212 区68 点01 銙 end - + def test_unicode_public_review_issue_121 # see http://www.unicode.org/review/pr-121.html # assert_equal("\x00\x61\xFF\xFD\x00\x62".force_encoding('UTF-16BE'), # "\x61\xF1\x80\x80\xE1\x80\xC2\x62".encode('UTF-16BE', 'UTF-8', invalid: :replace)) # option 1 |