/*
* Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
* Copyright (c) 1996-2009, The nkf Project.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source distribution.
*/
/* Release identification strings. */
#define NKF_RELEASE_DATE "2009-01-20"
#define NKF_VERSION "2.0.9"
/* Two-line copyright banner (lines separated by a single '\n', no trailing newline). */
#define COPY_RIGHT "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\nCopyright (C) 1996-2009, The nkf Project."
#include "config.h"
#include "nkf.h"
#include "utf8tbl.h"
#ifdef __WIN32__
#include <windows.h>
#include <locale.h>
#endif
#if defined(__OS2__)
# define INCL_DOS
# define INCL_DOSERRORS
# include <os2.h>
#endif
#include <assert.h>
/* State of output_mode and input_mode:
 *   c2  0 means ASCII
 *       JIS_X_0201_1976_K
 *       ISO_8859_1
 *       JIS_X_0208
 *       EOF  all termination
 *   c1  32bit data
 */
/* MIME ENCODE */
/* NOTE(review): presumably mode values for the MIME handling flag; the code
 * that consumes them is outside this part of the file -- confirm there. */
#define FIXED_MIME 7
#define STRICT_MIME 8
/* byte order */
enum byte_order {
ENDIAN_BIG = 1,
ENDIAN_LITTLE = 2,
ENDIAN_2143 = 3,
ENDIAN_3412 = 4
};
/* ASCII CODE: byte values referenced by name throughout the converter,
 * listed in ascending numeric order. */
#define BS   0x08
#define TAB  0x09
#define LF   0x0A
#define CR   0x0D
#define SO   0x0E
#define SI   0x0F
#define ESC  0x1B
#define SP   0x20
#define DEL  0x7F
#define SS2  0x8E
#define SS3  0x8F
/* CR in the high byte, LF in the low byte, packed into one value */
#define CRLF 0x0D0A
/* encodings */
/*
 * Identifiers for every encoding nkf knows by name.
 *
 * The enumerators from ASCII through BINARY take the implicit values
 * 0, 1, 2, ... and appear in the same order as the entries of
 * nkf_encoding_table, whose `id` field stores them.  Do not reorder or
 * insert entries here without updating that table accordingly.
 * NKF_ENCODING_TABLE_SIZE follows BINARY and therefore equals the number
 * of named encodings.
 *
 * The JIS_X_* enumerators after it carry explicit values (0x1013 and up);
 * the letter in the trailing comment on each line is kept from the
 * original source.
 */
enum nkf_encodings {
ASCII,
ISO_8859_1,
ISO_2022_JP,
CP50220,
CP50221,
CP50222,
ISO_2022_JP_1,
ISO_2022_JP_3,
ISO_2022_JP_2004,
SHIFT_JIS,
WINDOWS_31J,
CP10001,
EUC_JP,
EUCJP_NKF,
CP51932,
EUCJP_MS,
EUCJP_ASCII,
SHIFT_JISX0213,
SHIFT_JIS_2004,
EUC_JISX0213,
EUC_JIS_2004,
UTF_8,
UTF_8N,
UTF_8_BOM,
UTF8_MAC,
UTF_16,
UTF_16BE,
UTF_16BE_BOM,
UTF_16LE,
UTF_16LE_BOM,
UTF_32,
UTF_32BE,
UTF_32BE_BOM,
UTF_32LE,
UTF_32LE_BOM,
BINARY,
NKF_ENCODING_TABLE_SIZE, /* count of the named encodings above */
JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
/* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
/* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
/* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
JIS_X_0208 = 0x1168, /* @B */
JIS_X_0212 = 0x1159, /* D */
/* JIS_X_0213_2000_1 = 0x1228, */ /* O */
JIS_X_0213_2 = 0x1229, /* P */
JIS_X_0213_1 = 0x1233 /* Q */
};
/*
 * Forward declarations of the per-encoding converters.
 * Input converters (*_iconv) take three nkf_char arguments (c2, c1, c0) and
 * return an nkf_char; output converters (*_oconv) take two (c2, c1) and
 * return nothing.  They are needed here because the native-encoding
 * descriptors that follow store pointers to them.
 */
static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
static void j_oconv(nkf_char c2, nkf_char c1);
static void s_oconv(nkf_char c2, nkf_char c1);
static void e_oconv(nkf_char c2, nkf_char c1);
static void w_oconv(nkf_char c2, nkf_char c1);
static void w_oconv16(nkf_char c2, nkf_char c1);
static void w_oconv32(nkf_char c2, nkf_char c1);
/*
 * A "native" encoding: a name plus the pair of converter functions that
 * implement it.  Several named encodings in nkf_encoding_table share one
 * native encoding through their base_encoding pointer.
 */
typedef struct {
const char *name; /* display name of the native encoding */
nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0); /* input converter */
void (*oconv)(nkf_char c2, nkf_char c1); /* output converter */
} nkf_native_encoding;
/*
 * The native encodings.  Note that ASCII and ISO-2022-JP both use e_iconv
 * as their input converter; only their output converters differ.
 * These objects have external linkage.
 */
nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
/*
 * A named encoding: its enum nkf_encodings id, its canonical name, and the
 * native encoding whose converters handle it.
 */
typedef struct {
const int id; /* enum nkf_encodings value; -1 in the terminator */
const char *name; /* canonical name; NULL in the terminator */
const nkf_native_encoding *base_encoding; /* converters to use; NULL in the terminator */
} nkf_encoding;
/*
 * One entry per named encoding, in the same order as enum nkf_encodings
 * (ASCII ... BINARY), so entry i has id == i.  The list ends with a
 * {-1, NULL, NULL} sentinel.
 */
nkf_encoding nkf_encoding_table[] = {
{ASCII, "US-ASCII", &NkfEncodingASCII},
{ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
{ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
{CP50220, "CP50220", &NkfEncodingISO_2022_JP},
{CP50221, "CP50221", &NkfEncodingISO_2022_JP},
{CP50222, "CP50222", &NkfEncodingISO_2022_JP},
{ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
{ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
{ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
{SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
{WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
{CP10001, "CP10001", &NkfEncodingShift_JIS},
{EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
{EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
{CP51932, "CP51932", &NkfEncodingEUC_JP},
{EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
{EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
{SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
{SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
{EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
{EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
{UTF_8, "UTF-8", &NkfEncodingUTF_8},
{UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
{UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
{UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
{UTF_16, "UTF-16", &NkfEncodingUTF_16},
{UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
{UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
{UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
{UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
{UTF_32, "UTF-32", &NkfEncodingUTF_32},
{UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
{UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
{UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
{UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
{BINARY, "BINARY", &NkfEncodingASCII},
{-1, NULL, NULL} /* sentinel */
};
/*
 * Maps encoding names, including aliases, to enum nkf_encodings ids.
 * Several names may map to the same id (e.g. "SHIFT_JIS" and "SJIS").
 * Every name in this table is spelled in upper case.
 * NOTE(review): presumably the lookup code upper-cases or compares
 * case-insensitively before matching -- confirm at the call site.
 * The list ends with a {NULL, -1} sentinel.
 */
struct {
const char *name; /* name or alias; NULL in the terminator */
const int id; /* enum nkf_encodings value; -1 in the terminator */
} encoding_name_to_id_table[] = {
{"US-ASCII", ASCII},
{"ASCII", ASCII},
{"ISO-2022-JP", ISO_2022_JP},
{"ISO2022JP-CP932", CP50220},
{"CP50220", CP50220},
{"CP50221", CP50221},
{"CSISO2022JP", CP50221},
{"CP50222", CP50222},
{"ISO-2022-JP-1", ISO_2022_JP_1},
{"ISO-2022-JP-3", ISO_2022_JP_3},
{"ISO-2022-JP-2004", ISO_2022_JP_2004},
{"SHIFT_JIS", SHIFT_JIS},
{"SJIS", SHIFT_JIS},
{"WINDOWS-31J", WINDOWS_31J},
{"CSWINDOWS31J", WINDOWS_31J},
{"CP932", WINDOWS_31J},
{"MS932", WINDOWS_31J},
{"CP10001", CP10001},
{"EUCJP", EUC_JP},
{"EUC-JP", EUC_JP},
{"EUCJP-NKF", EUCJP_NKF},
{"CP51932", CP51932},
{"EUC-JP-MS", EUCJP_MS},
{"EUCJP-MS", EUCJP_MS},
{"EUCJPMS", EUCJP_MS},
{"EUC-JP-ASCII", EUCJP_ASCII},
{"EUCJP-ASCII", EUCJP_ASCII},
{"SHIFT_JISX0213", SHIFT_JISX0213},
{"SHIFT_JIS-2004", SHIFT_JIS_2004},
{"EUC-JISX0213", EUC_JISX0213},
{"EUC-JIS-2004", EUC_JIS_2004},
{"UTF-8", UTF_8},
{"UTF-8N", UTF_8N},
{"UTF-8-BOM", UTF_8_BOM},
{"UTF8-MAC", UTF8_MAC},
{"UTF-8-MAC", UTF8_MAC},
{"UTF-16", UTF_16},
{"UTF-16BE", UTF_16BE},
{"UTF-16BE-BOM", UTF_16BE_BOM},
{"UTF-16LE", UTF_16LE},
{"UTF-16LE-BOM", UTF_16LE_BOM},
{"UTF-32", UTF_32},
{"UTF-32BE", UTF_32BE},
{"UTF-32BE-BOM", UTF_32BE_BOM},
{"UTF-32LE", UTF_32LE},
{"UTF-32LE-BOM", UTF_32LE_BOM},
{"BINARY", BINARY},
{NULL, -1} /* sentinel */
};
/*
 * Build-time choice of the default encoding id.  The first DEFAULT_CODE_*
 * macro that is defined wins.  If none is defined, DEFAULT_ENCIDX is left
 * undefined, so code using it must be prepared for that.
 */
#if defined(DEFAULT_CODE_JIS)
#define DEFAULT_ENCIDX ISO_2022_JP
#elif defined(DEFAULT_CODE_SJIS)
#define DEFAULT_ENCIDX SHIFT_JIS
#elif defined(DEFAULT_CODE_WINDOWS_31J)
#define DEFAULT_ENCIDX WINDOWS_31J
#elif defined(DEFAULT_CODE_EUC)
#define DEFAULT_ENCIDX EUC_JP
#elif defined(DEFAULT_CODE_UTF8)
#define DEFAULT_ENCIDX UTF_8
#endif
/*
 * Locale-independent character classification and conversion macros.
 *
 * Every use of a macro parameter is parenthesized so that an expression
 * argument such as `x & 0x7f` or `a | b` is classified as a whole instead
 * of being split by operator precedence.
 *
 * The argument may be evaluated more than once: do not pass expressions
 * with side effects (e.g. `*p++`).
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))
/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of a hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hex digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the high byte of the low 16 bits of c2 equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, and the characters strictly between SP and DEL other than
 * '?', '=', '_', '(', ')', '.', and '"' (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
    && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
/* True when SP <= c < 0x60, i.e. below (0xE0 & 0x7F). */
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) < (0xE0&0x7F))
/* Buffer capacities. */
#define HOLD_SIZE 1024
#ifndef INT_IS_SHORT
#define IOBUF_SIZE 16384
#else
/* reduced I/O buffer for builds that define INT_IS_SHORT */
#define IOBUF_SIZE 2048
#endif
/* Defaults used to initialise kanji_intro (DEFAULT_J) and ascii_intro (DEFAULT_R). */
#define DEFAULT_J 'B'
#define DEFAULT_R 'B'
/* NOTE(review): presumably the two bytes of the GETA substitute character -- confirm at the use site. */
#define GETA1 0x22
#define GETA2 0x2e
/* MIME preprocessor */
/* Only for EASYWIN builds: _BufferSize is defined outside this file.
 * NOTE(review): the "MIME preprocessor" heading above does not describe this
 * declaration; it looks like a leftover section title -- confirm. */
#ifdef EASYWIN /*Easy Win */
extern POINT _BufferSize;
#endif
/*
 * Per-encoding record used for the candidate input encodings listed in
 * input_code_list.
 * NOTE(review): the hedged field notes below are inferred from names and
 * types only; confirm against the *_status functions.
 */
struct input_code{
const char *name; /* encoding name, e.g. "EUC-JP"; NULL in the list terminator */
nkf_char stat; /* presumably the state kept by status_func -- confirm */
nkf_char score; /* presumably a score used when guessing the encoding -- confirm */
nkf_char index; /* presumably the number of entries in use in buf -- confirm */
nkf_char buf[3]; /* three-element scratch buffer */
void (*status_func)(struct input_code *, nkf_char); /* callback receiving this record and one nkf_char */
nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0); /* input converter for this encoding */
int _file_stat;
};
/* Encoding selection state; all three start out unset (NULL). */
static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
static nkf_encoding *input_encoding = NULL;
static nkf_encoding *output_encoding = NULL;
#if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/* UCS Mapping
* 0: Shift_JIS, eucJP-ascii
* 1: eucJP-ms
* 2: CP932, CP51932
* 3: CP10001
*/
#define UCS_MAP_ASCII 0
#define UCS_MAP_MS 1
#define UCS_MAP_CP932 2
#define UCS_MAP_CP10001 3
/* Selected UCS mapping; one of the UCS_MAP_* values above. */
static int ms_ucs_map_f = UCS_MAP_ASCII;
#endif
/* State available only when UTF-8 input support is compiled in. */
#ifdef UTF8_INPUT_ENABLE
/* no NEC special, NEC-selected IBM extended and IBM extended characters */
static int no_cp932ext_f = FALSE;
/* ignore ZERO WIDTH NO-BREAK SPACE */
/* NOTE(review): the comment above does not obviously match the variable name
 * (no_best_fit_chars_f) -- confirm which behaviour this flag controls. */
static int no_best_fit_chars_f = FALSE;
static int input_endian = ENDIAN_BIG; /* an enum byte_order value */
static nkf_char unicode_subchar = '?'; /* the regular substitution character */
static void (*encode_fallback)(nkf_char c) = NULL; /* no fallback handler installed by default */
static void w_status(struct input_code *, nkf_char);
#endif
/* State available only when UTF-8 output support is compiled in. */
#ifdef UTF8_OUTPUT_ENABLE
static int output_bom_f = FALSE;
static int output_endian = ENDIAN_BIG; /* an enum byte_order value */
#endif
/*
 * Forward declarations of the character I/O primitives.  They are declared
 * here because the redirectable I/O function pointers in this file
 * (i_getc, o_putc, ...) are initialised with std_getc / std_ungetc / std_putc.
 */
static void std_putc(nkf_char c);
static nkf_char std_getc(FILE *f);
static nkf_char std_ungetc(nkf_char c,FILE *f);
static nkf_char broken_getc(FILE *f);
static nkf_char broken_ungetc(nkf_char c,FILE *f);
static nkf_char mime_getc(FILE *f);
static void mime_putc(nkf_char c);
/* buffers */
/* Static IOBUF_SIZE-byte buffers, omitted from PERL_XS and WIN32DLL builds.
 * NOTE(review): presumably handed to setvbuf() for stdin/stdout -- confirm at
 * the use site. */
#if !defined(PERL_XS) && !defined(WIN32DLL)
static unsigned char stdibuf[IOBUF_SIZE];
static unsigned char stdobuf[IOBUF_SIZE];
#endif
/* flags */
/* Option and mode flags.  Each starts at the default shown; the code that
 * changes them is outside this part of the file. */
static int unbuf_f = FALSE;
static int estab_f = FALSE;
static int nop_f = FALSE;
static int binmode_f = TRUE; /* binary mode */
static int rot_f = FALSE; /* rot14/43 mode */
static int hira_f = FALSE; /* hira/kata henkan (hiragana/katakana conversion) */
static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
static int mime_decode_f = FALSE; /* mime decode is explicitly on */
static int mimebuf_f = FALSE; /* MIME buffered input */
static int broken_f = FALSE; /* convert ESC-less broken JIS */
static int iso8859_f = FALSE; /* ISO8859 through */
static int mimeout_f = FALSE; /* base64 mode */
static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
/* Each optional input stage below has its own flag and its own getc/ungetc
 * function pointers, all defaulting to std_getc / std_ungetc. */
#ifdef UNICODE_NORMALIZATION
static int nfc_f = FALSE;
static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
#endif
#ifdef INPUT_OPTION
static int cap_f = FALSE;
static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
static int url_f = FALSE;
static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
#endif
/*
 * nkf_char tagging helpers.
 *
 * The top eight bits of an nkf_char (CLASS_MASK) hold a class tag and the
 * low 24 bits (VALUE_MASK) hold the value.  CLASS_UNICODE marks a value as
 * a Unicode code point; PREFIX_EUCG3 (0x8F00) is OR-ed in by
 * nkf_char_euc3_new().
 *
 * Macro parameters are parenthesized so that an expression argument such as
 * `tag | value` is masked as a whole; without this, `&` binds tighter than
 * `|` and only part of the argument would be tested.
 */
#define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
#define CLASS_MASK NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE NKF_INT32_C(0x01000000)
#define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
/* Constructors: OR the prefix / class tag into c. */
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/* True when the class bits of c equal CLASS_UNICODE. */
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the value bits of c are <= U+FFFF (class bits are not checked). */
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* True when the value bits of c are <= U+10FFFF (class bits are not checked). */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
/* Optional numeric-character input stage: flag plus its getc/ungetc hooks,
 * both defaulting to std_getc / std_ungetc. */
#ifdef NUMCHAR_OPTION
static int numchar_f = FALSE;
static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
#endif
/* Checking/debugging support, compiled in only with CHECK_OPTION. */
#ifdef CHECK_OPTION
static int noout_f = FALSE;
static void no_putc(nkf_char c);
static int debug_f = FALSE;
static void debug(const char *str);
static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0; /* starts as a null pointer */
#endif
static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
static void set_input_codename(const char *codename);
#ifdef EXEC_IO
static int exec_f = 0;
#endif
#ifdef SHIFTJIS_CP932
/* invert IBM extended characters to others */
static int cp51932_f = FALSE;
/* invert NEC-selected IBM extended characters to IBM extended characters */
static int cp932inv_f = TRUE;
/* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
#endif /* SHIFTJIS_CP932 */
/* NOTE(review): presumably enable JIS X 0212 / JIS X 0213 handling -- confirm
 * at the use sites. */
static int x0212_f = FALSE;
static int x0213_f = FALSE;
/* One entry per possible byte value; zero-initialised as a static object. */
static unsigned char prefix_table[256];
static void e_status(struct input_code *, nkf_char);
static void s_status(struct input_code *, nkf_char);
/*
 * Candidate input encodings, each pairing a status function with the
 * matching input converter.  The UTF-8 entry exists only when
 * UTF8_INPUT_ENABLE is defined.  The list ends with an all-zero entry
 * (name == NULL).
 */
struct input_code input_code_list[] = {
{"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
{"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
#ifdef UTF8_INPUT_ENABLE
{"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
#endif
{0} /* terminator */
};
/* MIME output state. */
static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
static int base64_count = 0;
/* X0208 -> ASCII converter */
/* fold parameter */
static int f_line = 0; /* chars in line */
static int f_prev = 0;
static int fold_preserve_f = FALSE; /* preserve new lines */
static int fold_f = FALSE;
static int fold_len = 0;
/* options */
/* Initialised from DEFAULT_J and DEFAULT_R respectively. */
static unsigned char kanji_intro = DEFAULT_J;
static unsigned char ascii_intro = DEFAULT_R;
/* Folding */
#define FOLD_MARGIN 10
#define DEFAULT_FOLD 60
static int fold_margin = FOLD_MARGIN; /* starts at FOLD_MARGIN */
/* process default */
/*
 * Placeholder input converter installed before a real one is connected.
 * Reaching it is an internal error: it reports the failure on stderr and
 * terminates the process with EXIT_FAILURE.  The arguments are ignored and
 * the function never returns to its caller.
 */
static nkf_char
no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
{
    fputs("nkf internal module connection failure.\n", stderr);
    exit(EXIT_FAILURE);
    return 0; /* not reached; kept to satisfy lint */
}
/*
 * Placeholder output converter: forwards to no_connection2() with c0 = 0,
 * which prints an error and exits.
 */
static void
no_connection(nkf_char c2, nkf_char c1)
{
    (void)no_connection2(c2, c1, 0);
}
/*
 * Converter hooks.  Every hook starts out pointing at no_connection /
 * no_connection2, so calling one before it has been reassigned prints an
 * error and exits.
 */
static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
/* static redirections */
/* Character-level I/O hooks; all default to the std_* primitives. */
static void (*o_putc)(nkf_char c) = std_putc;
static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
|