summary refs log tree commit diff stats
path: root/src/storage/pinyin_parser2.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/storage/pinyin_parser2.cpp')
-rw-r--r-- src/storage/pinyin_parser2.cpp 1329
1 files changed, 0 insertions, 1329 deletions
diff --git a/src/storage/pinyin_parser2.cpp b/src/storage/pinyin_parser2.cpp
deleted file mode 100644
index 676f138..0000000
--- a/src/storage/pinyin_parser2.cpp
+++ /dev/null
@@ -1,1329 +0,0 @@
-/*
- * libzhuyin
- * Library to deal with zhuyin.
- *
- * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- */
-
-
-#include "pinyin_parser2.h"
-#include <ctype.h>
-#include <assert.h>
-#include <stdio.h>
-#include <string.h>
-#include "stl_lite.h"
-#include "pinyin_phrase2.h"
-#include "zhuyin_custom2.h"
-#include "chewing_key.h"
-#include "pinyin_parser_table.h"
-#include "chewing_table.h"
-
-
-using namespace zhuyin;
-
-
-/* compile-time switch used by FullPinyinParser2::parse(): when 0, the
- * parser stops scanning at the first "'" instead of stepping over it. */
-#define FULL_PINYIN_SUPPORT_QUOTATION 0
-
-
-/* Decide whether a pinyin index entry may be used under the given options.
- * Returns false only for an entry flagged PINYIN_INCOMPLETE when the
- * caller did not enable PINYIN_INCOMPLETE; true otherwise. */
-static bool check_pinyin_options(pinyin_option_t options, const pinyin_index_item_t * item) {
-    const guint32 item_flags = item->m_flags;
-    assert (item_flags & IS_PINYIN);
-
-    /* an incomplete pinyin is rejected unless the caller enabled it. */
-    if ((item_flags & PINYIN_INCOMPLETE) && !(options & PINYIN_INCOMPLETE))
-        return false;
-
-#if 0
-    /* disabled: correction check, currently only one flag per item. */
-    const guint32 wanted = item_flags & PINYIN_CORRECT_ALL;
-    if (wanted && (wanted & (options & PINYIN_CORRECT_ALL)) != wanted)
-        return false;
-#endif
-
-    return true;
-}
-
-/* Decide whether a chewing index entry may be used under the given options.
- * An entry is rejected when it is flagged CHEWING_INCOMPLETE and the
- * caller did not enable that, or when it carries ZHUYIN_CORRECT_ALL bits
- * that are not all present in options. */
-static bool check_chewing_options(pinyin_option_t options, const chewing_index_item_t * item) {
-    const guint32 item_flags = item->m_flags;
-    assert (item_flags & IS_BOPOMOFO);
-
-    /* incomplete chewing needs an explicit opt-in. */
-    if ((item_flags & CHEWING_INCOMPLETE) && !(options & CHEWING_INCOMPLETE))
-        return false;
-
-    /* correction flags, currently only one flag per item. */
-    const guint32 required = item_flags & ZHUYIN_CORRECT_ALL;
-    const pinyin_option_t enabled = options & ZHUYIN_CORRECT_ALL;
-
-    if (0 == required)
-        return true;
-
-    return (required & enabled) == required;
-}
-
-
-/* Look up this key's (initial, middle, final) triple in chewing_key_table.
- * A stored value of -1 is reported as index 0. */
-gint _ChewingKey::get_table_index() {
-    assert(m_initial < CHEWING_NUMBER_OF_INITIALS);
-    assert(m_middle < CHEWING_NUMBER_OF_MIDDLES);
-    assert(m_final < CHEWING_NUMBER_OF_FINALS);
-
-    /* the table is laid out as [initial][middle][final], flattened. */
-    const gint row = m_initial * CHEWING_NUMBER_OF_MIDDLES + m_middle;
-    const gint slot = chewing_key_table[row * CHEWING_NUMBER_OF_FINALS + m_final];
-
-    if (-1 == slot)
-        return 0;
-    return slot;
-}
-
-/* Build the spelling of this key in the requested full pinyin scheme.
- * The tone number is appended unless the tone is CHEWING_ZERO_TONE.
- * The result is allocated by g_strdup()/g_strdup_printf(). */
-gchar * _ChewingKey::get_pinyin_string(ZhuyinScheme scheme) {
-    assert(m_tone < CHEWING_NUMBER_OF_TONES);
-    const gint table_index = get_table_index();
-    assert(table_index < (int) G_N_ELEMENTS(content_table));
-    const content_table_item_t & entry = content_table[table_index];
-
-    /* pick the column of the content table that matches the scheme. */
-    const char * spelling = NULL;
-    switch(scheme) {
-    case FULL_PINYIN_HANYU:
-        spelling = entry.m_hanyu_pinyin;
-        break;
-    case FULL_PINYIN_LUOMA:
-        spelling = entry.m_luoma_pinyin;
-        break;
-    case FULL_PINYIN_SECONDARY_BOPOMOFO:
-        spelling = entry.m_secondary_bopomofo;
-        break;
-    default:
-        assert(false);
-    }
-
-    if (CHEWING_ZERO_TONE != m_tone)
-        return g_strdup_printf("%s%d", spelling, m_tone);
-
-    return g_strdup(spelling);
-}
-
-/* Build the bopomofo spelling of this key.
- * The tone symbol from chewing_tone_table is appended, except for the
- * zero tone and the first tone, which are not displayed.
- * The result is allocated by g_strdup()/g_strdup_printf(). */
-gchar * _ChewingKey::get_bopomofo_string() {
-    assert(m_tone < CHEWING_NUMBER_OF_TONES);
-    const gint table_index = get_table_index();
-    assert(table_index < (int) G_N_ELEMENTS(content_table));
-    const content_table_item_t & entry = content_table[table_index];
-
-    /* for first tone, usually not display it; zero tone has no symbol. */
-    const bool show_tone =
-        (CHEWING_ZERO_TONE != m_tone) && (CHEWING_1 != m_tone);
-
-    if (!show_tone)
-        return g_strdup(entry.m_bopomofo);
-
-    return g_strdup_printf("%s%s", entry.m_bopomofo,
-                           chewing_tone_table[m_tone]);
-}
-
-/* Pinyin Parsers */
-
-/* one dynamic-programming cell used internally by the pinyin parsers. */
-struct parse_value_t{
-    ChewingKey m_key;           /* key that ends at this cell. */
-    ChewingKeyRest m_key_rest;  /* raw input span of that key. */
-    gint16 m_num_keys;          /* number of keys on the path to this cell. */
-    gint16 m_parsed_len;        /* input length covered by that path. */
-    gint16 m_last_step;         /* index of the previous cell, -1 if none. */
-
-public:
-    /* an empty cell: nothing parsed and no predecessor. */
-    parse_value_t()
-        : m_num_keys(0), m_parsed_len(0), m_last_step(-1) {
-    }
-};
-
-/* longest input tried for one full pinyin key, include tone. */
-const guint16 max_full_pinyin_length = 7;
-
-/* longest input for one double pinyin key, include tone. */
-const guint16 max_double_pinyin_length = 3;
-
-/* longest input tried for one chewing key, include tone. */
-const guint16 max_chewing_length = 4;
-
-/* longest input tried for one DaChen CP26 chewing key, include tone. */
-const guint16 max_chewing_dachen26_length = 12;
-
-/* buffer size (without the terminator) used for one UTF-8 character. */
-const guint16 max_utf8_length = 6;
-
-/* strict weak ordering of pinyin index items by their input string. */
-static bool compare_pinyin_less_than(const pinyin_index_item_t & lhs,
-                                     const pinyin_index_item_t & rhs){
-    const int order = strcmp(lhs.m_pinyin_input, rhs.m_pinyin_input);
-    return order < 0;
-}
-
-/* Find pinyin in the pinyin_index table of len entries.
- * The table is searched with equal_range(), ordered by
- * compare_pinyin_less_than. On a hit that passes check_pinyin_options(),
- * key receives the chewing key of the entry and true is returned;
- * otherwise key is left untouched and false is returned. */
-static inline bool search_pinyin_index(pinyin_option_t options,
-                                       const pinyin_index_item_t * pinyin_index,
-                                       size_t len,
-                                       const char * pinyin,
-                                       ChewingKey & key){
-    /* probe item that carries only the string being looked up. */
-    pinyin_index_item_t probe;
-    memset(&probe, 0, sizeof(probe));
-    probe.m_pinyin_input = pinyin;
-
-    const pinyin_index_item_t * const table_end = pinyin_index + len;
-    std_lite::pair<const pinyin_index_item_t *,
-                   const pinyin_index_item_t *> matches =
-        std_lite::equal_range(pinyin_index, table_end,
-                              probe, compare_pinyin_less_than);
-
-    const guint16 num_matches = matches.second - matches.first;
-    assert(num_matches <= 1);
-    if (1 != num_matches)
-        return false;
-
-    const pinyin_index_item_t * hit = matches.first;
-    if (!check_pinyin_options(options, hit))
-        return false;
-
-    key = content_table[hit->m_table_index].m_chewing_key;
-    assert(key.get_table_index() == hit->m_table_index);
-    return true;
-}
-
-/* strict weak ordering of chewing index items by their input string. */
-static bool compare_chewing_less_than(const chewing_index_item_t & lhs,
-                                      const chewing_index_item_t & rhs){
-    const int order = strcmp(lhs.m_chewing_input, rhs.m_chewing_input);
-    return order < 0;
-}
-
-/* Find chewing in the chewing_index table of len entries.
- * The table is searched with equal_range(), ordered by
- * compare_chewing_less_than. On a hit that passes check_chewing_options(),
- * key receives the chewing key of the entry and true is returned;
- * otherwise key is left untouched and false is returned. */
-static inline bool search_chewing_index(pinyin_option_t options,
-                                        const chewing_index_item_t * chewing_index,
-                                        size_t len,
-                                        const char * chewing,
-                                        ChewingKey & key){
-    /* probe item that carries only the string being looked up. */
-    chewing_index_item_t probe;
-    memset(&probe, 0, sizeof(probe));
-    probe.m_chewing_input = chewing;
-
-    const chewing_index_item_t * const table_end = chewing_index + len;
-    std_lite::pair<const chewing_index_item_t *,
-                   const chewing_index_item_t *> matches =
-        std_lite::equal_range(chewing_index, table_end,
-                              probe, compare_chewing_less_than);
-
-    const guint16 num_matches = matches.second - matches.first;
-    assert (num_matches <= 1);
-    if (1 != num_matches)
-        return false;
-
-    const chewing_index_item_t * hit = matches.first;
-    if (!check_chewing_options(options, hit))
-        return false;
-
-    key = content_table[hit->m_table_index].m_chewing_key;
-    assert(key.get_table_index() == hit->m_table_index);
-    return true;
-}
-
-/* Full Pinyin Parser */
-FullPinyinParser2::FullPinyinParser2 (){
-    /* start without an index table; set_scheme() below installs one. */
-    m_pinyin_index_len = 0;
-    m_pinyin_index = NULL;
-
-    /* zero-terminated, not cleared array of parse_value_t cells. */
-    m_parse_steps = g_array_new(TRUE, FALSE, sizeof(parse_value_t));
-
-    set_scheme(FULL_PINYIN_DEFAULT);
-}
-
-/* Parse exactly one full pinyin syllable, optionally followed by a tone.
- *
- * options: USE_TONE lets a trailing digit '1'..'5' be read as the tone;
- *          FORCE_TONE (together with USE_TONE) rejects input without one.
- * key:     reset to ChewingKey() first, then filled on success.
- * pinyin:  input, only the first len bytes are used; must not contain "'".
- * Returns true when the whole input was consumed as one key.
- * An empty (or negative-length) input is rejected up front. */
-bool FullPinyinParser2::parse_one_key (pinyin_option_t options,
-                                       ChewingKey & key,
-                                       const char * pinyin, int len) const {
-    key = ChewingKey();
-
-    /* an empty input holds no syllable; returning here also keeps the
-     * input[parsed_len - 1] read below inside the buffer. */
-    if (len <= 0)
-        return false;
-
-    /* "'" are not accepted in parse_one_key. */
-    gchar * input = g_strndup(pinyin, len);
-    assert(NULL == strchr(input, '\''));
-
-    guint16 tone = CHEWING_ZERO_TONE; guint16 tone_pos = 0;
-    guint16 parsed_len = len;
-
-    if (options & USE_TONE) {
-        /* find the tone in the last character. */
-        char chr = input[parsed_len - 1];
-        if ( '0' < chr && chr <= '5' ) {
-            tone = chr - '0';
-            parsed_len --;
-            tone_pos = parsed_len;
-        }
-
-        /* check the force tone option. */
-        if (options & FORCE_TONE && CHEWING_ZERO_TONE == tone) {
-            g_free(input);
-            return false;
-        }
-    }
-
-    /* parse pinyin core stuff here. */
-
-    /* Note: optimize here? */
-    /* cut the tone digit (if any) off before the table lookup. */
-    input[parsed_len] = '\0';
-    if (!search_pinyin_index(options, m_pinyin_index, m_pinyin_index_len,
-                             input, key)) {
-        g_free(input);
-        return false;
-    }
-
-    if (options & USE_TONE) {
-        /* post processing tone: count the tone digit as parsed again. */
-        if ( parsed_len == tone_pos ) {
-            if (tone != CHEWING_ZERO_TONE) {
-                key.m_tone = tone;
-                parsed_len ++;
-            }
-        }
-    }
-
-    g_free(input);
-    return parsed_len == len;
-}
-
-
-/* Split a full pinyin string into keys with dynamic programming.
- *
- * keys and key_rests are cleared first and later filled by final_step();
- * each key_rest records the [m_raw_begin, m_raw_end) span of its key.
- * Only the first len bytes of str are examined.
- * Returns the parsed length computed by final_step(). */
-int FullPinyinParser2::parse (pinyin_option_t options, ChewingKeyVector & keys,
- ChewingKeyRestVector & key_rests,
- const char *str, int len) const {
- int i;
- /* clear arrays. */
- g_array_set_size(keys, 0);
- g_array_set_size(key_rests, 0);
-
- /* init m_parse_steps, and prepare dynamic programming. */
- /* one cell per input position plus one for the end of input. */
- int step_len = len + 1;
- g_array_set_size(m_parse_steps, 0);
- parse_value_t value;
- for (i = 0; i < step_len; ++i) {
- g_array_append_val(m_parse_steps, value);
- }
-
- /* 0 means the position of the next "'" has not been looked up yet. */
- size_t next_sep = 0;
- gchar * input = g_strndup(str, len);
- parse_value_t * curstep = NULL, * nextstep = NULL;
-
- for (i = 0; i < len; ++i) {
-
-#if FULL_PINYIN_SUPPORT_QUOTATION
- if (input[i] == '\'') {
- curstep = &g_array_index(m_parse_steps, parse_value_t, i);
- nextstep = &g_array_index(m_parse_steps, parse_value_t, i + 1);
-
- /* propagate current step into next step. */
- nextstep->m_key = ChewingKey();
- nextstep->m_key_rest = ChewingKeyRest();
- nextstep->m_num_keys = curstep->m_num_keys;
- nextstep->m_parsed_len = curstep->m_parsed_len + 1;
- nextstep->m_last_step = i;
- next_sep = 0;
- continue;
- }
-#else
- /* quotation support is compiled out: stop at the first "'". */
- if (input[i] == '\'') {
- break;
- }
-#endif
-
- /* forward to next "'" */
- if ( 0 == next_sep ) {
- int k;
- for (k = i; k < len; ++k) {
- if (input[k] == '\'')
- break;
- }
- next_sep = k;
- }
-
- /* dynamic programming here. */
- /* for (size_t m = i; m < next_sep; ++m) */
- {
- size_t m = i;
- curstep = &g_array_index(m_parse_steps, parse_value_t, m);
- /* candidates start at m and end no later than the separator. */
- size_t try_len = std_lite::min
- (m + max_full_pinyin_length, next_sep);
- for (size_t n = m + 1; n < try_len + 1; ++n) {
- nextstep = &g_array_index(m_parse_steps, parse_value_t, n);
-
- /* gen next step */
- const char * onepinyin = input + m;
- gint16 onepinyinlen = n - m;
- value = parse_value_t();
-
- ChewingKey key; ChewingKeyRest rest;
- bool parsed = parse_one_key
- (options, key, onepinyin, onepinyinlen);
- rest.m_raw_begin = m; rest.m_raw_end = n;
- if (!parsed)
- continue;
-
- //printf("onepinyin:%s len:%d\n", onepinyin, onepinyinlen);
-
- /* candidate cell: the path to m extended by this key. */
- value.m_key = key; value.m_key_rest = rest;
- value.m_num_keys = curstep->m_num_keys + 1;
- value.m_parsed_len = curstep->m_parsed_len + onepinyinlen;
- value.m_last_step = m;
-
- /* save next step */
- /* the three rules below are checked one after another. */
- /* no previous result */
- if (-1 == nextstep->m_last_step)
- *nextstep = value;
- /* prefer the longest pinyin */
- if (value.m_parsed_len > nextstep->m_parsed_len)
- *nextstep = value;
- /* prefer the shortest keys with the same pinyin length */
- if (value.m_parsed_len == nextstep->m_parsed_len &&
- value.m_num_keys < nextstep->m_num_keys)
- *nextstep = value;
-
- }
- }
- }
-
- /* final step for back tracing. */
- gint16 parsed_len = final_step(step_len, keys, key_rests);
-
- g_free(input);
- return parsed_len;
-}
-
-/* Back-trace m_parse_steps and store the chosen keys.
- *
- * step_len:  number of cells in m_parse_steps to consider.
- * keys, key_rests: resized to the number of keys on the chosen path and
- *            filled from the last key back to the first.
- * Returns the m_parsed_len of the cell where the back-trace starts. */
-int FullPinyinParser2::final_step(size_t step_len, ChewingKeyVector & keys,
- ChewingKeyRestVector & key_rests) const{
- int i;
- gint16 parsed_len = 0;
- parse_value_t * curstep = NULL;
-
- /* find longest match, which starts from the beginning of input. */
- /* a cell qualifies when its parsed length equals its own index; a
- * default-constructed cell 0 has m_parsed_len 0 and so qualifies. */
- for (i = step_len - 1; i >= 0; --i) {
- curstep = &g_array_index(m_parse_steps, parse_value_t, i);
- if (i == curstep->m_parsed_len)
- break;
- }
- /* prepare saving. */
- parsed_len = curstep->m_parsed_len;
- gint16 num_keys = curstep->m_num_keys;
- g_array_set_size(keys, num_keys);
- g_array_set_size(key_rests, num_keys);
-
- /* save the match. */
- /* follow m_last_step links until a cell without predecessor (-1). */
- while (curstep->m_last_step != -1) {
- /* slot of this cell's key in the output arrays. */
- gint16 pos = curstep->m_num_keys - 1;
-
- /* skip "'" */
- if (0 != curstep->m_key.get_table_index()) {
- ChewingKey * key = &g_array_index(keys, ChewingKey, pos);
- ChewingKeyRest * rest = &g_array_index
- (key_rests, ChewingKeyRest, pos);
- *key = curstep->m_key; *rest = curstep->m_key_rest;
- }
-
- /* back ward */
- curstep = &g_array_index(m_parse_steps, parse_value_t,
- curstep->m_last_step);
- }
- return parsed_len;
-}
-
-/* Select the index table that matches the full pinyin scheme.
- * An unknown scheme trips the assertion; the function returns true. */
-bool FullPinyinParser2::set_scheme(ZhuyinScheme scheme){
-    if (FULL_PINYIN_HANYU == scheme) {
-        m_pinyin_index = hanyu_pinyin_index;
-        m_pinyin_index_len = G_N_ELEMENTS(hanyu_pinyin_index);
-    } else if (FULL_PINYIN_LUOMA == scheme) {
-        m_pinyin_index = luoma_pinyin_index;
-        m_pinyin_index_len = G_N_ELEMENTS(luoma_pinyin_index);
-    } else if (FULL_PINYIN_SECONDARY_BOPOMOFO == scheme) {
-        m_pinyin_index = secondary_bopomofo_index;
-        m_pinyin_index_len = G_N_ELEMENTS(secondary_bopomofo_index);
-    } else {
-        assert(false);
-    }
-
-    return true;
-}
-
-/* The block below is compiled out by "#if 0". It maps 'a'..'z' and "'"
- * to the matching one-character string in pinyin_symbols. */
-#if 0
-
-/* one string per lowercase letter, followed by the "'" separator. */
-static const char * pinyin_symbols[27] = {
- "a", "b", "c", "d", "e", "f", "g",
- "h", "i", "j", "k", "l", "m", "n",
- "o", "p", "q", "r", "s", "t",
- "u", "v", "w", "x", "y", "z",
- "'"
-};
-
-/* Returns true and sets *symbol when key is a lowercase letter or "'". */
-bool FullPinyinParser2::in_chewing_scheme(pinyin_option_t options,
- const char key,
- const char ** symbol) const {
- int id;
- if ('a' <= key && key <= 'z') {
- id = key - 'a';
- *symbol = pinyin_symbols[id];
- return true;
- }
-
- if ('\'' == key) {
- /* last slot of pinyin_symbols holds the separator. */
- id = 26;
- *symbol = pinyin_symbols[id];
- return true;
- }
-
- return false;
-}
-
-#endif
-
-/* Map one keyboard key to its chewing symbol.
- * symbol_table is scanned until the entry whose m_input is '\0'.
- * On a hit *chewing points at the m_chewing string stored in the table
- * (it is not a copy, so the caller must not free it) and true is
- * returned; otherwise *chewing is "" and false is returned. */
-static bool search_chewing_symbols(const chewing_symbol_item_t * symbol_table,
-                                   const char key, const char ** chewing) {
-    *chewing = "";
-
-    /* just iterate the table, as we only have < 50 items. */
-    for (const chewing_symbol_item_t * entry = symbol_table;
-         entry->m_input != '\0'; ++entry) {
-        if (key != entry->m_input)
-            continue;
-
-        *chewing = entry->m_chewing;
-        return true;
-    }
-
-    return false;
-}
-
-/* Map one keyboard key to its tone.
- * tone_table is scanned until the entry whose m_input is '\0'.
- * On a hit *tone is the entry's m_tone and true is returned; otherwise
- * *tone is CHEWING_ZERO_TONE and false is returned. */
-static bool search_chewing_tones(const chewing_tone_item_t * tone_table,
-                                 const char key, unsigned char * tone) {
-    *tone = CHEWING_ZERO_TONE;
-
-    /* just iterate the table, as we only have < 10 items. */
-    for (const chewing_tone_item_t * entry = tone_table;
-         entry->m_input != '\0'; ++entry) {
-        if (key != entry->m_input)
-            continue;
-
-        *tone = entry->m_tone;
-        return true;
-    }
-
-    return false;
-}
-
-/* Collect up to two chewing symbols that one keyboard key maps to.
- * symbol_table is scanned until the '\0' entry, or until an entry whose
- * m_input is greater than key.
- * *first and *second are reset to NULL and then set to the m_chewing
- * strings of the matching entries. Returns the number of matches. */
-static int search_chewing_symbols2(const chewing_symbol_item_t * symbol_table,
-                                   const char key,
-                                   const char ** first,
-                                   const char ** second) {
-    int matches = 0;
-    *first = NULL;
-    *second = NULL;
-
-    /* just iterate the table, as we only have < 50 items. */
-    for (const chewing_symbol_item_t * entry = symbol_table;
-         entry->m_input != '\0'; ++entry) {
-        if (entry->m_input == key) {
-            ++matches;
-            if (NULL == *first)
-                *first = entry->m_chewing;
-            else
-                *second = entry->m_chewing;
-        }
-
-        /* search done */
-        if (entry->m_input > key)
-            break;
-    }
-
-    assert(0 <= matches && matches <= 2);
-    return matches;
-}
-
-#if 1
-/* Parse exactly one chewing key typed on a simple keyboard layout.
- *
- * options: ZHUYIN_AMB_ALL bits are ignored; USE_TONE lets the last key be
- *          read as a tone key; FORCE_TONE (with USE_TONE) rejects input
- *          without a tone.
- * key:     written only on success.
- * str:     keyboard input, only the first len bytes are used.
- * Returns true when all of the input forms one entry of bopomofo_index.
- * An empty (or negative-length) input is rejected up front. */
-bool ChewingSimpleParser2::parse_one_key(pinyin_option_t options,
-                                         ChewingKey & key,
-                                         const char * str, int len) const {
-    /* nothing to parse; also keeps the str[len - 1] read below in bounds. */
-    if (len <= 0)
-        return false;
-
-    options &= ~ZHUYIN_AMB_ALL;
-    unsigned char tone = CHEWING_ZERO_TONE;
-
-    int symbols_len = len;
-    /* probe whether the last key is tone key in str. */
-    if (options & USE_TONE) {
-        char ch = str[len - 1];
-        /* remove tone from input */
-        if (search_chewing_tones(m_tone_table, ch, &tone))
-            symbols_len --;
-
-        /* check the force tone option */
-        if (options & FORCE_TONE && CHEWING_ZERO_TONE == tone)
-            return false;
-    }
-
-    int i;
-    gchar * chewing = NULL; const char * onechar = NULL;
-
-    /* probe the possible chewing map in the rest of str. */
-    for (i = 0; i < symbols_len; ++i) {
-        if (!search_chewing_symbols(m_symbol_table, str[i], &onechar)) {
-            g_free(chewing);
-            return false;
-        }
-
-        /* append the symbol; onechar points into the table, not freed. */
-        if (!chewing) {
-            chewing = g_strdup(onechar);
-        } else {
-            gchar * tmp = chewing;
-            chewing = g_strconcat(chewing, onechar, NULL);
-            g_free(tmp);
-        }
-    }
-
-    /* search the chewing in the chewing index table. */
-    if (chewing && search_chewing_index(options, bopomofo_index,
-                                        G_N_ELEMENTS(bopomofo_index),
-                                        chewing, key)) {
-        /* save back tone if available. */
-        key.m_tone = tone;
-        g_free(chewing);
-        return true;
-    }
-
-    g_free(chewing);
-    return false;
-}
-
-#endif
-
-/* Parse keyboard input into chewing keys by maximum forward match.
- * Only characters in chewing keyboard scheme are accepted here: parsing
- * is limited to the leading run of such characters.
- * keys and key_rests are cleared and then filled in input order.
- * Returns the number of input bytes that were parsed. */
-int ChewingSimpleParser2::parse(pinyin_option_t options,
-                                ChewingKeyVector & keys,
-                                ChewingKeyRestVector & key_rests,
-                                const char *str, int len) const {
-    /* add keyboard mapping specific options. */
-    options |= m_options;
-
-    g_array_set_size(keys, 0);
-    g_array_set_size(key_rests, 0);
-
-    /* probe the longest possible chewing string. */
-    int usable_len = 0;
-    while (usable_len < len) {
-        gchar ** symbols = NULL;
-        const bool known = in_chewing_scheme(options, str[usable_len], symbols);
-        g_strfreev(symbols);
-        if (!known)
-            break;
-        ++usable_len;
-    }
-
-    /* maximum forward match for chewing. */
-    int consumed = 0;
-    while (consumed < usable_len) {
-        const char * start = str + consumed;
-        ChewingKey key; ChewingKeyRest key_rest;
-
-        /* try the longest candidate first and shrink until one parses. */
-        int take = std_lite::min(usable_len - consumed,
-                                 (int)max_chewing_length);
-        while (take > 0 && !parse_one_key(options, key, start, take))
-            --take;
-
-        if (0 == take) /* no more possible chewings. */
-            break;
-
-        key_rest.m_raw_begin = consumed;
-        key_rest.m_raw_end = consumed + take;
-        consumed += take;
-
-        /* save the pinyin. */
-        g_array_append_val(keys, key);
-        g_array_append_val(key_rests, key_rest);
-    }
-
-    return consumed;
-}
-
-
-/* Select the symbol and tone tables of a simple chewing keyboard layout.
- * m_options is set to SHUFFLE_CORRECT in every case.
- * Returns true for a supported scheme. An unsupported scheme trips the
- * assertion and, when assertions are disabled, returns false. */
-bool ChewingSimpleParser2::set_scheme(ZhuyinScheme scheme) {
-    m_options = SHUFFLE_CORRECT;
-
-    switch(scheme) {
-    case CHEWING_STANDARD:
-        m_symbol_table = chewing_standard_symbols;
-        m_tone_table = chewing_standard_tones;
-        return true;
-    case CHEWING_IBM:
-        m_symbol_table = chewing_ibm_symbols;
-        m_tone_table = chewing_ibm_tones;
-        return true;
-    case CHEWING_GINYIEH:
-        m_symbol_table = chewing_ginyieh_symbols;
-        m_tone_table = chewing_ginyieh_tones;
-        return true;
-    case CHEWING_ETEN:
-        m_symbol_table = chewing_eten_symbols;
-        m_tone_table = chewing_eten_tones;
-        return true;
-    case CHEWING_STANDARD_DVORAK:
-        m_symbol_table = chewing_standard_dvorak_symbols;
-        m_tone_table = chewing_standard_dvorak_tones;
-        /* report success like the other layouts instead of falling
-         * through into the "unsupported scheme" branch. */
-        return true;
-    default:
-        assert(FALSE);
-    }
-
-    return false;
-}
-
-/* Tell whether a keyboard key belongs to the current chewing layout.
- *
- * key:     the keyboard character to look up.
- * symbols: set to NULL on failure; on success set to a NULL-terminated
- *          vector holding one copied string (the chewing symbol, or the
- *          tone symbol when USE_TONE is set and key is a tone key).
- *          The vector must be freed by g_strfreev.
- * Returns true when key maps to a symbol or tone.
- * The result vector is only allocated once a match is known, so nothing
- * is left allocated when the key is not part of the layout. */
-bool ChewingSimpleParser2::in_chewing_scheme(pinyin_option_t options,
-                                             const char key,
-                                             gchar ** & symbols) const {
-    symbols = NULL;
-
-    const gchar * chewing = NULL;
-    unsigned char tone = CHEWING_ZERO_TONE;
-
-    const gchar * match = NULL;
-    bool found = false;
-
-    if (search_chewing_symbols(m_symbol_table, key, &chewing)) {
-        match = chewing;
-        found = true;
-    } else if ((options & USE_TONE) &&
-               search_chewing_tones(m_tone_table, key, &tone)) {
-        match = chewing_tone_table[tone];
-        found = true;
-    }
-
-    if (!found)
-        return false;
-
-    GPtrArray * array = g_ptr_array_new();
-    g_ptr_array_add(array, g_strdup(match));
-    g_ptr_array_add(array, NULL);
-    /* must be freed by g_strfreev. */
-    symbols = (gchar **) g_ptr_array_free(array, FALSE);
-    return true;
-}
-
-/* Parse exactly one chewing key from a layout with separate initial,
- * middle, final and tone tables.
- *
- * At most one key of each kind is consumed, in that order; each probe
- * advances index only when its table contains the current character.
- * key is written only on success. Returns true when every input byte was
- * consumed and the combined symbols form an entry of m_chewing_index. */
-bool ChewingDiscreteParser2::parse_one_key(pinyin_option_t options,
- ChewingKey & key,
- const char * str, int len) const {
- if (0 == len)
- return false;
-
- options &= ~ZHUYIN_AMB_ALL;
-
- /* number of input characters consumed so far. */
- int index = 0;
- const char * initial = "";
- const char * middle = "";
- const char * final = "";
- unsigned char tone = CHEWING_ZERO_TONE;
-
- /* probe initial */
- if (search_chewing_symbols(m_initial_table, str[index], &initial)) {
- index++;
- }
-
- if (index == len)
- goto probe;
-
- /* probe middle */
- if (search_chewing_symbols(m_middle_table, str[index], &middle)) {
- index++;
- }
-
- if (index == len)
- goto probe;
-
- /* probe final */
- if (search_chewing_symbols(m_final_table, str[index], &final)) {
- index++;
- }
-
- if (index == len) {
- /* check the force tone option. */
- /* the input ended before a tone key could be read. */
- if (options & USE_TONE && options & FORCE_TONE)
- return false;
- goto probe;
- }
-
- /* probe tone */
- if (options & USE_TONE) {
- if (search_chewing_tones(m_tone_table, str[index], &tone)) {
- index ++;
- }
- }
-
-probe:
- /* check the force tone option. */
- if (options & FORCE_TONE && CHEWING_ZERO_TONE == tone) {
- return false;
- }
-
- /* parts that were not matched are still "", so they add nothing. */
- gchar * chewing = g_strconcat(initial, middle, final, NULL);
-
- /* search the chewing in the chewing index table. */
- /* index != len means some input character was left unconsumed. */
- if (index == len && search_chewing_index(options, m_chewing_index,
- m_chewing_index_len,
- chewing, key)) {
- /* save back tone if available. */
- key.m_tone = tone;
- g_free(chewing);
- return true;
- }
-
- g_free(chewing);
- return false;
-}
-
-/* Parse keyboard input into chewing keys by maximum forward match.
- * Only characters in chewing keyboard scheme are accepted here: parsing
- * is limited to the leading run of such characters.
- * keys and key_rests are cleared and then filled in input order.
- * Returns the number of input bytes that were parsed. */
-int ChewingDiscreteParser2::parse(pinyin_option_t options,
-                                  ChewingKeyVector & keys,
-                                  ChewingKeyRestVector & key_rests,
-                                  const char *str, int len) const {
-    /* add keyboard mapping specific options. */
-    options |= m_options;
-
-    g_array_set_size(keys, 0);
-    g_array_set_size(key_rests, 0);
-
-    /* probe the longest possible chewing string. */
-    int usable_len = 0;
-    while (usable_len < len) {
-        gchar ** symbols = NULL;
-        const bool known = in_chewing_scheme(options, str[usable_len], symbols);
-        g_strfreev(symbols);
-        if (!known)
-            break;
-        ++usable_len;
-    }
-
-    /* maximum forward match for chewing. */
-    int consumed = 0;
-    while (consumed < usable_len) {
-        const char * start = str + consumed;
-        ChewingKey key; ChewingKeyRest key_rest;
-
-        /* try the longest candidate first and shrink until one parses. */
-        int take = std_lite::min(usable_len - consumed,
-                                 (int)max_chewing_length);
-        while (take > 0 && !parse_one_key(options, key, start, take))
-            --take;
-
-        if (0 == take) /* no more possible chewings. */
-            break;
-
-        key_rest.m_raw_begin = consumed;
-        key_rest.m_raw_end = consumed + take;
-        consumed += take;
-
-        /* save the pinyin. */
-        g_array_append_val(keys, key);
-        g_array_append_val(key_rests, key_rest);
-    }
-
-    return consumed;
-}
-
-/* Select the index, symbol and tone tables of a discrete chewing layout
- * and the correction option that goes with it.
- * m_options is reset to 0 first. An unknown scheme trips the assertion;
- * the function returns true. */
-bool ChewingDiscreteParser2::set_scheme(ZhuyinScheme scheme) {
-    m_options = 0;
-
-/* install one layout: its correction option, index and four tables. */
-#define USE_LAYOUT(correction, index, table) do {               \
-        m_options = correction;                                  \
-        m_chewing_index = index;                                 \
-        m_chewing_index_len = G_N_ELEMENTS(index);               \
-        m_initial_table = chewing_##table##_initials;            \
-        m_middle_table = chewing_##table##_middles;              \
-        m_final_table = chewing_##table##_finals;                \
-        m_tone_table = chewing_##table##_tones;                  \
-    } while (0)
-
-    switch(scheme) {
-    case CHEWING_HSU:
-        USE_LAYOUT(HSU_CORRECT, hsu_bopomofo_index, hsu);
-        break;
-    case CHEWING_ETEN26:
-        USE_LAYOUT(ETEN26_CORRECT, eten26_bopomofo_index, eten26);
-        break;
-    case CHEWING_HSU_DVORAK:
-        USE_LAYOUT(HSU_CORRECT, hsu_bopomofo_index, hsu_dvorak);
-        break;
-    default:
-        assert(FALSE);
-    }
-
-#undef USE_LAYOUT
-
-    return true;
-}
-
-/* Tell whether a keyboard key belongs to the current chewing layout.
- *
- * symbols: set to NULL on failure; on success set to a NULL-terminated
- *          vector with copies of every initial, middle and final symbol
- *          the key maps to, plus the tone symbol when USE_TONE is set.
- *          The vector must be freed by g_strfreev.
- * Returns true when at least one symbol was found. */
-bool ChewingDiscreteParser2::in_chewing_scheme(pinyin_option_t options,
-                                               const char key,
-                                               gchar ** & symbols) const {
-    symbols = NULL;
-    GPtrArray * found = g_ptr_array_new();
-
-    const gchar * first = NULL, * second = NULL;
-    unsigned char tone = CHEWING_ZERO_TONE;
-
-    /* the three symbol tables are probed in the same way, in order. */
-    const chewing_symbol_item_t * const tables[] = {
-        m_initial_table, m_middle_table, m_final_table
-    };
-
-    for (size_t t = 0; t < G_N_ELEMENTS(tables); ++t) {
-        if (!search_chewing_symbols2(tables[t], key, &first, &second))
-            continue;
-
-        if (first)
-            g_ptr_array_add(found, g_strdup(first));
-        if (second)
-            g_ptr_array_add(found, g_strdup(second));
-    }
-
-    if ((options & USE_TONE) &&
-        search_chewing_tones(m_tone_table, key, &tone)) {
-        g_ptr_array_add(found, g_strdup(chewing_tone_table[tone]));
-    }
-
-    assert(found->len <= 3);
-
-    if (0 == found->len) {
-        g_ptr_array_free(found, TRUE);
-        return false;
-    }
-
-    g_ptr_array_add(found, NULL);
-    /* must be freed by g_strfreev. */
-    symbols = (gchar **) g_ptr_array_free(found, FALSE);
-    return true;
-}
-
-/* Set up the DaChen CP26 layout: its four key tables and bopomofo_index
- * as the lookup index. */
-ChewingDaChenCP26Parser2::ChewingDaChenCP26Parser2() {
-    /* keyboard tables of the layout. */
-    m_initial_table = chewing_dachen_cp26_initials;
-    m_middle_table = chewing_dachen_cp26_middles;
-    m_final_table = chewing_dachen_cp26_finals;
-    m_tone_table = chewing_dachen_cp26_tones;
-
-    /* index used to validate the combined symbols. */
-    m_chewing_index = bopomofo_index;
-    m_chewing_index_len = G_N_ELEMENTS(bopomofo_index);
-}
-
-/* Count how many times str[0] is repeated at the start of str.
- * Only the first len characters are examined; len must be positive.
- * Returns the length of that leading run (at least 1). */
-static int count_same_chars(const char * str, int len) {
-    assert(len > 0);
-
-    const char lead = str[0];
-    int run = 0;
-
-    while (run < len && str[run] == lead)
-        ++run;
-
-    assert(run >= 1);
-    return run;
-}
-
-/* Parse exactly one chewing key typed on the DaChen CP26 layout, where a
- * key may be pressed repeatedly to cycle through its symbols.
- *
- * options: ZHUYIN_AMB_ALL bits are ignored; USE_TONE lets the last key be
- *          read as a tone key; FORCE_TONE (with USE_TONE) rejects input
- *          without a tone.
- * key:     written only on success.
- * str:     keyboard input, only the first len bytes are used.
- * Returns true when every input byte was consumed and the combined
- * symbols form an entry of m_chewing_index.
- * The private copy of the input is released on every return path. */
-bool ChewingDaChenCP26Parser2::parse_one_key(pinyin_option_t options,
-                                             ChewingKey & key,
-                                             const char *str, int len) const {
-    if (0 == len)
-        return false;
-
-    options &= ~ZHUYIN_AMB_ALL;
-
-    const char * initial = "";
-    const char * middle = "";
-    const char * final = "";
-    unsigned char tone = CHEWING_ZERO_TONE;
-
-    gchar * input = g_strndup(str, len);
-    /* number of input characters consumed so far. */
-    int index = 0;
-
-    char ch;
-    const char * first = NULL;
-    const char * second = NULL;
-
-    /* probe whether the last key is tone key in input. */
-    if (options & USE_TONE) {
-        ch = input[len - 1];
-        /* remove tone from input */
-        if (search_chewing_tones(m_tone_table, ch, &tone))
-            len --;
-
-        /* check the force tone option. */
-        if (options & FORCE_TONE && CHEWING_ZERO_TONE == tone) {
-            g_free(input);
-            return false;
-        }
-    }
-
-    /* the input was a lone tone key: release the copy before leaving. */
-    if (0 == len) {
-        g_free(input);
-        return false;
-    }
-
-    int choice; int count;
-
-    /* probe initial */
-    do {
-        ch = input[index];
-        count = count_same_chars(input + index, len - index);
-        if (search_chewing_symbols2(m_initial_table, ch, &first, &second)) {
-            index += count;
-            if (NULL == second) {
-                initial = first;
-                break;
-            } else {
-                /* repeated presses alternate between the two symbols. */
-                choice = (count - 1) % 2;
-                if (0 == choice)
-                    initial = first;
-                if (1 == choice)
-                    initial = second;
-            }
-        }
-    } while (0);
-
-    if (index == len)
-        goto probe;
-
-    first = NULL; second = NULL;
-    /* probe middle */
-    do {
-        ch = input[index];
-        count = count_same_chars(input + index, len - index);
-        /* handle 'u' */
-        if ('u' == ch) {
-            /* presses cycle: middle only, final only, then both. */
-            choice = (count - 1) % 3;
-            if (0 == choice)
-                middle = "ㄧ";
-            if (1 == choice)
-                final = "ㄚ";
-            if (2 == choice) {
-                middle = "ㄧ";
-                final = "ㄚ";
-            }
-        }
-        /* handle 'm' */
-        if ('m' == ch) {
-            choice = (count - 1) % 2;
-            if (0 == choice)
-                middle = "ㄩ";
-            if (1 == choice)
-                final = "ㄡ";
-        }
-        /* handle 'j' */
-        if ('j' == ch) {
-            middle = "ㄨ";
-        }
-        if (search_chewing_symbols2(m_middle_table, ch, &first, &second)) {
-            index += count;
-            assert(NULL == second);
-        }
-    } while(0);
-
-    if (index == len)
-        goto probe;
-
-    /* probe final */
-    do {
-        /* for 'u' and 'm' */
-        if (0 != strlen(final))
-            break;
-
-        ch = input[index];
-        count = count_same_chars(input + index, len - index);
-        if (search_chewing_symbols2(m_final_table, ch, &first, &second)) {
-            index += count;
-            if (NULL == second) {
-                final = first;
-                break;
-            } else {
-                /* repeated presses alternate between the two symbols. */
-                choice = (count - 1) % 2;
-                if (0 == choice)
-                    final = first;
-                if (1 == choice)
-                    final = second;
-            }
-        }
-    } while(0);
-
-    if (index == len)
-        goto probe;
-
-probe:
-    gchar * chewing = g_strconcat(initial, middle, final, NULL);
-
-    /* search the chewing in the chewing index table. */
-    /* index != len means some input character was left unconsumed. */
-    if (index == len && search_chewing_index(options, m_chewing_index,
-                                             m_chewing_index_len,
-                                             chewing, key)) {
-        /* save back tone if available. */
-        key.m_tone = tone;
-        g_free(chewing);
-        g_free(input);
-        return true;
-    }
-
-    g_free(chewing);
-    g_free(input);
-    return false;
-}
-
-/* Parse DaChen CP26 keyboard input into chewing keys by maximum forward
- * match. Parsing is limited to the leading run of characters accepted by
- * in_chewing_scheme().
- * keys and key_rests are cleared and then filled in input order.
- * Returns the number of input bytes that were parsed. */
-int ChewingDaChenCP26Parser2::parse(pinyin_option_t options,
-                                    ChewingKeyVector & keys,
-                                    ChewingKeyRestVector & key_rests,
-                                    const char *str, int len) const {
-    g_array_set_size(keys, 0);
-    g_array_set_size(key_rests, 0);
-
-    /* probe the longest possible chewing string. */
-    int maximum_len = 0;
-    while (maximum_len < len) {
-        gchar ** symbols = NULL;
-        const bool known = in_chewing_scheme(options, str[maximum_len], symbols);
-        g_strfreev(symbols);
-        if (!known)
-            break;
-        ++maximum_len;
-    }
-
-    /* maximum forward match for chewing. */
-    int i = 0;
-    int parsed_len = 0;
-    const char * cur_str = NULL;
-    ChewingKey key; ChewingKeyRest key_rest;
-
-    while (parsed_len < maximum_len) {
-        cur_str = str + parsed_len;
-
-        /* try the longest candidate first and shrink until one parses. */
-        i = std_lite::min(maximum_len - parsed_len,
-                          (int)max_chewing_dachen26_length);
-        while (i > 0 && !parse_one_key(options, key, cur_str, i))
-            --i;
-
-        if (0 == i) /* no more possible chewings. */
-            break;
-
-        key_rest.m_raw_begin = parsed_len;
-        key_rest.m_raw_end = parsed_len + i;
-        parsed_len += i;
-
-        /* save the pinyin. */
-        g_array_append_val(keys, key);
-        g_array_append_val(key_rests, key_rest);
-    }
-
-#if 0
-    /* for the last partial input */
-    options |= CHEWING_INCOMPLETE;
-
-    cur_str = str + parsed_len;
-    i = std_lite::min(maximum_len - parsed_len,
-                      (int) max_chewing_dachen26_length);
-    for (; i > 0; --i) {
-        bool success = parse_one_key(options, key, cur_str, i);
-        if (success)
-            break;
-    }
-
-    if (i > 0) { /* found one */
-        key_rest.m_raw_begin = parsed_len; key_rest.m_raw_end = parsed_len + i;
-        parsed_len += i;
-
-        /* save the pinyin. */
-        g_array_append_val(keys, key);
-        g_array_append_val(key_rests, key_rest);
-    }
-#endif
-
-    return parsed_len;
-}
-
-
-/* Tell whether a keyboard key belongs to the DaChen CP26 layout.
- *
- * symbols: set to NULL on failure; on success set to a NULL-terminated
- *          vector with copies of every initial, middle and final symbol
- *          the key maps to, an extra entry for the 'i' key, plus the tone
- *          symbol when USE_TONE is set.
- *          The vector must be freed by g_strfreev.
- * Returns true when at least one symbol was found. */
-bool ChewingDaChenCP26Parser2::in_chewing_scheme(pinyin_option_t options,
-                                                 const char key,
-                                                 gchar ** & symbols) const {
-    symbols = NULL;
-    GPtrArray * found = g_ptr_array_new();
-
-    const gchar * first = NULL, * second = NULL;
-    unsigned char tone = CHEWING_ZERO_TONE;
-
-    /* the three symbol tables are probed in the same way, in order. */
-    const chewing_symbol_item_t * const tables[] = {
-        m_initial_table, m_middle_table, m_final_table
-    };
-
-    for (size_t t = 0; t < G_N_ELEMENTS(tables); ++t) {
-        if (!search_chewing_symbols2(tables[t], key, &first, &second))
-            continue;
-
-        if (first)
-            g_ptr_array_add(found, g_strdup(first));
-        if (second)
-            g_ptr_array_add(found, g_strdup(second));
-    }
-
-    /* handles for "i" */
-    if ('i' == key) {
-        g_ptr_array_add(found, g_strdup("ㄧㄚ"));
-    }
-
-    if ((options & USE_TONE) &&
-        search_chewing_tones(m_tone_table, key, &tone)) {
-        g_ptr_array_add(found, g_strdup(chewing_tone_table[tone]));
-    }
-
-    assert(found->len <= 3);
-
-    if (0 == found->len) {
-        g_ptr_array_free(found, TRUE);
-        return false;
-    }
-
-    g_ptr_array_add(found, NULL);
-    /* must be freed by g_strfreev. */
-    symbols = (gchar **) g_ptr_array_free(found, FALSE);
-    return true;
-}
-
-/* The direct parser validates its input against bopomofo_index. */
-ChewingDirectParser2::ChewingDirectParser2 (){
-    m_chewing_index_len = G_N_ELEMENTS(bopomofo_index);
-    m_chewing_index = bopomofo_index;
-}
-
-/* Parse exactly one key written directly in bopomofo symbols (UTF-8).
- *
- * options: ZHUYIN_AMB_ALL bits are ignored; with USE_TONE the last UTF-8
- *          character is compared against chewing_tone_table (entries from
- *          index 1 on) and, on a match, removed from the symbols.
- * key:     written only on success; its tone is CHEWING_1 unless a tone
- *          symbol was found.
- * str:     input, only the first len bytes are used.
- * Returns true when the remaining bytes form an entry of m_chewing_index. */
-bool ChewingDirectParser2::parse_one_key(pinyin_option_t options,
-                                         ChewingKey & key,
-                                         const char *str, int len) const {
-    options &= ~ZHUYIN_AMB_ALL;
-    /* by default, chewing will use the first tone. */
-    unsigned char tone = CHEWING_1;
-
-    if (0 == len)
-        return false;
-
-    /* walk the UTF-8 characters to find where the last one begins. */
-    const gchar * tail = NULL;
-    const char * const stop = str + len;
-    const char * cursor = str;
-    while (cursor < stop) {
-        tail = cursor;
-        cursor = g_utf8_next_char(cursor);
-    }
-
-    /* probe tone first. */
-    if (options & USE_TONE) {
-        gchar last[max_utf8_length + 1];
-        memset(last, 0, sizeof(last));
-        g_utf8_strncpy(last, tail, 1);
-
-        /* for loop chewing_tone_table. */
-        const int num_tones = (int) G_N_ELEMENTS(chewing_tone_table);
-        for (int t = 1; t < num_tones; ++t) {
-            if (0 != strcmp(chewing_tone_table[t], last))
-                continue;
-
-            tone = t;
-            len -= strlen(last);
-            break;
-        }
-
-        /* check the force tone option. */
-        if (options & FORCE_TONE && CHEWING_ZERO_TONE == tone) {
-            return false;
-        }
-    }
-
-    /* search the chewing in the chewing index table. */
-    gchar * chewing = g_strndup(str, len);
-    const bool found = len && search_chewing_index(options, m_chewing_index,
-                                                   m_chewing_index_len,
-                                                   chewing, key);
-    g_free(chewing);
-
-    if (!found)
-        return false;
-
-    /* save back tone if available. */
-    key.m_tone = tone;
-    assert(tone != CHEWING_ZERO_TONE);
-    return true;
-}
-
-/* Parse bopomofo text whose keys are separated by ' ' or "'".
- * keys and key_rests are cleared and then filled in input order.
- * Parsing stops at the first token that parse_one_key() rejects.
- * Returns the input position reached after the last accepted token and
- * the separators that follow it. */
-int ChewingDirectParser2::parse(pinyin_option_t options,
-                                ChewingKeyVector & keys,
-                                ChewingKeyRestVector & key_rests,
-                                const char *str, int len) const {
-    g_array_set_size(keys, 0);
-    g_array_set_size(key_rests, 0);
-
-    ChewingKey key; ChewingKeyRest key_rest;
-
-    int parsed_len = 0;
-    int cur = 0;
-
-    while (cur < len) {
-        /* probe next position */
-        int next = cur;
-        while (next < len && ' ' != str[next] && '\'' != str[next])
-            ++next;
-
-        if (!parse_one_key(options, key, str + cur, next - cur))
-            return parsed_len;
-
-        key_rest.m_raw_begin = cur;
-        key_rest.m_raw_end = next;
-
-        /* save the pinyin. */
-        g_array_append_val(keys, key);
-        g_array_append_val(key_rests, key_rest);
-
-        /* skip consecutive spaces. */
-        int resume = next;
-        while (resume < len && (' ' == str[resume] || '\'' == str[resume]))
-            ++resume;
-
-        cur = resume;
-        parsed_len = resume;
-    }
-
-    return parsed_len;
-}