summaryrefslogtreecommitdiffstats
path: root/src/storage
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2011-11-15 17:07:56 +0800
committerPeng Wu <alexepico@gmail.com>2011-11-15 17:33:58 +0800
commit4e5e619a20c4d7c9cc1229a2f3e26a7219bf6841 (patch)
treee13861d630f78a2b08f1f292feb3742db0352f6a /src/storage
parent5501ea429dd50330caa9cd6ffbd8236d1663fa6e (diff)
downloadlibpinyin-4e5e619a20c4d7c9cc1229a2f3e26a7219bf6841.tar.gz
libpinyin-4e5e619a20c4d7c9cc1229a2f3e26a7219bf6841.tar.xz
libpinyin-4e5e619a20c4d7c9cc1229a2f3e26a7219bf6841.zip
begin to write full pinyin parser2 parse_one_key
Diffstat (limited to 'src/storage')
-rw-r--r--src/storage/chewing_key.h8
-rw-r--r--src/storage/pinyin_parser2.cpp75
-rw-r--r--src/storage/pinyin_parser2.h3
3 files changed, 80 insertions, 6 deletions
diff --git a/src/storage/chewing_key.h b/src/storage/chewing_key.h
index ea086b7..5d708e0 100644
--- a/src/storage/chewing_key.h
+++ b/src/storage/chewing_key.h
@@ -181,14 +181,14 @@ struct ChewingKey
struct ChewingKeyRest
{
guint16 m_index; /* the index in pinyin parser table. */
- guint16 m_pinyin_begin; /* the begin of pinyin in raw input. */
- guint16 m_pinyin_end; /* the end of pinyin in raw input. */
+ guint16 m_raw_begin; /* the begin of the raw input. */
+ guint16 m_raw_end; /* the end of the raw input. */
ChewingKeyRest() {
/* the 0th item in pinyin parser table is reserved for invalid. */
m_index = 0;
- m_pinyin_begin = 0;
- m_pinyin_end = 0;
+ m_raw_begin = 0;
+ m_raw_end = 0;
}
const char * get_pinyin_string();
diff --git a/src/storage/pinyin_parser2.cpp b/src/storage/pinyin_parser2.cpp
index ea180fe..c7bf71b 100644
--- a/src/storage/pinyin_parser2.cpp
+++ b/src/storage/pinyin_parser2.cpp
@@ -19,7 +19,11 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
+
+#include <ctype.h>
#include <assert.h>
+#include <string.h>
+#include "stl_lite.h"
#include "pinyin_custom2.h"
#include "chewing_key.h"
#include "pinyin_parser2.h"
@@ -28,7 +32,7 @@
using namespace pinyin;
-static bool check_pinyin_options(guint32 options, pinyin_index_item_t * item) {
+static bool check_pinyin_options(guint32 options, const pinyin_index_item_t * item) {
guint32 flags = item->m_flags;
assert (flags & IS_PINYIN);
@@ -50,7 +54,7 @@ static bool check_pinyin_options(guint32 options, pinyin_index_item_t * item) {
return true;
}
-static bool check_chewing_options(guint32 options, chewing_index_item_t * item) {
+static bool check_chewing_options(guint32 options, const chewing_index_item_t * item) {
guint32 flags = item->m_flags;
assert (flags & IS_CHEWING);
@@ -63,6 +67,8 @@ static bool check_chewing_options(guint32 options, chewing_index_item_t * item)
return true;
}
+
+/* methods for Chewing Keys to access pinyin parser table. */
const char * ChewingKeyRest::get_pinyin_string(){
if (m_index == 0)
return NULL;
@@ -80,3 +86,68 @@ const char * ChewingKeyRest::get_chewing_string(){
assert(m_index < G_N_ELEMENTS(content_table));
return content_table[m_index].m_chewing_str;
}
+
+
+/* Ordering predicate for the pinyin index search: an item sorts before
+ * another when its m_pinyin_input compares lower under strcmp(). */
+static bool compare_less_than(const pinyin_index_item_t & lhs,
+                              const pinyin_index_item_t & rhs){
+    const int order = strcmp(lhs.m_pinyin_input, rhs.m_pinyin_input);
+    return order < 0;
+}
+
+/*
+ * Parse one full-pinyin syllable from the start of the raw input.
+ *
+ * options:  parser option flags, forwarded to check_pinyin_options().
+ * key:      reset to ChewingKey(), then set to the chewing key of the
+ *           matched table entry (plus the tone, when one is attached).
+ * key_rest: reset to ChewingKeyRest(), then receives the table index of
+ *           the match and the raw span [0, return value).
+ * pinyin:   raw input; only the first len bytes are looked at, and they
+ *           must not contain "'".
+ * len:      number of bytes of pinyin to consider.
+ *
+ * The longest leading part of the input that is found in pinyin_index
+ * and accepted by check_pinyin_options() wins. A trailing tone digit
+ * '1'..'5' is attached only when the matched syllable ends right before it.
+ * NOTE(review): the lookup relies on pinyin_index being ordered by
+ * m_pinyin_input (see compare_less_than) -- confirm against the table
+ * generator.
+ *
+ * Returns the number of bytes consumed, 0 when nothing was matched.
+ */
+int FullPinyinParser2::parse_one_key (guint32 options, ChewingKey & key,
+                                      ChewingKeyRest & key_rest,
+                                      const char * pinyin, int len) const {
+    key = ChewingKey(); key_rest = ChewingKeyRest();
+
+    /* nothing to parse; this also keeps input[parsed_len - 1] below
+     * inside the buffer. */
+    if (NULL == pinyin || len <= 0)
+        return 0;
+
+    gchar * input = g_strndup(pinyin, len);
+    /* "'" are not accepted in parse_one_key.
+     * check the bounded copy: pinyin is only known to hold len bytes. */
+    assert(NULL == strchr(input, '\''));
+
+    guint16 tone = CHEWING_ZERO_TONE;
+    guint16 tone_pos = 0;
+    guint16 parsed_len = len;
+
+    /* find the tone in the last character. */
+    const char last_chr = input[parsed_len - 1];
+    if ( '0' < last_chr && last_chr <= '5' ) {
+        tone = last_chr - '0';
+        parsed_len --;
+        tone_pos = parsed_len;
+    }
+
+    /* parse the pinyin core here: try the longest candidate first and
+     * drop one trailing byte per round until an entry is accepted. */
+    pinyin_index_item_t item;
+    memset(&item, 0, sizeof(item));
+    item.m_pinyin_input = input;
+
+    const pinyin_index_item_t * const index_end =
+        pinyin_index + G_N_ELEMENTS(pinyin_index);
+
+    for (; parsed_len > 0; --parsed_len) {
+        input[parsed_len] = '\0';
+        std_lite::pair<const pinyin_index_item_t *,
+                       const pinyin_index_item_t *> range;
+        range = std_lite::equal_range
+            (pinyin_index, index_end, item, compare_less_than);
+
+        /* an input string may appear at most once in the index. */
+        const gssize matches = range.second - range.first;
+        assert (matches <= 1);
+        if ( matches != 1 )
+            continue;
+
+        const pinyin_index_item_t * index = range.first;
+        if (!check_pinyin_options(options, index))
+            continue;
+
+        key_rest.m_index = index->m_table_index;
+        key = content_table[key_rest.m_index].m_chewing_key;
+        break;
+    }
+
+    /* post processing tone: parsed_len > 0 means a syllable was matched,
+     * so a lone tone digit (tone_pos == 0) is never consumed by itself. */
+    if ( parsed_len > 0 && parsed_len == tone_pos &&
+         tone != CHEWING_ZERO_TONE ) {
+        key.m_tone = tone;
+        parsed_len ++;
+    }
+
+    key_rest.m_raw_begin = 0; key_rest.m_raw_end = parsed_len;
+    g_free(input);
+    return parsed_len;
+}
diff --git a/src/storage/pinyin_parser2.h b/src/storage/pinyin_parser2.h
index 588a4f1..67bf0b0 100644
--- a/src/storage/pinyin_parser2.h
+++ b/src/storage/pinyin_parser2.h
@@ -146,12 +146,15 @@ public:
* @brief Class to parse Chewing input string
*
* Several keyboard scheme are supported:
+ * * ZHUYIN_ZHUYIN Parse original ZhuYin string, such as ㄅㄧㄢ
* * Chewing_STANDARD Standard ZhuYin keyboard, which maps 1 to Bo(ㄅ), q to Po(ㄆ) etc.
* * Chewing_HSU Hsu ZhuYin keyboard, which uses a-z (except q) chars.
* * Chewing_IBM IBM ZhuYin keyboard, which maps 1 to Bo(ㄅ), 2 to Po(ㄆ) etc.
* * Chewing_GIN_YIEH Gin-Yieh ZhuYin keyboard.
* * Chewing_ET Eten (倚天) ZhuYin keyboard.
* * Chewing_ET26 Eten (倚天) ZhuYin keyboard, which only uses a-z chars.
+ * UTF-8 strings are used in the ZhuYin parser because original ZhuYin strings must be supported,
+ * so the length of the input string is counted in UTF-8 characters instead of bytes.
*/
class ChewingParser2 : public PinyinParser2
{