diff options
| author | Peng Wu <alexepico@gmail.com> | 2024-09-26 14:13:42 +0800 |
|---|---|---|
| committer | Peng Wu <alexepico@gmail.com> | 2024-09-27 09:54:21 +0800 |
| commit | 43c867fb6ed0f29cb3b75aea485f9952f6cdb7f6 (patch) | |
| tree | a6a255542d4fba695cac756f317b33bdedb6cc8c /src/storage | |
| parent | 17b616cbc44dcdf662edb665e0f3ddee9d065070 (diff) | |
| download | libpinyin-43c867fb6ed0f29cb3b75aea485f9952f6cdb7f6.tar.gz libpinyin-43c867fb6ed0f29cb3b75aea485f9952f6cdb7f6.tar.xz libpinyin-43c867fb6ed0f29cb3b75aea485f9952f6cdb7f6.zip | |
Update class PunctTable
Diffstat (limited to 'src/storage')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/storage/punct_table.cpp | 32 |
| -rw-r--r-- | src/storage/punct_table.h | 3 |
| -rw-r--r-- | src/storage/punct_table_bdb.cpp | 2 |
| -rw-r--r-- | src/storage/punct_table_kyotodb.cpp | 28 |
4 files changed, 42 insertions, 23 deletions
diff --git a/src/storage/punct_table.cpp b/src/storage/punct_table.cpp index 184be92..461d4bc 100644 --- a/src/storage/punct_table.cpp +++ b/src/storage/punct_table.cpp @@ -23,6 +23,8 @@ using namespace pinyin; +static const ucs4_t null_char = 0; + PunctTableEntry::PunctTableEntry() { m_ucs4_cache = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); m_utf8_cache = g_string_new(NULL); @@ -44,11 +46,8 @@ bool PunctTableEntry::escape(const gchar * punct, gint maxlen) { glong ucs4_len = 0; gunichar * ucs4_str = g_utf8_to_ucs4(punct, maxlen, NULL, &ucs4_len, NULL); - for(int i = 0; i < ucs4_len; ++i) { - g_array_append_val(m_ucs4_cache, ucs4_str[i]); - if (i < ucs4_len - 1) - g_array_append_val(m_ucs4_cache, ucs4_str[i]); - } + g_array_append_vals(m_ucs4_cache, ucs4_str, ucs4_len); + g_array_append_val(m_ucs4_cache, null_char); g_free(ucs4_str); return true; @@ -64,15 +63,11 @@ int PunctTableEntry::unescape(const ucs4_t * punct, gint maxlen) { while (index < maxlen) { g_string_append_unichar(m_utf8_cache, punct[index]); index++; - if (index >= maxlen) - break; - if (punct[index - 1] == punct[index]) - index++; - else + if (punct[index] == null_char) break; } - return index; + return index + 1; } bool PunctTableEntry::get_all_punctuations(gchar ** & puncts) { @@ -102,8 +97,11 @@ bool PunctTableEntry::append_punctuation(const gchar * punct) { gchar ** puncts = NULL; get_all_punctuations(puncts); - if (puncts && g_strv_contains(puncts, punct)) - abort(); + if (puncts && g_strv_contains(puncts, punct)) { + fprintf(stderr, "duplicated punctuations: %s\n", punct); + g_strfreev(puncts); + return false; + } g_strfreev(puncts); if (!escape(punct)) @@ -138,13 +136,9 @@ bool PunctTableEntry::remove_punctuation(const gchar * punct) { } /* check the next punctuation index */ + while (null_char != *(begin + index)) + index++; index++; - while (begin + index < end) { - if (begin[index - 1] == begin[index]) - index += 2; - else - break; - } } return false; diff --git a/src/storage/punct_table.h 
b/src/storage/punct_table.h index 256fd7b..b8b84d4 100644 --- a/src/storage/punct_table.h +++ b/src/storage/punct_table.h @@ -40,8 +40,7 @@ class PunctTable; /** * In order to support some punctuations with variable length, - * the code store "..." like ".....". The ".." string means - * this punctuation has another character following ".". + * the punctuations are appended with the null ucs4_t character. */ class PunctTableEntry{ friend class PunctTable; diff --git a/src/storage/punct_table_bdb.cpp b/src/storage/punct_table_bdb.cpp index 3908dda..b4e52d9 100644 --- a/src/storage/punct_table_bdb.cpp +++ b/src/storage/punct_table_bdb.cpp @@ -153,7 +153,7 @@ bool PunctTable::load_entry(phrase_token_t index) { memset(&db_data, 0, sizeof(DBT)); int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); if (ret != 0) - return false; + return true; m_entry->m_chunk.set_content(0, db_data.data, db_data.size); return true; diff --git a/src/storage/punct_table_kyotodb.cpp b/src/storage/punct_table_kyotodb.cpp index bfa5d6c..698af1b 100644 --- a/src/storage/punct_table_kyotodb.cpp +++ b/src/storage/punct_table_kyotodb.cpp @@ -80,6 +80,19 @@ bool PunctTable::load_db(const char * filename) { if (!m_db->load_snapshot(filename, NULL)) return false; +#if 0 + /* load db into memory. 
*/ + BasicDB * tmp_db = new TreeDB; + if (!tmp_db->open(filename, BasicDB::OREADER)) + return false; + + CopyVisitor visitor(m_db); + tmp_db->iterate(&visitor, false); + + tmp_db->close(); + delete tmp_db; +#endif + return true; } @@ -91,6 +104,19 @@ bool PunctTable::save_db(const char * new_filename) { if (!m_db->dump_snapshot(new_filename, NULL)) return false; +#if 0 + BasicDB * tmp_db = new TreeDB; + if (!tmp_db->open(new_filename, BasicDB::OWRITER|BasicDB::OCREATE)) + return false; + + CopyVisitor visitor(tmp_db); + m_db->iterate(&visitor, false); + + tmp_db->synchronize(); + tmp_db->close(); + delete tmp_db; +#endif + return true; } @@ -105,7 +131,7 @@ bool PunctTable::load_entry(phrase_token_t index) { const int32_t vsiz = m_db->check(kbuf, sizeof(phrase_token_t)); /* -1 on failure. */ if (-1 == vsiz || 0 == vsiz) - return false; + return true; m_entry->m_chunk.set_size(vsiz); /* m_chunk may re-allocate here. */ |
