summary refs log tree commit diff stats
path: root/src/storage
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2024-09-26 14:13:42 +0800
committer Peng Wu <alexepico@gmail.com> 2024-09-27 09:54:21 +0800
commit 43c867fb6ed0f29cb3b75aea485f9952f6cdb7f6 (patch)
tree a6a255542d4fba695cac756f317b33bdedb6cc8c /src/storage
parent 17b616cbc44dcdf662edb665e0f3ddee9d065070 (diff)
download libpinyin-43c867fb6ed0f29cb3b75aea485f9952f6cdb7f6.tar.gz
libpinyin-43c867fb6ed0f29cb3b75aea485f9952f6cdb7f6.tar.xz
libpinyin-43c867fb6ed0f29cb3b75aea485f9952f6cdb7f6.zip
Update class PunctTable
Diffstat (limited to 'src/storage')
-rw-r--r-- src/storage/punct_table.cpp 32
-rw-r--r-- src/storage/punct_table.h 3
-rw-r--r-- src/storage/punct_table_bdb.cpp 2
-rw-r--r-- src/storage/punct_table_kyotodb.cpp 28
4 files changed, 42 insertions, 23 deletions
diff --git a/src/storage/punct_table.cpp b/src/storage/punct_table.cpp
index 184be92..461d4bc 100644
--- a/src/storage/punct_table.cpp
+++ b/src/storage/punct_table.cpp
@@ -23,6 +23,8 @@
using namespace pinyin;
+static const ucs4_t null_char = 0;
+
PunctTableEntry::PunctTableEntry() {
m_ucs4_cache = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
m_utf8_cache = g_string_new(NULL);
@@ -44,11 +46,8 @@ bool PunctTableEntry::escape(const gchar * punct, gint maxlen) {
glong ucs4_len = 0;
gunichar * ucs4_str = g_utf8_to_ucs4(punct, maxlen, NULL, &ucs4_len, NULL);
- for(int i = 0; i < ucs4_len; ++i) {
- g_array_append_val(m_ucs4_cache, ucs4_str[i]);
- if (i < ucs4_len - 1)
- g_array_append_val(m_ucs4_cache, ucs4_str[i]);
- }
+ g_array_append_vals(m_ucs4_cache, ucs4_str, ucs4_len);
+ g_array_append_val(m_ucs4_cache, null_char);
g_free(ucs4_str);
return true;
@@ -64,15 +63,11 @@ int PunctTableEntry::unescape(const ucs4_t * punct, gint maxlen) {
while (index < maxlen) {
g_string_append_unichar(m_utf8_cache, punct[index]);
index++;
- if (index >= maxlen)
- break;
- if (punct[index - 1] == punct[index])
- index++;
- else
+ if (punct[index] == null_char)
break;
}
- return index;
+ return index + 1;
}
bool PunctTableEntry::get_all_punctuations(gchar ** & puncts) {
@@ -102,8 +97,11 @@ bool PunctTableEntry::append_punctuation(const gchar * punct) {
gchar ** puncts = NULL;
get_all_punctuations(puncts);
- if (puncts && g_strv_contains(puncts, punct))
- abort();
+ if (puncts && g_strv_contains(puncts, punct)) {
+ fprintf(stderr, "duplicated punctuations: %s\n", punct);
+ g_strfreev(puncts);
+ return false;
+ }
g_strfreev(puncts);
if (!escape(punct))
@@ -138,13 +136,9 @@ bool PunctTableEntry::remove_punctuation(const gchar * punct) {
}
/* check the next punctuation index */
+ while (null_char != *(begin + index))
+ index++;
index++;
- while (begin + index < end) {
- if (begin[index - 1] == begin[index])
- index += 2;
- else
- break;
- }
}
return false;
diff --git a/src/storage/punct_table.h b/src/storage/punct_table.h
index 256fd7b..b8b84d4 100644
--- a/src/storage/punct_table.h
+++ b/src/storage/punct_table.h
@@ -40,8 +40,7 @@ class PunctTable;
/**
* In order to support some punctuations with variable length,
- * the code store "..." like ".....". The ".." string means
- * this punctuation has another character following ".".
+ * the punctuations are appended with the null ucs4_t character.
*/
class PunctTableEntry{
friend class PunctTable;
diff --git a/src/storage/punct_table_bdb.cpp b/src/storage/punct_table_bdb.cpp
index 3908dda..b4e52d9 100644
--- a/src/storage/punct_table_bdb.cpp
+++ b/src/storage/punct_table_bdb.cpp
@@ -153,7 +153,7 @@ bool PunctTable::load_entry(phrase_token_t index) {
memset(&db_data, 0, sizeof(DBT));
int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
if (ret != 0)
- return false;
+ return true;
m_entry->m_chunk.set_content(0, db_data.data, db_data.size);
return true;
diff --git a/src/storage/punct_table_kyotodb.cpp b/src/storage/punct_table_kyotodb.cpp
index bfa5d6c..698af1b 100644
--- a/src/storage/punct_table_kyotodb.cpp
+++ b/src/storage/punct_table_kyotodb.cpp
@@ -80,6 +80,19 @@ bool PunctTable::load_db(const char * filename) {
if (!m_db->load_snapshot(filename, NULL))
return false;
+#if 0
+ /* load db into memory. */
+ BasicDB * tmp_db = new TreeDB;
+ if (!tmp_db->open(filename, BasicDB::OREADER))
+ return false;
+
+ CopyVisitor visitor(m_db);
+ tmp_db->iterate(&visitor, false);
+
+ tmp_db->close();
+ delete tmp_db;
+#endif
+
return true;
}
@@ -91,6 +104,19 @@ bool PunctTable::save_db(const char * new_filename) {
if (!m_db->dump_snapshot(new_filename, NULL))
return false;
+#if 0
+ BasicDB * tmp_db = new TreeDB;
+ if (!tmp_db->open(new_filename, BasicDB::OWRITER|BasicDB::OCREATE))
+ return false;
+
+ CopyVisitor visitor(tmp_db);
+ m_db->iterate(&visitor, false);
+
+ tmp_db->synchronize();
+ tmp_db->close();
+ delete tmp_db;
+#endif
+
return true;
}
@@ -105,7 +131,7 @@ bool PunctTable::load_entry(phrase_token_t index) {
const int32_t vsiz = m_db->check(kbuf, sizeof(phrase_token_t));
/* -1 on failure. */
if (-1 == vsiz || 0 == vsiz)
- return false;
+ return true;
m_entry->m_chunk.set_size(vsiz);
/* m_chunk may re-allocate here. */