summary | refs | log | tree | commit | diff | stats
path: root/data
diff options
context:
space:
mode:
Diffstat (limited to 'data')
-rw-r--r-- data/CMakeLists.txt 95
-rw-r--r-- data/Makefile.am 67
-rw-r--r-- data/table.conf 17
3 files changed, 179 insertions, 0 deletions
diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt
new file mode 100644
index 0000000..7301279
--- /dev/null
+++ b/data/CMakeLists.txt
@@ -0,0 +1,95 @@
+# File names of the binary model data produced in this directory.
+set(BINARY_MODEL_DATA
+    gb_char.bin
+    gbk_char.bin
+    phrase_index.bin
+    pinyin_index.bin
+    bigram.db)
+
+# The same files, addressed under ${CMAKE_BINARY_DIR}/data.
+set(BINARY_MODEL_DATA_FILES)
+foreach(model_file ${BINARY_MODEL_DATA})
+    list(APPEND BINARY_MODEL_DATA_FILES ${CMAKE_BINARY_DIR}/data/${model_file})
+endforeach()
+
+# Locations of the helper executables run by the custom commands below.
+set(gen_binary_files_BIN     ${CMAKE_BINARY_DIR}/utils/storage/gen_binary_files)
+set(import_interpolation_BIN ${CMAKE_BINARY_DIR}/utils/storage/import_interpolation)
+set(gen_unigram_BIN          ${CMAKE_BINARY_DIR}/utils/training/gen_unigram)
+
+# "data" is part of the default build (ALL) and requests every file named in
+# BINARY_MODEL_DATA.
+add_custom_target(data ALL DEPENDS ${BINARY_MODEL_DATA})
+
+# Download the textual model archive and unpack it into the source data
+# directory, producing the table and interpolation files listed as OUTPUT.
+#
+# The archive is model6: data/Makefile.am fetches model6.text.tar.gz and
+# data/table.conf declares "model data version:6", so the CMake build must
+# fetch the same model generation instead of the older model5 tarball.
+add_custom_command(
+    OUTPUT
+        ${CMAKE_SOURCE_DIR}/data/gb_char.table
+        ${CMAKE_SOURCE_DIR}/data/gbk_char.table
+        ${CMAKE_SOURCE_DIR}/data/interpolation2.text
+    COMMENT
+        "Downloading textual model data..."
+    # wget writes the archive into the directory the command runs in; tar
+    # reads it from there and extracts into the source tree.
+    COMMAND
+        wget http://downloads.sourceforge.net/libpinyin/models/model6.text.tar.gz
+    COMMAND
+        tar xvf model6.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data
+)
+
+# Run gen_binary_files over the table directory in the source tree. This rule
+# declares the two character tables and the phrase/pinyin indexes as its
+# outputs, and re-runs when the tool or either .table input changes.
+add_custom_command(
+    OUTPUT gb_char.bin gbk_char.bin phrase_index.bin pinyin_index.bin
+    DEPENDS
+        gen_binary_files
+        ${CMAKE_SOURCE_DIR}/data/gb_char.table
+        ${CMAKE_SOURCE_DIR}/data/gbk_char.table
+    COMMAND ${gen_binary_files_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data
+    COMMENT "Building binary model data..."
+)
+
+# Build bigram.db: feed interpolation2.text to import_interpolation on stdin,
+# then run gen_unigram.
+#
+# Both helper executables are listed in DEPENDS so that each one is built (and
+# the command re-run when it changes) before it is invoked; previously only
+# import_interpolation was listed even though gen_unigram is run as well.
+#
+# NOTE(review): data/Makefile.am runs gen_binary_files before these two tools
+# and passes "--table-dir" to both of them; this rule does neither. Confirm
+# whether the tools need the generated .bin files and an explicit table dir.
+add_custom_command(
+    OUTPUT
+        bigram.db
+    COMMENT
+        "Building binary bigram data..."
+    COMMAND
+        ${import_interpolation_BIN} < ${CMAKE_SOURCE_DIR}/data/interpolation2.text
+    COMMAND
+        ${gen_unigram_BIN}
+    DEPENDS
+        import_interpolation
+        gen_unigram
+        ${CMAKE_SOURCE_DIR}/data/interpolation2.text
+)
+
+# Install the generated binary model files under the libpinyin share directory.
+install(FILES ${BINARY_MODEL_DATA_FILES} DESTINATION ${DIR_SHARE_LIBPINYIN}/data)
+
+# Have "make clean" remove the generated binary model files.
+#
+# The list is quoted on purpose: set_directory_properties() reads the
+# arguments after PROPERTIES as <property> <value> pairs, so an unquoted
+# multi-element list would register only the first file as a clean file and
+# treat the remaining paths as further property names and values. Quoting
+# passes the whole semicolon-separated list as the single value.
+set_directory_properties(
+    PROPERTIES
+        ADDITIONAL_MAKE_CLEAN_FILES "${BINARY_MODEL_DATA_FILES}"
+)
diff --git a/data/Makefile.am b/data/Makefile.am
new file mode 100644
index 0000000..c75fd95
--- /dev/null
+++ b/data/Makefile.am
@@ -0,0 +1,67 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2011 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+## Textual phrase tables, one per line.
+tablefiles = \
+    gb_char.table \
+    gbk_char.table \
+    merged.table \
+    art.table \
+    culture.table \
+    economy.table \
+    geology.table \
+    history.table \
+    life.table \
+    nature.table \
+    scitech.table \
+    society.table \
+    sport.table
+
+## Every NAME.table has a NAME.bin counterpart.
+binfiles = $(tablefiles:.table=.bin)
+
+
+## All textual inputs: the interpolation model plus the phrase tables.
+textual_model_data = interpolation2.text $(tablefiles)
+
+
+## All generated binary outputs.
+binary_model_data = phrase_index.bin pinyin_index.bin bigram.db $(binfiles)
+
+
+MAINTAINERCLEANFILES = Makefile.in
+
+## Textual model data and table.conf are listed for distribution.
+EXTRA_DIST = $(textual_model_data) table.conf
+
+## Binary model data and table.conf are installed into libpinyin_dbdir.
+libpinyin_dbdir = $(libdir)/libpinyin/data
+
+libpinyin_db_DATA = $(binary_model_data) table.conf
+
+## "make clean" removes the generated binary model data.
+CLEANFILES = $(binary_model_data)
+
+## Download the model6 text archive and unpack it into the source data
+## directory; tar only runs when the download succeeded.
+interpolation2.text:
+	wget http://downloads.sourceforge.net/libpinyin/models/model6.text.tar.gz && \
+	tar xvf model6.text.tar.gz -C $(top_srcdir)/data
+
+
+## The table files have no recipe of their own; they depend on the rule above.
+$(tablefiles): interpolation2.text
+
+## One recipe regenerates all binary model data: it removes the old outputs,
+## then runs gen_binary_files, import_interpolation (reading
+## interpolation2.text on stdin) and gen_unigram, each pointed at the source
+## data directory. $(top_builddir) is ".." for this subdirectory.
+bigram.db: $(textual_model_data)
+	$(RM) $(binary_model_data)
+	$(top_builddir)/utils/storage/gen_binary_files --table-dir $(top_srcdir)/data
+	$(top_builddir)/utils/storage/import_interpolation --table-dir $(top_srcdir)/data < $(top_srcdir)/data/interpolation2.text
+	$(top_builddir)/utils/training/gen_unigram --table-dir $(top_srcdir)/data
+
+## The remaining binary files have no recipe of their own; they are brought
+## up to date through their dependency on bigram.db.
+phrase_index.bin pinyin_index.bin $(binfiles): bigram.db
+
+## modify -- rewrite the "lambda parameter" line of table.conf.
+##
+## Usage: make modify LAMBDA_PARAMETER=0.xxxxxx
+##
+## WARNING: this runs "git reset --hard" first, which discards all
+## uncommitted changes in the working tree.
+##
+## The target is a command rather than a file, so it is declared phony; a
+## stray file named "modify" would otherwise make it a no-op.
+.PHONY: modify
+modify:
+## Refuse to run without a value: an empty LAMBDA_PARAMETER would make sed
+## write "lambda parameter:" with no number into table.conf.
+	@test -n "$(LAMBDA_PARAMETER)" || { echo "modify: LAMBDA_PARAMETER must be set" >&2; exit 1; }
+	git reset --hard
+	sed -i -r -e "s'lambda parameter:0\\.[0-9]{3,6}'lambda parameter:$(LAMBDA_PARAMETER)'" table.conf
diff --git a/data/table.conf b/data/table.conf
new file mode 100644
index 0000000..096907c
--- /dev/null
+++ b/data/table.conf
@@ -0,0 +1,17 @@
+binary format version:3
+model data version:6
+lambda parameter:0.276607
+
+4 art.table art.bin art.dbin DICTIONARY
+5 culture.table culture.bin culture.dbin DICTIONARY
+6 economy.table economy.bin economy.dbin DICTIONARY
+7 geology.table geology.bin geology.dbin DICTIONARY
+8 history.table history.bin history.dbin DICTIONARY
+
+9 life.table life.bin life.dbin DICTIONARY
+10 nature.table nature.bin nature.dbin DICTIONARY
+11 scitech.table scitech.bin scitech.dbin DICTIONARY
+12 society.table society.bin society.dbin DICTIONARY
+13 sport.table sport.bin sport.dbin DICTIONARY
+
+14 NULL NULL network.bin USER_FILE
\ No newline at end of file