diff options
Diffstat (limited to 'data')
-rw-r--r-- | data/CMakeLists.txt | 113 | ||||
-rw-r--r-- | data/Makefile.am | 82 | ||||
-rw-r--r-- | data/table.conf | 17 |
3 files changed, 212 insertions, 0 deletions
diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt
new file mode 100644
index 0000000..7301279
--- /dev/null
+++ b/data/CMakeLists.txt
@@ -0,0 +1,113 @@
+# Build and install the binary language-model data.
+#
+# The textual model (phrase tables plus interpolation2.text) is downloaded
+# into the source tree, converted to binary form by the tools built under
+# utils/, and installed together with table.conf.  The table list and the
+# model tarball version mirror data/Makefile.am and data/table.conf.
+
+# Base names of the phrase tables: <name>.table is converted to <name>.bin.
+set(
+    MODEL_TABLE_NAMES
+    gb_char
+    gbk_char
+    merged
+    art
+    culture
+    economy
+    geology
+    history
+    life
+    nature
+    scitech
+    society
+    sport
+)
+
+# Textual inputs live in the source tree; binary outputs are named relative
+# to this directory, matching the OUTPUT entries of the custom command.
+set(TEXTUAL_MODEL_DATA_FILES ${CMAKE_SOURCE_DIR}/data/interpolation2.text)
+set(BINARY_MODEL_DATA phrase_index.bin pinyin_index.bin bigram.db)
+foreach(table_name ${MODEL_TABLE_NAMES})
+    list(APPEND TEXTUAL_MODEL_DATA_FILES ${CMAKE_SOURCE_DIR}/data/${table_name}.table)
+    list(APPEND BINARY_MODEL_DATA ${table_name}.bin)
+endforeach()
+
+# Absolute paths of the binary outputs, for install() and "make clean".
+set(BINARY_MODEL_DATA_FILES)
+foreach(binary_file ${BINARY_MODEL_DATA})
+    list(APPEND BINARY_MODEL_DATA_FILES ${CMAKE_BINARY_DIR}/data/${binary_file})
+endforeach()
+
+set(
+    gen_binary_files_BIN
+    ${CMAKE_BINARY_DIR}/utils/storage/gen_binary_files
+)
+
+set(
+    import_interpolation_BIN
+    ${CMAKE_BINARY_DIR}/utils/storage/import_interpolation
+)
+
+set(
+    gen_unigram_BIN
+    ${CMAKE_BINARY_DIR}/utils/training/gen_unigram
+)
+
+add_custom_target(
+    data
+    ALL
+    DEPENDS
+        ${BINARY_MODEL_DATA}
+)
+
+# Fetch and unpack the textual model.  The tarball version must agree with
+# "model data version" in table.conf.
+add_custom_command(
+    OUTPUT
+        ${TEXTUAL_MODEL_DATA_FILES}
+    COMMENT
+        "Downloading textual model data..."
+    COMMAND
+        wget http://downloads.sourceforge.net/libpinyin/models/model6.text.tar.gz
+    COMMAND
+        tar xvf model6.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data
+)
+
+# Build every binary file with one command, in the same order as the
+# bigram.db recipe in Makefile.am, so a parallel build cannot start a tool
+# before the files written by the previous one exist.
+# NOTE(review): assumes utils/training defines a target named gen_unigram,
+# as the existing DEPENDS entries do for the other two tools -- confirm.
+add_custom_command(
+    OUTPUT
+        ${BINARY_MODEL_DATA}
+    COMMENT
+        "Building binary model data..."
+    COMMAND
+        ${gen_binary_files_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data
+    COMMAND
+        ${import_interpolation_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data < ${CMAKE_SOURCE_DIR}/data/interpolation2.text
+    COMMAND
+        ${gen_unigram_BIN} --table-dir ${CMAKE_SOURCE_DIR}/data
+    DEPENDS
+        gen_binary_files
+        import_interpolation
+        gen_unigram
+        ${TEXTUAL_MODEL_DATA_FILES}
+)
+
+install(
+    FILES
+        ${BINARY_MODEL_DATA_FILES}
+        ${CMAKE_SOURCE_DIR}/data/table.conf
+    DESTINATION
+        ${DIR_SHARE_LIBPINYIN}/data
+)
+
+# The list is quoted so it is passed as the single value of the property;
+# unquoted, the extra elements would be read as further property/value pairs.
+set_directory_properties(
+    PROPERTIES
+    ADDITIONAL_MAKE_CLEAN_FILES
+        "${BINARY_MODEL_DATA_FILES}"
+)
diff --git a/data/Makefile.am b/data/Makefile.am
new file mode 100644
index 0000000..c75fd95
--- /dev/null
+++ b/data/Makefile.am
@@ -0,0 +1,82 @@
+## Makefile.am -- Process this file with automake to produce Makefile.in
+## Copyright (C) 2011 Peng Wu
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+## Phrase tables contained in the textual model tarball.
+tablefiles = gb_char.table gbk_char.table \
+	merged.table \
+	art.table culture.table economy.table \
+	geology.table history.table life.table \
+	nature.table scitech.table \
+	society.table sport.table
+
+## Each <name>.table is converted to <name>.bin.
+binfiles = ${tablefiles:.table=.bin}
+
+
+textual_model_data = interpolation2.text \
+	$(tablefiles)
+
+
+binary_model_data = phrase_index.bin pinyin_index.bin \
+	bigram.db \
+	$(binfiles)
+
+
+MAINTAINERCLEANFILES = Makefile.in
+
+EXTRA_DIST = $(textual_model_data) \
+	table.conf
+
+libpinyin_db_DATA = $(binary_model_data) \
+	table.conf
+
+libpinyin_dbdir = $(libdir)/libpinyin/data
+
+CLEANFILES = $(binary_model_data)
+
+## "modify" is a maintenance command, not a file to build.
+.PHONY: modify
+
+## Download and unpack the textual model.  The tarball version must agree
+## with "model data version" in table.conf.
+interpolation2.text:
+	wget http://downloads.sourceforge.net/libpinyin/models/model6.text.tar.gz
+	tar xvf model6.text.tar.gz -C $(top_srcdir)/data
+
+
+## The phrase tables are unpacked by the interpolation2.text recipe.
+$(tablefiles): interpolation2.text
+
+## One recipe builds every binary file: the three tools run in order.
+bigram.db: $(textual_model_data)
+	$(RM) $(binary_model_data)
+	../utils/storage/gen_binary_files --table-dir $(top_srcdir)/data
+	../utils/storage/import_interpolation --table-dir $(top_srcdir)/data < $(top_srcdir)/data/interpolation2.text
+	../utils/training/gen_unigram --table-dir $(top_srcdir)/data
+
+## The remaining binary files are produced by the bigram.db recipe.
+phrase_index.bin pinyin_index.bin $(binfiles): bigram.db
+
+## Rewrite the lambda parameter stored in table.conf, e.g.
+##   make modify LAMBDA_PARAMETER=0.276607
+## WARNING: "git reset --hard" discards all uncommitted changes first.
+modify:
+	@test -n "$(LAMBDA_PARAMETER)" || \
+		{ echo "modify: LAMBDA_PARAMETER is not set" >&2; exit 1; }
+	git reset --hard
+	sed -i -r -e "s'lambda parameter:0\\.[0-9]{3,6}'lambda parameter:$(LAMBDA_PARAMETER)'" table.conf
diff --git a/data/table.conf b/data/table.conf
new file mode 100644
index 0000000..096907c
--- /dev/null
+++ b/data/table.conf
@@ -0,0 +1,17 @@
+binary format version:3
+model data version:6
+lambda parameter:0.276607
+
+4 art.table art.bin art.dbin DICTIONARY
+5 culture.table culture.bin culture.dbin DICTIONARY
+6 economy.table economy.bin economy.dbin DICTIONARY
+7 geology.table geology.bin geology.dbin DICTIONARY
+8 history.table history.bin history.dbin DICTIONARY
+
+9 life.table life.bin life.dbin DICTIONARY
+10 nature.table nature.bin nature.dbin DICTIONARY
+11 scitech.table scitech.bin scitech.dbin DICTIONARY
+12 society.table society.bin society.dbin DICTIONARY
+13 sport.table sport.bin sport.dbin DICTIONARY
+
+14 NULL NULL network.bin USER_FILE
\ No newline at end of file