1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
/*
* libpinyin
* Library to deal with pinyin.
*
* Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <glib.h>
#include "pinyin.h"
/* Outer table type: token of the first word of a pair -> HashofSecondWord. */
typedef GHashTable * HashofWordPair;
/* Inner table type: token of the second word of a pair -> occurrence count. */
typedef GHashTable * HashofSecondWord;
/* Word pair counts collected from the document:
 * first token -> (second token -> count). */
HashofWordPair g_hash_of_document = NULL;
/* Phrase table queried to turn each input phrase into a token.
 * NOTE(review): nothing in the visible code assigns this pointer before
 * convert_document_to_hash dereferences it -- confirm where it is set up. */
PhraseLargeTable * g_phrases = NULL;
/**
 * print_help:
 *
 * Writes the command line usage summary to standard output. The summary
 * is terminated with a newline so that the shell prompt (or any following
 * output) starts on its own line.
 */
void print_help(){
    printf("gen_k_mixture_model [--skip-pi-gram-training]\n");
    printf(" [--skip-bi-gram-training]\n");
    printf(" [--skip-k-mixture-model-training]\n");
    printf(" [--maximum-occurs-allowed <INT>]\n");
    printf(" [--maximum-increase-rates-allowed <FLOAT>]\n");
    printf(" [--k-mixture-model-file <FILENAME>]\n");
    printf(" {<FILENAME>}+\n");
}
bool convert_document_to_hash(FILE * document){
char * linebuf = NULL;
size_t size = 0;
phrase_token_t last_token, cur_token = last_token = 0;
while ( getline(&linebuf, &size, document) ){
if ( feof(document) )
break;
/* Note: check '\n' here? */
linebuf[strlen(linebuf) - 1] = '\0';
glong phrase_len = 0;
utf16_t * phrase = g_utf8_to_utf16(linebuf, -1, NULL, &phrase_len, NULL);
if ( phrase_len == 0 )
continue;
phrase_token_t token = 0;
int search_result = g_phrases->search( phrase_len, phrase, token );
if ( ! (search_result & SEARCH_OK) )
token = 0;
last_token = cur_token;
cur_token = token;
/* remember the (last_token, cur_token) word pair. */
gpointer value = NULL;
HashofSecondWord hash_of_second_word = NULL;
gboolean lookup_result = g_hash_table_lookup_extended
(g_hash_of_document, GUINT_TO_POINTER(last_token),
NULL, &value);
if ( !lookup_result ){
hash_of_second_word = g_hash_table_new(g_int_hash, g_int_equal);
} else {
hash_of_second_word = (HashofSecondWord) value;
}
value = NULL;
lookup_result = g_hash_table_lookup_extended
(hash_of_second_word, GUINT_TO_POINTER(cur_token),
NULL, &value);
guint32 count = 0;
if ( lookup_result ) {
count = GPOINTER_TO_UINT(value);
}
count ++;
g_hash_table_insert(hash_of_second_word,
GUINT_TO_POINTER(cur_token),
GUINT_TO_POINTER(count));
g_hash_table_insert(g_hash_of_document,
GUINT_TO_POINTER(last_token),
hash_of_second_word);
}
return true;
}
int main(int argc, char * argv[]){
g_hash_of_document = g_hash_table_new_full
(g_int_hash, g_int_equal, NULL, (GDestroyNotify)g_hash_table_unref);
return 0;
}
|