diff options
author | aashiks <aashiks@gmail.com> | 2009-04-06 14:55:08 +0530 |
---|---|---|
committer | aashiks <aashiks@gmail.com> | 2009-04-06 14:55:08 +0530 |
commit | 061d7889a5f4b5faea4abc71996ae9bea259018d (patch) | |
tree | 8d8c6665873c028e5fe548741fe242286dc3c9ee /silpa/modules/ngram/visualizer.py | |
parent | 7bbd521108e0b0b095bd761d8d3db27195707999 (diff) | |
parent | 97f2231e942f67450b27f161da11e863334921ba (diff) | |
download | Rachana.git-061d7889a5f4b5faea4abc71996ae9bea259018d.tar.gz Rachana.git-061d7889a5f4b5faea4abc71996ae9bea259018d.tar.xz Rachana.git-061d7889a5f4b5faea4abc71996ae9bea259018d.zip |
Merge branch 'master' of ssh://aashiks@git.sv.nongnu.org/srv/git/smc
Diffstat (limited to 'silpa/modules/ngram/visualizer.py')
-rw-r--r-- | silpa/modules/ngram/visualizer.py | 49 |
1 files changed, 37 insertions, 12 deletions
diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py index 0debe28..e2c1136 100644 --- a/silpa/modules/ngram/visualizer.py +++ b/silpa/modules/ngram/visualizer.py @@ -22,18 +22,27 @@ # URL: http://www.smc.org.in import pydot import codecs +import pickle class NGramVisualizer: depth=0 - def loadCorpus(self, corpus_file_name): - graph_dict = dict() + def loadCorpus(self,new_file_name,corpus_file_name): + limiters = [".","!","?",",",";"] + try: + corpusfile = open(corpus_file_name) + except IOError: + graph_dict = dict() + else: + graph_dict = pickle.load(corpusfile) + # graph_dict = dict() line = [] line_number = 0 rule_number = 0 - corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore') + corpus="" + data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore') while 1: line_number = line_number +1 - text = unicode( corpus_file.readline()) + text = unicode( data_file.readline()) if text == "": break if text[0] == '#': @@ -42,10 +51,22 @@ class NGramVisualizer: line = text.strip() if(line == ""): continue + corpus=corpus+" "+line + sentences=[] + sentence = "" + start = 0 + for index in range(0,len(corpus)): + for delimit in limiters: + if corpus[index] == delimit: + sentence = corpus[start:index] + sentences.append(sentence) + start = index+1 + for line in sentences: words=line.split(" ") word_count=len(words) prev_word="" for word in words: + #print word word=word.strip() if(prev_word==""): prev_word=word @@ -56,8 +77,10 @@ class NGramVisualizer: else: graph_dict[prev_word]=word prev_word=word - prev_word="" - return graph_dict + prev_word="" + + pickle.dump(graph_dict,open(corpus_file_name,'w')) + #return graph_dict def generate_full_graph(self, start_word, graph_dict,outputimage): for key in graph_dict.iterkeys(): @@ -93,10 +116,12 @@ class NGramVisualizer: return graph -if __name__ == "__main__": - ngv=NGramVisualizer () - graph_dict=ngv.loadCorpus ("hi.txt") - graph=pydot.Dot() - graph=ngv.generate_graph(graph_dict, graph,u"भारत") - print graph.to_string().encode("utf-8") +#if __name__ == "__main__": +# ngv=NGramVisualizer () +# graph_dict = dict() +# graph_dict=ngv.loadCorpus ("ml.txt",graph_dict) +# pickle.dump(graph_dict,open('ngram_ml.txt','w')) +# graph=pydot.Dot() +# graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്ക്ക്") +# print graph.to_string().encode("utf-8") #graph.write("ngvgraph-hi.png","dot", "raw" ) |