From fce87d8ba5e8217128a5da786ffdbf1696dc0ff0 Mon Sep 17 00:00:00 2001 From: Santhosh Thottingal Date: Mon, 30 Mar 2009 22:47:46 +0530 Subject: Adding ngram module --- silpa/modules/ngram/visualizer.py | 102 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 silpa/modules/ngram/visualizer.py (limited to 'silpa/modules/ngram/visualizer.py') diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py new file mode 100644 index 0000000..0debe28 --- /dev/null +++ b/silpa/modules/ngram/visualizer.py @@ -0,0 +1,102 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +# Ngram +# Copyright 2008 Santhosh Thottingal +# http://www.smc.org.in +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com +# URL: http://www.smc.org.in +import pydot +import codecs + +class NGramVisualizer: + depth=0 + def loadCorpus(self, corpus_file_name): + graph_dict = dict() + line = [] + line_number = 0 + rule_number = 0 + corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore') + while 1: + line_number = line_number +1 + text = unicode( corpus_file.readline()) + if text == "": + break + if text[0] == '#': + continue + line_number = line_number +1 + line = text.strip() + if(line == ""): + continue + words=line.split(" ") + word_count=len(words) + prev_word="" + for word in words: + word=word.strip() + if(prev_word==""): + prev_word=word + continue; + if(prev_word!=""): + if(graph_dict.has_key(prev_word)): + graph_dict[prev_word]=graph_dict[prev_word]+" -> "+word + else: + graph_dict[prev_word]=word + prev_word=word + prev_word="" + return graph_dict + def generate_full_graph(self, start_word, graph_dict,outputimage): + + for key in graph_dict.iterkeys(): + values=graph_dict[key].split("->") + for value in values: + value=value.strip() + #print key, value + if(start_word>""): + if(key==start_word): + graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8'))) + else: + graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8'))) + + + def generate_graph(self, graph_dict, graph, src): + self.depth=self.depth+1 + #print self.depth ,src + if(graph.get_node(src)!=[]): + return graph + if(self.depth>200): + return graph + values=[] + if(graph_dict.has_key(src)) : + values=graph_dict[src].split("->") + for dest in values: + dest=dest.strip() + #print src, dest,graph.get_edge(src,dest) + if(graph.get_edge(src,dest)): + continue + else: + graph.add_edge(pydot.Edge(src,dest)) + graph=self.generate_graph(graph_dict, graph, dest) + + return graph + +if __name__ == "__main__": + ngv=NGramVisualizer () + graph_dict=ngv.loadCorpus ("hi.txt") + graph=pydot.Dot() + graph=ngv.generate_graph(graph_dict, graph,u"भारत") + print graph.to_string().encode("utf-8") + #graph.write("ngvgraph-hi.png","dot", "raw" ) -- cgit From 2d44c4f029972bdba12cf2f8d1e863f71c05087c Mon Sep 17 00:00:00 2001 From: Jinesh K J Date: Sun, 5 Apr 2009 17:59:16 +0530 Subject: Updated the ngram module in silpa to deal with sentence breaks --- silpa/modules/ngram/visualizer.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'silpa/modules/ngram/visualizer.py') diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py index 0debe28..d46baeb 100644 --- a/silpa/modules/ngram/visualizer.py +++ b/silpa/modules/ngram/visualizer.py @@ -26,10 +26,12 @@ import codecs class NGramVisualizer: depth=0 def loadCorpus(self, corpus_file_name): + limiters = [".","!","?",",",";"] graph_dict = dict() line = [] line_number = 0 rule_number = 0 + corpus="" corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore') while 1: line_number = line_number +1 @@ -42,10 +44,22 @@ class NGramVisualizer: line = text.strip() if(line == ""): continue + corpus=corpus+" "+line + sentences=[] + sentence = "" + start = 0 + for index in range(0,len(corpus)): + for delimit in limiters: + if corpus[index] == delimit: + sentence = corpus[start:index] + sentences.append(sentence) + start = index+1 + for line in sentences: words=line.split(" ") word_count=len(words) prev_word="" for word in words: + #print word word=word.strip() if(prev_word==""): prev_word=word @@ -56,7 +70,8 @@ class NGramVisualizer: else: graph_dict[prev_word]=word prev_word=word - prev_word="" + prev_word="" + return graph_dict def generate_full_graph(self, start_word, graph_dict,outputimage): @@ -95,8 +110,8 @@ class NGramVisualizer: if __name__ == "__main__": ngv=NGramVisualizer () - graph_dict=ngv.loadCorpus ("hi.txt") + graph_dict=ngv.loadCorpus ("ml.txt") graph=pydot.Dot() - graph=ngv.generate_graph(graph_dict, graph,u"भारत") + graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്") print graph.to_string().encode("utf-8") #graph.write("ngvgraph-hi.png","dot", "raw" ) -- cgit From 97f2231e942f67450b27f161da11e863334921ba Mon Sep 17 00:00:00 2001 From: Jinesh K J Date: Mon, 6 Apr 2009 04:15:42 +0530 Subject: the ngram code is changed and removed all hard codings --- silpa/modules/ngram/visualizer.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'silpa/modules/ngram/visualizer.py') diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py index d46baeb..e2c1136 100644 --- a/silpa/modules/ngram/visualizer.py +++ b/silpa/modules/ngram/visualizer.py @@ -22,20 +22,27 @@ # URL: http://www.smc.org.in import pydot import codecs +import pickle class NGramVisualizer: depth=0 - def loadCorpus(self, corpus_file_name): + def loadCorpus(self,new_file_name,corpus_file_name): limiters = [".","!","?",",",";"] - graph_dict = dict() + try: + corpusfile = open(corpus_file_name) + except IOError: + graph_dict = dict() + else: + graph_dict = pickle.load(corpusfile) + # graph_dict = dict() line = [] line_number = 0 rule_number = 0 corpus="" - corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore') + data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore') while 1: line_number = line_number +1 - text = unicode( corpus_file.readline()) + text = unicode( data_file.readline()) if text == "": break if text[0] == '#': @@ -72,7 +79,8 @@ class NGramVisualizer: prev_word=word prev_word="" - return graph_dict + pickle.dump(graph_dict,open(corpus_file_name,'w')) + #return graph_dict def generate_full_graph(self, start_word, graph_dict,outputimage): for key in graph_dict.iterkeys(): @@ -108,10 +116,12 @@ class NGramVisualizer: return graph -if __name__ == "__main__": - ngv=NGramVisualizer () - graph_dict=ngv.loadCorpus ("ml.txt") - graph=pydot.Dot() - graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്") - print graph.to_string().encode("utf-8") +#if __name__ == "__main__": +# ngv=NGramVisualizer () +# graph_dict = dict() +# graph_dict=ngv.loadCorpus ("ml.txt",graph_dict) +# pickle.dump(graph_dict,open('ngram_ml.txt','w')) +# graph=pydot.Dot() +# graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്") +# print graph.to_string().encode("utf-8") #graph.write("ngvgraph-hi.png","dot", "raw" ) -- cgit From 1b62cfff2a910765b700bbea15786a1d62d800ef Mon Sep 17 00:00:00 2001 From: Jinesh K J Date: Tue, 7 Apr 2009 03:10:02 +0530 Subject: Updated the ngram module with option parser --- silpa/modules/ngram/visualizer.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'silpa/modules/ngram/visualizer.py') diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py index e2c1136..585c11f 100644 --- a/silpa/modules/ngram/visualizer.py +++ b/silpa/modules/ngram/visualizer.py @@ -26,7 +26,7 @@ import pickle class NGramVisualizer: depth=0 - def loadCorpus(self,new_file_name,corpus_file_name): + def loadCorpus(self,corpus,corpus_file_name): limiters = [".","!","?",",",";"] try: corpusfile = open(corpus_file_name) @@ -35,23 +35,6 @@ class NGramVisualizer: else: graph_dict = pickle.load(corpusfile) # graph_dict = dict() - line = [] - line_number = 0 - rule_number = 0 - corpus="" - data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore') - while 1: - line_number = line_number +1 - text = unicode( data_file.readline()) - if text == "": - break - if text[0] == '#': - continue - line_number = line_number +1 - line = text.strip() - if(line == ""): - continue - corpus=corpus+" "+line sentences=[] sentence = "" start = 0 -- cgit