From fce87d8ba5e8217128a5da786ffdbf1696dc0ff0 Mon Sep 17 00:00:00 2001
From: Santhosh Thottingal <santhosh.thottingal@gmail.com>
Date: Mon, 30 Mar 2009 22:47:46 +0530
Subject: Adding ngram module

---
 silpa/modules/ngram/visualizer.py | 102 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 silpa/modules/ngram/visualizer.py

(limited to 'silpa/modules/ngram/visualizer.py')

diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
new file mode 100644
index 0000000..0debe28
--- /dev/null
+++ b/silpa/modules/ngram/visualizer.py
@@ -0,0 +1,102 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Ngram
+# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
+# URL: http://www.smc.org.in
+import pydot
+import codecs
+
+class NGramVisualizer:
+	depth=0
+	def loadCorpus(self, corpus_file_name):	
+		graph_dict = dict()
+		line = []
+		line_number = 0
+		rule_number = 0
+		corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore')
+		while 1:
+			line_number = line_number +1 
+   			text = unicode( corpus_file.readline())
+			if text == "":
+			      break
+			if text[0] == '#': 
+			      continue 
+			line_number = line_number +1       
+			line = text.strip()
+			if(line == ""):
+				  continue 
+			words=line.split(" ")
+			word_count=len(words)
+			prev_word=""
+			for word in words:
+				word=word.strip()
+				if(prev_word==""):
+					prev_word=word	
+					continue;
+				if(prev_word!=""):
+					if(graph_dict.has_key(prev_word)):
+						graph_dict[prev_word]=graph_dict[prev_word]+" -> "+word
+					else:
+						graph_dict[prev_word]=word
+					prev_word=word	
+			prev_word=""		
+		return graph_dict
+	def generate_full_graph(self, start_word, graph_dict,outputimage):
+		
+		for key in graph_dict.iterkeys():
+			values=graph_dict[key].split("->")
+			for value in values:
+				value=value.strip()
+				#print key, value
+				if(start_word>""):
+					if(key==start_word):
+						graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8')))
+				else:
+					graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8')))		
+		
+		
+	def generate_graph(self, graph_dict, graph, src):
+		self.depth=self.depth+1
+		#print self.depth ,src 
+		if(graph.get_node(src)!=[]):
+			return graph
+		if(self.depth>200):
+			return graph
+		values=[]		
+		if(graph_dict.has_key(src))	:
+			values=graph_dict[src].split("->")
+		for dest in values:
+			dest=dest.strip()
+			#print src, dest,graph.get_edge(src,dest)
+			if(graph.get_edge(src,dest)):
+				continue
+			else:	
+				graph.add_edge(pydot.Edge(src,dest))	
+				graph=self.generate_graph(graph_dict, graph, dest)
+				
+		return graph
+		
+if __name__ == "__main__":
+	ngv=NGramVisualizer () 
+	graph_dict=ngv.loadCorpus ("hi.txt")
+	graph=pydot.Dot()
+	graph=ngv.generate_graph(graph_dict, graph,u"भारत")
+	print graph.to_string().encode("utf-8")
+	#graph.write("ngvgraph-hi.png","dot", "raw" )
-- 
cgit 


From 2d44c4f029972bdba12cf2f8d1e863f71c05087c Mon Sep 17 00:00:00 2001
From: Jinesh K J <jinsbond007@jinesh.cvit>
Date: Sun, 5 Apr 2009 17:59:16 +0530
Subject: Updated the ngram module in silpa to deal with sentence breaks

---
 silpa/modules/ngram/visualizer.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'silpa/modules/ngram/visualizer.py')

diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
index 0debe28..d46baeb 100644
--- a/silpa/modules/ngram/visualizer.py
+++ b/silpa/modules/ngram/visualizer.py
@@ -26,10 +26,12 @@ import codecs
 class NGramVisualizer:
 	depth=0
 	def loadCorpus(self, corpus_file_name):	
+		limiters = [".","!","?",",",";"]
 		graph_dict = dict()
 		line = []
 		line_number = 0
 		rule_number = 0
+		corpus=""
 		corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore')
 		while 1:
 			line_number = line_number +1 
@@ -42,10 +44,22 @@ class NGramVisualizer:
 			line = text.strip()
 			if(line == ""):
 				  continue 
+			corpus=corpus+" "+line
+		sentences=[]
+		sentence = ""
+		start = 0
+		for index in range(0,len(corpus)):
+			for delimit in limiters:
+				if corpus[index] == delimit:
+					sentence = corpus[start:index]
+					sentences.append(sentence)
+					start = index+1
+		for line in sentences:
 			words=line.split(" ")
 			word_count=len(words)
 			prev_word=""
 			for word in words:
+				#print word
 				word=word.strip()
 				if(prev_word==""):
 					prev_word=word	
@@ -56,7 +70,8 @@ class NGramVisualizer:
 					else:
 						graph_dict[prev_word]=word
 					prev_word=word	
-			prev_word=""		
+			prev_word=""
+
 		return graph_dict
 	def generate_full_graph(self, start_word, graph_dict,outputimage):
 		
@@ -95,8 +110,8 @@ class NGramVisualizer:
 		
 if __name__ == "__main__":
 	ngv=NGramVisualizer () 
-	graph_dict=ngv.loadCorpus ("hi.txt")
+	graph_dict=ngv.loadCorpus ("ml.txt")
 	graph=pydot.Dot()
-	graph=ngv.generate_graph(graph_dict, graph,u"भारत")
+	graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്")
 	print graph.to_string().encode("utf-8")
 	#graph.write("ngvgraph-hi.png","dot", "raw" )
-- 
cgit 


From 97f2231e942f67450b27f161da11e863334921ba Mon Sep 17 00:00:00 2001
From: Jinesh K J <jinsbond007@jinesh.cvit>
Date: Mon, 6 Apr 2009 04:15:42 +0530
Subject: the ngram code is changed and removed all hard codings

---
 silpa/modules/ngram/visualizer.py | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

(limited to 'silpa/modules/ngram/visualizer.py')

diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
index d46baeb..e2c1136 100644
--- a/silpa/modules/ngram/visualizer.py
+++ b/silpa/modules/ngram/visualizer.py
@@ -22,20 +22,27 @@
 # URL: http://www.smc.org.in
 import pydot
 import codecs
+import pickle
 
 class NGramVisualizer:
 	depth=0
-	def loadCorpus(self, corpus_file_name):	
+	def loadCorpus(self,new_file_name,corpus_file_name):	
 		limiters = [".","!","?",",",";"]
-		graph_dict = dict()
+		try:
+			corpusfile = open(corpus_file_name)
+		except IOError:
+			graph_dict = dict()
+		else:
+			graph_dict = pickle.load(corpusfile)
+	#	graph_dict = dict()
 		line = []
 		line_number = 0
 		rule_number = 0
 		corpus=""
-		corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore')
+		data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore')
 		while 1:
 			line_number = line_number +1 
-   			text = unicode( corpus_file.readline())
+   			text = unicode( data_file.readline())
 			if text == "":
 			      break
 			if text[0] == '#': 
@@ -72,7 +79,8 @@ class NGramVisualizer:
 					prev_word=word	
 			prev_word=""
 
-		return graph_dict
+		pickle.dump(graph_dict,open(corpus_file_name,'w'))
+		#return graph_dict
 	def generate_full_graph(self, start_word, graph_dict,outputimage):
 		
 		for key in graph_dict.iterkeys():
@@ -108,10 +116,12 @@ class NGramVisualizer:
 				
 		return graph
 		
-if __name__ == "__main__":
-	ngv=NGramVisualizer () 
-	graph_dict=ngv.loadCorpus ("ml.txt")
-	graph=pydot.Dot()
-	graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്")
-	print graph.to_string().encode("utf-8")
+#if __name__ == "__main__":
+#	ngv=NGramVisualizer () 
+#	graph_dict = dict()
+#	graph_dict=ngv.loadCorpus ("ml.txt",graph_dict)
+#	pickle.dump(graph_dict,open('ngram_ml.txt','w'))
+#	graph=pydot.Dot()
+#	graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്")
+#	print graph.to_string().encode("utf-8")
 	#graph.write("ngvgraph-hi.png","dot", "raw" )
-- 
cgit 


From 1b62cfff2a910765b700bbea15786a1d62d800ef Mon Sep 17 00:00:00 2001
From: Jinesh K J <jinsbond007@jinesh.cvit>
Date: Tue, 7 Apr 2009 03:10:02 +0530
Subject: Updated the ngram module with option parser

---
 silpa/modules/ngram/visualizer.py | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'silpa/modules/ngram/visualizer.py')

diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
index e2c1136..585c11f 100644
--- a/silpa/modules/ngram/visualizer.py
+++ b/silpa/modules/ngram/visualizer.py
@@ -26,7 +26,7 @@ import pickle
 
 class NGramVisualizer:
 	depth=0
-	def loadCorpus(self,new_file_name,corpus_file_name):	
+	def loadCorpus(self,corpus,corpus_file_name):	
 		limiters = [".","!","?",",",";"]
 		try:
 			corpusfile = open(corpus_file_name)
@@ -35,23 +35,6 @@ class NGramVisualizer:
 		else:
 			graph_dict = pickle.load(corpusfile)
 	#	graph_dict = dict()
-		line = []
-		line_number = 0
-		rule_number = 0
-		corpus=""
-		data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore')
-		while 1:
-			line_number = line_number +1 
-   			text = unicode( data_file.readline())
-			if text == "":
-			      break
-			if text[0] == '#': 
-			      continue 
-			line_number = line_number +1       
-			line = text.strip()
-			if(line == ""):
-				  continue 
-			corpus=corpus+" "+line
 		sentences=[]
 		sentence = ""
 		start = 0
-- 
cgit