From fce87d8ba5e8217128a5da786ffdbf1696dc0ff0 Mon Sep 17 00:00:00 2001
From: Santhosh Thottingal <santhosh.thottingal@gmail.com>
Date: Mon, 30 Mar 2009 22:47:46 +0530
Subject: Adding ngram module

---
 silpa/modules/ngram/visualizer.py | 102 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 silpa/modules/ngram/visualizer.py

(limited to 'silpa/modules/ngram')

diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
new file mode 100644
index 0000000..0debe28
--- /dev/null
+++ b/silpa/modules/ngram/visualizer.py
@@ -0,0 +1,102 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Ngram
+# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
+# URL: http://www.smc.org.in
+import pydot
+import codecs
+
+class NGramVisualizer:
+	depth=0
+	def loadCorpus(self, corpus_file_name):	
+		graph_dict = dict()
+		line = []
+		line_number = 0
+		rule_number = 0
+		corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore')
+		while 1:
+			line_number = line_number +1 
+   			text = unicode( corpus_file.readline())
+			if text == "":
+			      break
+			if text[0] == '#': 
+			      continue 
+			line_number = line_number +1       
+			line = text.strip()
+			if(line == ""):
+				  continue 
+			words=line.split(" ")
+			word_count=len(words)
+			prev_word=""
+			for word in words:
+				word=word.strip()
+				if(prev_word==""):
+					prev_word=word	
+					continue;
+				if(prev_word!=""):
+					if(graph_dict.has_key(prev_word)):
+						graph_dict[prev_word]=graph_dict[prev_word]+" -> "+word
+					else:
+						graph_dict[prev_word]=word
+					prev_word=word	
+			prev_word=""		
+		return graph_dict
+	def generate_full_graph(self, start_word, graph_dict,outputimage):
+		
+		for key in graph_dict.iterkeys():
+			values=graph_dict[key].split("->")
+			for value in values:
+				value=value.strip()
+				#print key, value
+				if(start_word>""):
+					if(key==start_word):
+						graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8')))
+				else:
+					graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8')))		
+		
+		
+	def generate_graph(self, graph_dict, graph, src):
+		self.depth=self.depth+1
+		#print self.depth ,src 
+		if(graph.get_node(src)!=[]):
+			return graph
+		if(self.depth>200):
+			return graph
+		values=[]		
+		if(graph_dict.has_key(src))	:
+			values=graph_dict[src].split("->")
+		for dest in values:
+			dest=dest.strip()
+			#print src, dest,graph.get_edge(src,dest)
+			if(graph.get_edge(src,dest)):
+				continue
+			else:	
+				graph.add_edge(pydot.Edge(src,dest))	
+				graph=self.generate_graph(graph_dict, graph, dest)
+				
+		return graph
+		
+if __name__ == "__main__":
+	ngv=NGramVisualizer () 
+	graph_dict=ngv.loadCorpus ("hi.txt")
+	graph=pydot.Dot()
+	graph=ngv.generate_graph(graph_dict, graph,u"भारत")
+	print graph.to_string().encode("utf-8")
+	#graph.write("ngvgraph-hi.png","dot", "raw" )
-- 
cgit 


From 2d44c4f029972bdba12cf2f8d1e863f71c05087c Mon Sep 17 00:00:00 2001
From: Jinesh K J <jinsbond007@jinesh.cvit>
Date: Sun, 5 Apr 2009 17:59:16 +0530
Subject: Updated the ngram module in silpa to deal with sentence breaks

---
 silpa/modules/ngram/ml.txt        |  1 +
 silpa/modules/ngram/visualizer.py | 21 ++++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)
 create mode 100644 silpa/modules/ngram/ml.txt

(limited to 'silpa/modules/ngram')

diff --git a/silpa/modules/ngram/ml.txt b/silpa/modules/ngram/ml.txt
new file mode 100644
index 0000000..4c48980
--- /dev/null
+++ b/silpa/modules/ngram/ml.txt
@@ -0,0 +1 @@
+കടലില്‍ ജീവിക്കുന്ന ഒരു സസ്തനിയാണ് നീലത്തിമിംഗലം. ബലീന്‍ തിമിംഗലങ്ങളുടെ ഒരു ഉപജാതിയാണിവ. ലോകത്ത് ഇന്നുവരെയുള്ളതില്‍ ഏറ്റവും വലിയ ജീവിയായി കണക്കാക്കപ്പെടുന്ന നീലത്തിമിംഗലങ്ങള്‍ക്ക് 33 മീ. നീളവും 181 മെട്രിക് ടണിലധികം ഭാരവും ഉണ്ടാകാം. നീണ്ട ശരീരപ്രകൃതിയുള്ള നീലത്തിമിംഗലങ്ങളുടെ ശരീരം നീലകലര്‍ന്ന ചാരനിറത്തോടെയാണുണ്ടാവുക, ശരീരത്തിനടിഭാഗത്തേക്ക് നിറം കുറവായിരിക്കും. നീലത്തിമിംഗലങ്ങള്‍ക്ക് കുറഞ്ഞത് മൂന്നുപജാതികളെങ്കിലും ഉണ്ടെന്നു കരുതുന്നു.
diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
index 0debe28..d46baeb 100644
--- a/silpa/modules/ngram/visualizer.py
+++ b/silpa/modules/ngram/visualizer.py
@@ -26,10 +26,12 @@ import codecs
 class NGramVisualizer:
 	depth=0
 	def loadCorpus(self, corpus_file_name):	
+		limiters = [".","!","?",",",";"]
 		graph_dict = dict()
 		line = []
 		line_number = 0
 		rule_number = 0
+		corpus=""
 		corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore')
 		while 1:
 			line_number = line_number +1 
@@ -42,10 +44,22 @@ class NGramVisualizer:
 			line = text.strip()
 			if(line == ""):
 				  continue 
+			corpus=corpus+" "+line
+		sentences=[]
+		sentence = ""
+		start = 0
+		for index in range(0,len(corpus)):
+			for delimit in limiters:
+				if corpus[index] == delimit:
+					sentence = corpus[start:index]
+					sentences.append(sentence)
+					start = index+1
+		for line in sentences:
 			words=line.split(" ")
 			word_count=len(words)
 			prev_word=""
 			for word in words:
+				#print word
 				word=word.strip()
 				if(prev_word==""):
 					prev_word=word	
@@ -56,7 +70,8 @@ class NGramVisualizer:
 					else:
 						graph_dict[prev_word]=word
 					prev_word=word	
-			prev_word=""		
+			prev_word=""
+
 		return graph_dict
 	def generate_full_graph(self, start_word, graph_dict,outputimage):
 		
@@ -95,8 +110,8 @@ class NGramVisualizer:
 		
 if __name__ == "__main__":
 	ngv=NGramVisualizer () 
-	graph_dict=ngv.loadCorpus ("hi.txt")
+	graph_dict=ngv.loadCorpus ("ml.txt")
 	graph=pydot.Dot()
-	graph=ngv.generate_graph(graph_dict, graph,u"भारत")
+	graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്")
 	print graph.to_string().encode("utf-8")
 	#graph.write("ngvgraph-hi.png","dot", "raw" )
-- 
cgit 


From bba7b4bc146bb4623c6cc6ad27d70baf2e02497a Mon Sep 17 00:00:00 2001
From: Jinesh K J <jinsbond007@jinesh.cvit>
Date: Mon, 6 Apr 2009 03:37:15 +0530
Subject: corpus is now stored using pickle and can be reused

---
 silpa/modules/ngram/ngram_ml.txt    | 219 ++++++++++++++++++++++++++++++++++++
 silpa/modules/ngram/sample_ngram.py |  64 +++++++++++
 2 files changed, 283 insertions(+)
 create mode 100644 silpa/modules/ngram/ngram_ml.txt
 create mode 100644 silpa/modules/ngram/sample_ngram.py

(limited to 'silpa/modules/ngram')

diff --git a/silpa/modules/ngram/ngram_ml.txt b/silpa/modules/ngram/ngram_ml.txt
new file mode 100644
index 0000000..c265b1c
--- /dev/null
+++ b/silpa/modules/ngram/ngram_ml.txt
@@ -0,0 +1,219 @@
+(dp0
+V\u0d35\u0d32\u0d3f\u0d2f
+p1
+V\u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f
+p2
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
+p3
+V\u0d36\u0d30\u0d40\u0d30\u0d02
+p4
+sV\u0d2e\u0d31\u0d4d\u0d31\u0d4d
+p5
+V\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d -> \u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d
+p6
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d
+p7
+V33 -> \u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d
+p8
+sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28
+p9
+V\u0d12\u0d30\u0d41
+p10
+sV\u0d06\u0d23\u0d41
+p11
+V\u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d
+p12
+sV\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d
+p13
+V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p14
+sV\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28
+p15
+V\u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15
+p16
+sV\u0d36\u0d30\u0d40\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d28\u0d1f\u0d3f\u0d2d\u0d3e\u0d17\u0d24\u0d4d\u0d24\u0d47\u0d15\u0d4d\u0d15\u0d4d
+p17
+V\u0d28\u0d3f\u0d31\u0d02
+p18
+sV\u0d28\u0d40\u0d23\u0d4d\u0d1f
+p19
+V\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33
+p20
+sV\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02
+p21
+V\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41
+p22
+sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p23
+V\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d
+p24
+sV\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02
+p25
+V\u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d
+p26
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02
+p27
+V\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33
+p28
+sV\u0d1a\u0d46\u0d31\u0d3f\u0d2f
+p29
+V\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02
+p30
+sV\u0d15\u0d1f\u0d32\u0d3f\u0d32\u0d4d\u200d
+p31
+g9
+sg2
+V\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
+p32
+sV\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
+p33
+V\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d
+p34
+sV\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+p35
+V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f
+p36
+sg32
+g7
+sV\u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d
+p37
+g21
+sV\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d
+p38
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
+p39
+sV181
+p40
+V\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d
+p41
+sV\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02
+p42
+g1
+sV\u0d15\u0d23\u0d4d\u0d1f\u0d41
+p43
+V\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28
+p44
+sV\u0d26\u0d15\u0d4d\u0d37\u0d3f\u0d23
+p45
+V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+p46
+sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+p47
+V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+p48
+sV\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d
+p49
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
+p50
+sV\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d
+p51
+g42
+sV\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02
+p52
+V\u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02
+p53
+sV\u0d28\u0d40\u0d33\u0d35\u0d41\u0d02
+p54
+g40
+sV\u0d2a\u0d4b\u0d32\u0d46
+p55
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02
+p56
+sg41
+V\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02
+p57
+sV\u0d1a\u0d46\u0d31\u0d41\u0d2e\u0d24\u0d4d\u0d38\u0d4d\u0d2f\u0d19\u0d4d\u0d19\u0d33\u0d47\u0d2f\u0d41\u0d02
+p58
+V\u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f
+p59
+sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46
+p60
+V\u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41
+p61
+sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+p62
+V\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41
+p63
+sV\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d
+p64
+V\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d
+p65
+sg10
+V\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35
+p66
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
+p67
+V\u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35
+p68
+sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
+p69
+V\u0d12\u0d30\u0d41
+p70
+sV\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28
+p71
+V\u0d2c\u0d3f -> \u0d2c\u0d3f
+p72
+sg22
+V\u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41
+p73
+sV\u0d07\u0d28\u0d4d\u0d24\u0d4d\u0d2f\u0d28\u0d4d\u200d
+p74
+V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+p75
+sV\u0d2e\u0d38\u0d4d\u0d15\u0d41\u0d32\u0d38\u0d4d
+p76
+V(B -> (B
+p77
+sg57
+g52
+sV\u0d32\u0d4b\u0d15\u0d24\u0d4d\u0d24\u0d4d
+p78
+g51
+sV\u0d07\u0d28\u0d4d\u200d\u0d21\u0d3f\u0d15
+p79
+V\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41
+p80
+sV33
+p81
+V\u0d2e\u0d40
+p82
+sg18
+V\u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02
+p83
+sg20
+g3
+sV\u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d
+p84
+V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p85
+sg4
+g15
+sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p86
+V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+p87
+sV\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33
+p88
+V\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33
+p89
+sV\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d
+p90
+V\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46
+p91
+sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46
+p92
+V\u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46
+p93
+sV\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41
+p94
+V\u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d
+p95
+sV\u0d1a\u0d46\u0d31\u0d41
+p96
+V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46
+p97
+sV\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33
+p98
+V\u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41
+p99
+s.
\ No newline at end of file
diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py
new file mode 100644
index 0000000..0f46ad6
--- /dev/null
+++ b/silpa/modules/ngram/sample_ngram.py
@@ -0,0 +1,64 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Ngram
+# Copyright 2009 Jinesh K J <jinesh.k@gmail.com>
+# Copyright 2009 Swathantra Malayalam Computing <smc-discuss@googlegroups.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# If you find any bugs or have any suggestions email: jinesh.k@gmail.com or smc-discuss@googlegroups.com
+# URL: http://www.smc.org.in
+import pydot
+import codecs
+import pickle
+import sys 
+from visualizer import NGramVisualizer
+def genCorpus(infile,corpus):
+	ngv=NGramVisualizer ()
+	try:
+		corpusfile = open(corpus)
+	except IOError:
+		graph_dict = dict()
+	else:
+		graph_dict = pickle.load(corpusfile)
+#	corpusfile.close()
+	graph_dict=ngv.loadCorpus (infile,graph_dict)
+	pickle.dump(graph_dict,open(corpus,'w'))
+def printGraph(corpus,start_word):
+	ngv=NGramVisualizer ()
+	graph_dict=pickle.load(open(corpus))
+	graph=pydot.Dot()
+	uni_start_word = start_word.decode("utf-8")
+#	print start_word
+#	print uni_start_word
+	graph=ngv.generate_graph(graph_dict, graph,uni_start_word)
+	print graph.to_string().encode("utf-8")
+
+if __name__ == "__main__":
+	"""
+		python sample_ngram.py <input file> <corpus path> 1 
+		this will generate the corpus for the given input file, if corpus specified at 
+		corpus path is empty. Else it will recreate the corpus for the with the additional files.
+
+		python sample_ngram.py <corpus path> <start word> 2
+		this will generate the graph for the given start word in the given corpus at corpus path.
+
+		This is just a crude attempt, a lot more improvement is to be done. 		 
+	"""
+	if sys.argv[3] == "1":
+		genCorpus(sys.argv[1],sys.argv[2])
+	elif sys.argv[3] == "2":
+		printGraph(sys.argv[1],sys.argv[2])
-- 
cgit 


From 97f2231e942f67450b27f161da11e863334921ba Mon Sep 17 00:00:00 2001
From: Jinesh K J <jinsbond007@jinesh.cvit>
Date: Mon, 6 Apr 2009 04:15:42 +0530
Subject: the ngram code is changed and removed all hard codings

---
 silpa/modules/ngram/ngram_ml.txt    | 251 +++++++++++++++++++-----------------
 silpa/modules/ngram/sample_ngram.py |  17 +--
 silpa/modules/ngram/visualizer.py   |  32 +++--
 3 files changed, 160 insertions(+), 140 deletions(-)

(limited to 'silpa/modules/ngram')

diff --git a/silpa/modules/ngram/ngram_ml.txt b/silpa/modules/ngram/ngram_ml.txt
index c265b1c..a4db8f1 100644
--- a/silpa/modules/ngram/ngram_ml.txt
+++ b/silpa/modules/ngram/ngram_ml.txt
@@ -1,219 +1,238 @@
 (dp0
 V\u0d35\u0d32\u0d3f\u0d2f
 p1
-V\u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f
+V\u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f -> \u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f
 p2
-sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
+sV\u0d2e\u0d31\u0d4d\u0d31\u0d4d
 p3
-V\u0d36\u0d30\u0d40\u0d30\u0d02
+V\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d -> \u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d -> \u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d
 p4
-sV\u0d2e\u0d31\u0d4d\u0d31\u0d4d
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d
 p5
-V\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d -> \u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d
+V33 -> \u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d -> 33 -> \u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d
 p6
-sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d
+sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28
 p7
-V33 -> \u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d
+V\u0d12\u0d30\u0d41 -> \u0d12\u0d30\u0d41
 p8
-sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28
+sV\u0d06\u0d23\u0d41
 p9
-V\u0d12\u0d30\u0d41
+V\u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d
 p10
-sV\u0d06\u0d23\u0d41
+sV\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d
 p11
-V\u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d
+V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
 p12
-sV\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d
+sV\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28
 p13
-V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+V\u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15 -> \u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15
 p14
-sV\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28
+sV\u0d36\u0d30\u0d40\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d28\u0d1f\u0d3f\u0d2d\u0d3e\u0d17\u0d24\u0d4d\u0d24\u0d47\u0d15\u0d4d\u0d15\u0d4d
 p15
-V\u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15
+V\u0d28\u0d3f\u0d31\u0d02 -> \u0d28\u0d3f\u0d31\u0d02
 p16
-sV\u0d36\u0d30\u0d40\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d28\u0d1f\u0d3f\u0d2d\u0d3e\u0d17\u0d24\u0d4d\u0d24\u0d47\u0d15\u0d4d\u0d15\u0d4d
+sV\u0d28\u0d40\u0d23\u0d4d\u0d1f
 p17
-V\u0d28\u0d3f\u0d31\u0d02
+V\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33 -> \u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33
 p18
-sV\u0d28\u0d40\u0d23\u0d4d\u0d1f
+sV\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02
 p19
-V\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33
+V\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41 -> \u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41
 p20
-sV\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02
+sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
 p21
-V\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41
+V\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d
 p22
-sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+sV\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02
 p23
-V\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d
+V\u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d
 p24
-sV\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02
+sV\u0d1a\u0d46\u0d31\u0d3f\u0d2f
 p25
-V\u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d
+V\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02
 p26
-sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02
+sV\u0d15\u0d1f\u0d32\u0d3f\u0d32\u0d4d\u200d
 p27
-V\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33
+V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28
 p28
-sV\u0d1a\u0d46\u0d31\u0d3f\u0d2f
+sV\u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f
 p29
-V\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02
+V\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
 p30
-sV\u0d15\u0d1f\u0d32\u0d3f\u0d32\u0d4d\u200d
+sV\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
 p31
-g9
-sg2
-V\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
+V\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d
 p32
-sV\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
+sV\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
 p33
-V\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d
+V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f
 p34
-sV\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+sV\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
 p35
-V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d
 p36
-sg32
-g7
 sV\u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d
 p37
-g21
-sV\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d
+V\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02
 p38
-V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
+sV\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d
 p39
-sV181
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
 p40
-V\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d
+sV181
 p41
-sV\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02
+V\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d -> \u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d
 p42
-g1
-sV\u0d15\u0d23\u0d4d\u0d1f\u0d41
+sV\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02
 p43
-V\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28
+V\u0d35\u0d32\u0d3f\u0d2f -> \u0d35\u0d32\u0d3f\u0d2f
 p44
-sV\u0d26\u0d15\u0d4d\u0d37\u0d3f\u0d23
+sV\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33
 p45
-V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
 p46
-sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+sV\u0d15\u0d23\u0d4d\u0d1f\u0d41
 p47
-V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+V\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28
 p48
-sV\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d
+sV\u0d26\u0d15\u0d4d\u0d37\u0d3f\u0d23
 p49
-V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
+V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
 p50
-sV\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d
+sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
 p51
-g42
-sV\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02
+V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
 p52
-V\u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02
+sV\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d
 p53
-sV\u0d28\u0d40\u0d33\u0d35\u0d41\u0d02
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
 p54
-g40
-sV\u0d2a\u0d4b\u0d32\u0d46
+sV\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d
 p55
-V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02
+V\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02 -> \u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02
 p56
-sg41
-V\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
 p57
-sV\u0d1a\u0d46\u0d31\u0d41\u0d2e\u0d24\u0d4d\u0d38\u0d4d\u0d2f\u0d19\u0d4d\u0d19\u0d33\u0d47\u0d2f\u0d41\u0d02
+V\u0d36\u0d30\u0d40\u0d30\u0d02 -> \u0d36\u0d30\u0d40\u0d30\u0d02
 p58
-V\u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f
+sV\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02
 p59
-sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46
+V\u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02 -> \u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02
 p60
-V\u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41
+sV\u0d2a\u0d4b\u0d32\u0d46
 p61
-sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02
 p62
-V\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41
+sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
 p63
-sV\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d
+V\u0d12\u0d30\u0d41 -> \u0d12\u0d30\u0d41
 p64
-V\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d
+sV\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d
 p65
-sg10
-V\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35
+V\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02 -> \u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02
 p66
-sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
+sV\u0d1a\u0d46\u0d31\u0d41\u0d2e\u0d24\u0d4d\u0d38\u0d4d\u0d2f\u0d19\u0d4d\u0d19\u0d33\u0d47\u0d2f\u0d41\u0d02
 p67
-V\u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35
+V\u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f
 p68
-sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
+sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46
 p69
-V\u0d12\u0d30\u0d41
+V\u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41
 p70
-sV\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28
+sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
 p71
-V\u0d2c\u0d3f -> \u0d2c\u0d3f
+V\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41
 p72
-sg22
-V\u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41
+sV\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d
 p73
-sV\u0d07\u0d28\u0d4d\u0d24\u0d4d\u0d2f\u0d28\u0d4d\u200d
+V\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d
 p74
-V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+sV\u0d12\u0d30\u0d41
 p75
-sV\u0d2e\u0d38\u0d4d\u0d15\u0d41\u0d32\u0d38\u0d4d
+V\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35 -> \u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35
 p76
-V(B -> (B
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
 p77
-sg57
-g52
-sV\u0d32\u0d4b\u0d15\u0d24\u0d4d\u0d24\u0d4d
+V\u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35
 p78
-g51
-sV\u0d07\u0d28\u0d4d\u200d\u0d21\u0d3f\u0d15
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02
 p79
-V\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41
+V\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33
 p80
-sV33
+sV\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41
 p81
-V\u0d2e\u0d40
+V\u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41 -> \u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41
 p82
-sg18
-V\u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02
+sV\u0d07\u0d28\u0d4d\u0d24\u0d4d\u0d2f\u0d28\u0d4d\u200d
 p83
-sg20
-g3
-sV\u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d
+V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
 p84
-V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+sV\u0d2e\u0d38\u0d4d\u0d15\u0d41\u0d32\u0d38\u0d4d
 p85
-sg4
-g15
-sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+V(B -> (B -> (B
 p86
-V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+sV\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02
 p87
-sV\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33
+V\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02 -> \u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02
 p88
-V\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33
+sV\u0d32\u0d4b\u0d15\u0d24\u0d4d\u0d24\u0d4d
 p89
-sV\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d
+V\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d
 p90
-V\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46
+sV\u0d07\u0d28\u0d4d\u200d\u0d21\u0d3f\u0d15
 p91
-sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46
+V\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41
 p92
-V\u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46
+sV33
 p93
-sV\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41
+V\u0d2e\u0d40 -> \u0d2e\u0d40
 p94
-V\u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d
+sV\u0d28\u0d3f\u0d31\u0d02
 p95
-sV\u0d1a\u0d46\u0d31\u0d41
+V\u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02 -> \u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02
 p96
-V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46
-p97
 sV\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33
+p97
+V\u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41
 p98
-V\u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41
+sV\u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d
 p99
+V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p100
+sV\u0d36\u0d30\u0d40\u0d30\u0d02
+p101
+V\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28 -> \u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28
+p102
+sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p103
+V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+p104
+sV\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33
+p105
+V\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33
+p106
+sV\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d
+p107
+V\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
+p108
+sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46
+p109
+V\u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46
+p110
+sV\u0d28\u0d40\u0d33\u0d35\u0d41\u0d02
+p111
+V181 -> 181
+p112
+sV\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41
+p113
+V\u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d
+p114
+sV\u0d1a\u0d46\u0d31\u0d41
+p115
+V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46
+p116
+sV\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28
+p117
+V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f
+p118
 s.
\ No newline at end of file
diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py
index 0f46ad6..d373aa8 100644
--- a/silpa/modules/ngram/sample_ngram.py
+++ b/silpa/modules/ngram/sample_ngram.py
@@ -26,17 +26,7 @@ import codecs
 import pickle
 import sys 
 from visualizer import NGramVisualizer
-def genCorpus(infile,corpus):
-	ngv=NGramVisualizer ()
-	try:
-		corpusfile = open(corpus)
-	except IOError:
-		graph_dict = dict()
-	else:
-		graph_dict = pickle.load(corpusfile)
-#	corpusfile.close()
-	graph_dict=ngv.loadCorpus (infile,graph_dict)
-	pickle.dump(graph_dict,open(corpus,'w'))
+
 def printGraph(corpus,start_word):
 	ngv=NGramVisualizer ()
 	graph_dict=pickle.load(open(corpus))
@@ -51,7 +41,7 @@ if __name__ == "__main__":
 	"""
 		python sample_ngram.py <input file> <corpus path> 1 
 		this will generate the corpus for the given input file, if corpus specified at 
-		corpus path is empty. Else it will recreate the corpus for the with the additional files.
+		corpus path is empty. Else it will recreate the corpus with the additional data.
 
 		python sample_ngram.py <corpus path> <start word> 2
 		this will generate the graph for the given start word in the given corpus at corpus path.
@@ -59,6 +49,7 @@ if __name__ == "__main__":
 		This is just a crude attempt, a lot more improvement is to be done. 		 
 	"""
 	if sys.argv[3] == "1":
-		genCorpus(sys.argv[1],sys.argv[2])
+		ngv=NGramVisualizer ()
+		ngv.loadCorpus(sys.argv[1],sys.argv[2])
 	elif sys.argv[3] == "2":
 		printGraph(sys.argv[1],sys.argv[2])
diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
index d46baeb..e2c1136 100644
--- a/silpa/modules/ngram/visualizer.py
+++ b/silpa/modules/ngram/visualizer.py
@@ -22,20 +22,27 @@
 # URL: http://www.smc.org.in
 import pydot
 import codecs
+import pickle
 
 class NGramVisualizer:
 	depth=0
-	def loadCorpus(self, corpus_file_name):	
+	def loadCorpus(self,new_file_name,corpus_file_name):	
 		limiters = [".","!","?",",",";"]
-		graph_dict = dict()
+		try:
+			corpusfile = open(corpus_file_name)
+		except IOError:
+			graph_dict = dict()
+		else:
+			graph_dict = pickle.load(corpusfile)
+	#	graph_dict = dict()
 		line = []
 		line_number = 0
 		rule_number = 0
 		corpus=""
-		corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore')
+		data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore')
 		while 1:
 			line_number = line_number +1 
-   			text = unicode( corpus_file.readline())
+   			text = unicode( data_file.readline())
 			if text == "":
 			      break
 			if text[0] == '#': 
@@ -72,7 +79,8 @@ class NGramVisualizer:
 					prev_word=word	
 			prev_word=""
 
-		return graph_dict
+		pickle.dump(graph_dict,open(corpus_file_name,'w'))
+		#return graph_dict
 	def generate_full_graph(self, start_word, graph_dict,outputimage):
 		
 		for key in graph_dict.iterkeys():
@@ -108,10 +116,12 @@ class NGramVisualizer:
 				
 		return graph
 		
-if __name__ == "__main__":
-	ngv=NGramVisualizer () 
-	graph_dict=ngv.loadCorpus ("ml.txt")
-	graph=pydot.Dot()
-	graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്")
-	print graph.to_string().encode("utf-8")
+#if __name__ == "__main__":
+#	ngv=NGramVisualizer () 
+#	graph_dict = dict()
+#	graph_dict=ngv.loadCorpus ("ml.txt",graph_dict)
+#	pickle.dump(graph_dict,open('ngram_ml.txt','w'))
+#	graph=pydot.Dot()
+#	graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്")
+#	print graph.to_string().encode("utf-8")
 	#graph.write("ngvgraph-hi.png","dot", "raw" )
-- 
cgit 


From 1b62cfff2a910765b700bbea15786a1d62d800ef Mon Sep 17 00:00:00 2001
From: Jinesh K J <jinsbond007@jinesh.cvit>
Date: Tue, 7 Apr 2009 03:10:02 +0530
Subject: Updated the ngram module with option parser

---
 silpa/modules/ngram/sample_ngram.py | 57 +++++++++++++++++++++++++++----------
 silpa/modules/ngram/visualizer.py   | 19 +------------
 2 files changed, 43 insertions(+), 33 deletions(-)

(limited to 'silpa/modules/ngram')

diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py
index d373aa8..ec4b184 100644
--- a/silpa/modules/ngram/sample_ngram.py
+++ b/silpa/modules/ngram/sample_ngram.py
@@ -26,6 +26,27 @@ import codecs
 import pickle
 import sys 
 from visualizer import NGramVisualizer
+from optparse import OptionParser
+
+def getData(new_file_name):
+	line = []
+	line_number = 0
+	rule_number = 0
+	corpus=""
+	data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore')
+	while 1:
+		line_number = line_number +1 
+		text = unicode( data_file.readline())
+		if text == "":
+		      break
+		if text[0] == '#': 
+		      continue 
+		line_number = line_number +1       
+		line = text.strip()
+		if(line == ""):
+			  continue 
+		corpus=corpus+" "+line
+	return corpus
 
 def printGraph(corpus,start_word):
 	ngv=NGramVisualizer ()
@@ -38,18 +59,24 @@ def printGraph(corpus,start_word):
 	print graph.to_string().encode("utf-8")
 
 if __name__ == "__main__":
-	"""
-		python sample_ngram.py <input file> <corpus path> 1 
-		this will generate the corpus for the given input file, if corpus specified at 
-		corpus path is empty. Else it will recreate the corpus with the additional data.
-
-		python sample_ngram.py <corpus path> <start word> 2
-		this will generate the graph for the given start word in the given corpus at corpus path.
-
-		This is just a crude attempt, a lot more improvement is to be done. 		 
-	"""
-	if sys.argv[3] == "1":
-		ngv=NGramVisualizer ()
-		ngv.loadCorpus(sys.argv[1],sys.argv[2])
-	elif sys.argv[3] == "2":
-		printGraph(sys.argv[1],sys.argv[2])
+	usage = "usage: %prog [options] INPUTDATA CORPUSFILE"
+	parser = OptionParser(version="%prog 1.0",description="Sample program to add data INPUTDATA to the corpus in CORPUSFILE")
+	parser.set_usage(usage)
+	parser.add_option("-s", "--start-word", dest="start_word",action="store_true",default=False,help="Creates a graph beginning from INPUTDATA")
+	parser.add_option("-f", "--file", action="store_true",default=False,dest="infile",help="Gets Data from file INPUTDATA")
+	(options, args) = parser.parse_args()
+	if len(args) != 2 :	
+		parser.error("incorrect number of arguments")
+	if options.infile and options.start_word:
+	    parser.error("options -f and -s are mutually exclusive")
+	ngv=NGramVisualizer ()
+	if options.infile:
+		data = getData(args[0])
+	#	print data
+		ngv.loadCorpus(data,args[1])
+	elif options.start_word:
+		printGraph(args[1],args[0])
+	else:
+	#	print args[0]
+		data = args[0].decode("utf-8")
+		ngv.loadCorpus(data,args[1])
diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
index e2c1136..585c11f 100644
--- a/silpa/modules/ngram/visualizer.py
+++ b/silpa/modules/ngram/visualizer.py
@@ -26,7 +26,7 @@ import pickle
 
 class NGramVisualizer:
 	depth=0
-	def loadCorpus(self,new_file_name,corpus_file_name):	
+	def loadCorpus(self,corpus,corpus_file_name):	
 		limiters = [".","!","?",",",";"]
 		try:
 			corpusfile = open(corpus_file_name)
@@ -35,23 +35,6 @@ class NGramVisualizer:
 		else:
 			graph_dict = pickle.load(corpusfile)
 	#	graph_dict = dict()
-		line = []
-		line_number = 0
-		rule_number = 0
-		corpus=""
-		data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore')
-		while 1:
-			line_number = line_number +1 
-   			text = unicode( data_file.readline())
-			if text == "":
-			      break
-			if text[0] == '#': 
-			      continue 
-			line_number = line_number +1       
-			line = text.strip()
-			if(line == ""):
-				  continue 
-			corpus=corpus+" "+line
 		sentences=[]
 		sentence = ""
 		start = 0
-- 
cgit 


From 8c71596c75ce7ca1d716fc0f93e24b9ca821f081 Mon Sep 17 00:00:00 2001
From: Santhosh Thottingal <santhosh.thottingal@gmail.com>
Date: Thu, 16 Apr 2009 20:49:49 +0530
Subject: NGRAM model for Malayalam

---
 silpa/modules/ngram/ngram.py | 347 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 347 insertions(+)
 create mode 100644 silpa/modules/ngram/ngram.py

(limited to 'silpa/modules/ngram')

diff --git a/silpa/modules/ngram/ngram.py b/silpa/modules/ngram/ngram.py
new file mode 100644
index 0000000..8f2d65b
--- /dev/null
+++ b/silpa/modules/ngram/ngram.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Ngram
+# Copyright 2008-2009 Santhosh Thottingal <santhosh.thottingal@gmail.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+
+import codecs
+import pickle
+import pydot
+import os,sys
+from optparse import OptionParser
+VERSION=0.1
+MAX_TREE_DEPTH=1000
+PICKLED_TREE="ngram.pyo"
+class NgramNode:
+	def __init__(self, node_value="*", rank=1, child_list=None):
+		self.node_value=node_value
+		self.rank=rank
+		self.child_list=child_list
+		self.desc="Start Node"
+	def setNode(self, node_value="*", rank=None,childs=None, child_list=None):
+		self.node_value=node_value
+		self.rank=rank
+		self.child_list=child_list
+	def getName(self):
+		return self.node_value	
+	def getDesc(self):
+		return self.desc
+	def setDesc(self,desc):
+		self.desc	=  desc
+		return self.desc	
+	def getRank(self):
+		return self.rank	
+	def setRank(self, rank):
+		self.rank = rank
+	def incrRank(self, incr=1):
+		self.rank =  self.rank	+ incr
+		return self.rank		
+	def getChildList(self):
+		if(self.child_list!=None):
+			return self.child_list
+		else:
+			return None	
+	def getChildByName(self,child_name):
+		if(self.child_list==None):
+			return None
+		for child in self.child_list:
+			if(child.getName()==child_name):
+				return child
+	def childIndex(self,childnode):
+		if(self.child_list==None):
+			return -1
+		for child in self.child_list:
+			if(child.getName()==childnode.getName()):
+				return self.child_list.index(child)
+		return -2		
+	def printChildList(self):
+		if(self.child_list==None):
+			return None
+		for child in self.child_list:
+			print child,
+	def addChildNode(self, node):
+		if(node!=None):
+			if(self.child_list==None):
+				self.child_list=[]
+			#Check whether this node is already present in the Ngram Tree	
+			member_index=self.childIndex(node)
+			if(member_index>=0):
+				#Node already present.Incrementing Rank
+				self.child_list[member_index].incrRank()
+			else:
+				self.child_list.append(node)		
+		#Keep it sorted as per the ranks		
+		self.child_list.sort()		
+	def removeChildNode(self, node):
+		if(node!=None & self.child_list!=None):
+			self.child_list.remove(node)		
+	def __str__(self):
+		return "Node:  %s[%d]" % (self.node_value, self.rank)
+	'''Recursively traverse through the tree and print the nodes-Depth First Traversal'''	
+	def toString(self):
+		print "Node:  %s[%d]" % (self.node_value, self.rank)
+		child_list=self.getChildList()
+		if(child_list!=None):
+			for child_node in child_list :
+				child_node.toString()
+	'''Defining the less than operater of the object'''			
+	def __lt__(self, node):
+		return 	self.getRank() < node.getRank()
+	'''Defining the greater than operater of the object'''
+	def __gt__(self, node):
+		return 	self.getRank() > node.getRank()
+	'''Defining the equal-to operater of the object'''	
+	def __eq__(self, node):
+		if(node==None):
+			return False
+		return 	(self.getName() == node.getName()) & (self.getRank() == node.getRank())
+	'''Defining the comparison of two object instances. Required for sorting the list of objects'''	
+	def __cmp__(self, node):
+		if(node==None):
+			return 1
+		if(self.getName()==node.getName()):
+			return cmp(self.getRank(), node.getRank())
+		else:
+			return 1	
+	
+
+#Syllable Node Class
+#Extends NgramNode class
+class SyllableNode(NgramNode):
+	def __str__(self):
+		return ("Syllable: %s[%d]" % (self.node_value, self.rank )).encode('utf-8')
+#Word Node Class
+#Extends NgramNode class
+class WordNode(NgramNode):
+	def __str__(self):
+		return  ("Word: %s[%d]" % (self.node_value, self.rank )).encode('utf-8')
+
+class NGram:
+	def __init__(self, text=None, language=None):
+		self.text=None
+		self.language=None
+		try:
+			#Try loading picked tree object
+			self.ngrams=pickle.load(open(PICKLED_TREE))
+			print "Loaded the ngram from " + PICKLED_TREE
+		except:	
+			#Initialize with empty node
+			self.ngrams=NgramNode()
+			print "New one"
+		self.search_depth=0	
+	def getRoot(self, node_name=None):
+		if(node_name==None):
+			return self.ngrams
+		else:
+			return self.searchNode(node_name)
+				
+	def searchNodeByName(self, node_name, current_node=None, depth=MAX_TREE_DEPTH):
+		if(current_node==None):
+			current_node=self.getRoot()
+			self.search_depth = 0
+		if(self.search_depth==depth):
+			return None
+		if(current_node.getName() == node_name):
+			print "Found at depth", self.search_depth
+			return current_node
+		else:
+			child_list=current_node.getChildList()
+			if(child_list==None):
+				return None
+			else:
+				child_list=child_list	
+			self.search_depth = self.search_depth+1				
+			for child_node in child_list :
+				result_node=self.searchNodeByName(node_name,child_node, depth)
+				if(result_node!=None):
+					return result_node
+	def printNgram(self, current_node=None):
+		if(current_node==None):
+			current_node=self.getRoot()
+		print current_node
+		child_list=current_node.getChildList()
+		
+		if(child_list==None):
+			return None
+		else:
+			child_list.sort()	
+		for child_node in child_list :
+			self.printNgram(child_node)
+	def toDot(self,  graph , current_node=None):
+		if(current_node==None):
+			current_node=self.getRoot()
+		child_list=current_node.getChildList()
+		if(child_list!=None):
+			key=current_node.getName()
+			for child_node in child_list:
+				value=child_node.getName()
+				if((key!=None) & ord(key[len(key)-1])<=0x0901 & len(key)>1):
+					key=key[0:len(key)-1]
+				if(value!=None):	
+					if((ord(value[len(value)-1])<=0x0901) & len(value)>1):
+						value=value[0:len(value)-2]	
+					graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8')))
+					self.toDot(graph,child_node)
+	def toGraph(self, output_image_file):
+		graph=pydot.Dot()
+		self.toDot(graph)
+		#print graph.to_string().encode('utf-8')
+		graph.write(output_image_file,"dot", "png" )
+		
+	def addSyllables(self,text, window_size=2):
+		words=text.split(" ")
+		ngrams = []
+		for word in words:
+			#TODO-Normalize before taking ngram!!!
+			word = "*"+word+"]"
+			syllables = self.syllabalize_ml(word)
+			syllable_count = len(syllables)
+			window_start = 0
+			window_end = 0
+			while window_start + window_size <= syllable_count:
+				if(window_start + window_size < syllable_count):
+					window_end = window_start + window_size
+				else:
+					window_end = syllable_count	
+				ngrams.append(syllables[window_start:window_end])
+				window_start = window_start+1
+		return 	ngrams
+	'''Syllabalize a given Malayalam string. Based on ml-split code by Baiju M'''		
+	def syllabalize_ml(self,text):
+		signs = [
+		u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
+		u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
+		u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
+		limiters = ['.','\"','\'','`','!',';',',','?', ']']
+		chandrakkala = u'\u0d4d'
+		lst_chars = []
+		for char in text:
+			if char in limiters:
+				lst_chars.append(char)
+			elif char in signs:
+				lst_chars[-1] = lst_chars[-1] + char
+			else:
+				try:
+					if lst_chars[-1][-1] == chandrakkala :
+						lst_chars[-1] = lst_chars[-1] + char
+					else:
+						lst_chars.append(char)
+				except IndexError:
+					lst_chars.append(char)
+
+		return lst_chars	
+	def addWords(self,text, window_size=2):
+		text = "* "+text+" ]"
+		words = text.split(" ")
+		ngrams = []
+		word_count = len(words)
+		window_start = 0
+		window_end = 0
+		while window_start + window_size <= word_count:
+			if(window_start + window_size < word_count):
+				window_end = window_start + window_size
+			else:
+				window_end = word_count	
+			words[window_start:window_end]	
+			ngrams.append(words[window_start:window_end])
+			window_start = window_start+1
+		return 	ngrams
+	def populateSyllableNgram(self, text):
+		ngrams = self.addSyllables(text)
+		for ngram in ngrams:
+			ngram_str=""
+			for item in ngram:
+				if(item.strip()>""):
+					if(ngram_str==""):
+						ngram_str=ngram_str+ item 
+					else:
+						
+						if(ngram_str=="["):
+							parent_node=self.getRoot()
+						else:	
+							parent_node=self.searchNodeByName(ngram_str,self.getRoot())
+						if(parent_node==None):
+							print "Parent node not found for " + item
+						else:	
+							parent_node.addChildNode(SyllableNode(item))
+							print ngram_str+ " -> "+item 	
+		#pickle the tree				
+		pickle.dump(self.getRoot(),open(PICKLED_TREE,'w'))
+ 	def populateWordNgram(self, text):
+		ng = NGram () 
+		ngrams = ng.addWords(text)
+		for ngram in ngrams:
+			ngram_str=""
+			for item in ngram:
+				if(item.strip()>""):
+					if(ngram_str==""):
+						ngram_str=ngram_str+ item 
+					else:
+						if(ngram_str=="*"):
+							parent_node=self.getRoot()
+						else:	
+							parent_node=self.searchNodeByName(ngram_str,self.getRoot())
+						if(parent_node==None):
+							print "Parent node not found for " + item
+						else:	
+							parent_node.addChildNode(WordNode(item))
+							print ngram_str+ " -> "+item 	
+		#pickle the tree				
+		pickle.dump(self.getRoot(),open(PICKLED_TREE,'w'))	
+if __name__ == "__main__":
+	usage = "usage: %prog [options] inputfile"
+	parser = OptionParser(version="%prog 0.1",description="Malayalama NGram Analyser")
+	parser.set_usage(usage)
+	parser.add_option("-g", "--generate-graph", dest="gen_graph",help="Generates a graph in png format to visualize the ngram")
+	parser.add_option("-p", "--print", action="store_true",default=False,dest="print_ngram",help="Print the Ngram")
+	parser.add_option("-i", "--input-file", dest="input_file",help="Input File for learning")
+	parser.add_option("-s", "--suggest-syllables", dest="suggest_syllables",help="Suggest next possible syllables for the given letter/syllable ")
+	parser.add_option("-w", "--suggest-words", dest="suggest_words",help="Suggest next possible words for the given word ")
+	(options, args) = parser.parse_args()
+	
+	if(options.gen_graph):
+		ng = NGram () 	
+		ng.toGraph(options.gen_graph)
+	if(options.	input_file):
+		if not os.path.exists(options.input_file):
+			print "File Doesnot Existis"
+			sys.exit(1)
+		else:
+			corpus_file = codecs. open(options.input_file,encoding='utf-8', errors='ignore')
+			ng = NGram () 	
+			while 1:
+	   			text = unicode( corpus_file.readline())	
+	   			if text == "":
+					break
+				text= text + " ]"	
+				ng.populateSyllableNgram(text)
+				ng.populateWordNgram(text)
+			print "Populated"
+	if(options.	print_ngram):
+		ng = NGram () 	
+		print ng.getRoot().toString()
+	if(options.	suggest_syllables):
+		ng = NGram () 	
+		print "Searching for" + options.suggest_words
+		print ng.searchNodeByName(unicode(options.	suggest_syllables))
+	if(options.	suggest_syllables):
+		ng = NGram () 	
+		print "Searching  for "+ options.suggest_words
+		print ng.searchNodeByName(unicode(options.	suggest_words))
+
+	
-- 
cgit 


From b4c9aab679ee466431a64688226ed870380d5b29 Mon Sep 17 00:00:00 2001
From: Santhosh Thottingal <santhosh.thottingal@gmail.com>
Date: Thu, 16 Apr 2009 20:51:39 +0530
Subject: Ngram model algorithm notes

---
 silpa/modules/ngram/algorithm | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 silpa/modules/ngram/algorithm

(limited to 'silpa/modules/ngram')

diff --git a/silpa/modules/ngram/algorithm b/silpa/modules/ngram/algorithm
new file mode 100644
index 0000000..495b85a
--- /dev/null
+++ b/silpa/modules/ngram/algorithm
@@ -0,0 +1,23 @@
+We have a TREE data structure. Each node in the tree is an instance of NgramNode.
+Each NgramNode object contains the string value of the node and a Rank.
+Rank is a count that is incremented each time the corresponding string occurs in the training corpus.
+
+NGramNode is the superclass of SyllableNgramNode and WordNgramNode.
+That means each node in the tree can be either a syllable or a word.
+We have only one tree for both words and syllables as of now.
+
+In the tree, the root node is an empty node with the label *. This indicates that all its children, whether syllables or words,
+mark the start of a word or a sentence, respectively.
+
+Meaning of "child of a node":
+"Y is a child of X" means that Y can follow immediately after an occurrence of X in the text, where X and Y are either syllables or words (only once in a tree route).
+X can have any number of children.
+The probability that a node in the list of children occurs in a given context is controlled by Rank(node).
+Rank is simply an integer value incremented based on the frequency of occurrence.
+The higher the rank, the higher the probability that the node follows immediately after X.
+
+Persistence of the populated tree is achieved by pickling the entire tree structure.
+
+Tree operations:
+a) Adding a syllable-ngram, n=2
+
-- 
cgit 


From 10d9985caf134fcf7ac85de8105de53c5d2442f1 Mon Sep 17 00:00:00 2001
From: Jinesh K J <jinsbond007@jinesh.cvit>
Date: Sat, 18 Apr 2009 22:00:56 +0530
Subject: corrected some spellings

---
 silpa/modules/ngram/ngram.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'silpa/modules/ngram')

diff --git a/silpa/modules/ngram/ngram.py b/silpa/modules/ngram/ngram.py
index 8f2d65b..cab2ed9 100644
--- a/silpa/modules/ngram/ngram.py
+++ b/silpa/modules/ngram/ngram.py
@@ -305,7 +305,7 @@ class NGram:
 		pickle.dump(self.getRoot(),open(PICKLED_TREE,'w'))	
 if __name__ == "__main__":
 	usage = "usage: %prog [options] inputfile"
-	parser = OptionParser(version="%prog 0.1",description="Malayalama NGram Analyser")
+	parser = OptionParser(version="%prog 0.1",description="Malayalam NGram Analyser")
 	parser.set_usage(usage)
 	parser.add_option("-g", "--generate-graph", dest="gen_graph",help="Generates a graph in png format to visualize the ngram")
 	parser.add_option("-p", "--print", action="store_true",default=False,dest="print_ngram",help="Print the Ngram")
@@ -317,9 +317,9 @@ if __name__ == "__main__":
 	if(options.gen_graph):
 		ng = NGram () 	
 		ng.toGraph(options.gen_graph)
-	if(options.	input_file):
+	if(options.input_file):
 		if not os.path.exists(options.input_file):
-			print "File Doesnot Existis"
+			print "File Doesnot Exist"
 			sys.exit(1)
 		else:
 			corpus_file = codecs. open(options.input_file,encoding='utf-8', errors='ignore')
@@ -332,16 +332,16 @@ if __name__ == "__main__":
 				ng.populateSyllableNgram(text)
 				ng.populateWordNgram(text)
 			print "Populated"
-	if(options.	print_ngram):
+	if(options.print_ngram):
 		ng = NGram () 	
 		print ng.getRoot().toString()
-	if(options.	suggest_syllables):
+	if(options.suggest_syllables):
 		ng = NGram () 	
 		print "Searching for" + options.suggest_words
-		print ng.searchNodeByName(unicode(options.	suggest_syllables))
-	if(options.	suggest_syllables):
+		print ng.searchNodeByName(unicode(options.suggest_syllables))
+	if(options.suggest_syllables):
 		ng = NGram () 	
 		print "Searching  for "+ options.suggest_words
-		print ng.searchNodeByName(unicode(options.	suggest_words))
+		print ng.searchNodeByName(unicode(options.suggest_words))
 
 	
-- 
cgit