#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Ngram
# Copyright 2008 Santhosh Thottingal
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in
import codecs
import os
import sys


class NGramVisualizer:
    """Build a word-bigram table from a text corpus and draw it with pydot.

    The table maps each word to a single string holding every word that
    followed it in the corpus, joined by " -> " in order of appearance
    (repeats are kept).
    """

    # Maximum number of node visits one generate_graph() call may make.
    MAX_VISITS = 200

    # Visits made by the current/last generate_graph() call.  Despite the
    # name this counts visits, not recursion depth.
    depth = 0

    def loadCorpus(self, corpus_file_name):
        """Read a UTF-8 corpus file and return its bigram table.

        Lines starting with '#' and blank lines are skipped.  Bigrams never
        span two lines.  Undecodable bytes are ignored.

        corpus_file_name -- path of the corpus file.
        Returns a dict mapping word -> "next1 -> next2 -> ...".
        """
        graph_dict = {}
        # The context manager closes the file even if decoding fails midway.
        with codecs.open(corpus_file_name, encoding='utf-8',
                         errors='ignore') as corpus_file:
            for text in corpus_file:
                if text.startswith('#'):
                    continue
                # split() without arguments collapses runs of whitespace, so
                # a double space can no longer yield an empty "word" that
                # would be stored as a successor and break the chain.
                words = text.split()
                for prev_word, word in zip(words, words[1:]):
                    if prev_word in graph_dict:
                        graph_dict[prev_word] += " -> " + word
                    else:
                        graph_dict[prev_word] = word
        return graph_dict

    def generate_full_graph(self, start_word, graph_dict, outputimage):
        """Build a graph of every bigram, or only those leaving start_word.

        start_word  -- when non-empty, only edges whose source is this word
                       are added; when empty/None, all edges are added.
        graph_dict  -- table as returned by loadCorpus().
        outputimage -- when non-empty, the graph is also written to this path.
        Returns the pydot.Dot graph that was built.
        """
        # Imported here because only the graph-drawing methods need pydot.
        import pydot

        graph = pydot.Dot()
        for key in graph_dict:
            if start_word and key != start_word:
                continue
            for value in graph_dict[key].split("->"):
                graph.add_edge(pydot.Edge(key, value.strip()))
        if outputimage:
            # NOTE(review): the original never used outputimage. Rendering
            # to it, with the format taken from the file extension (PNG when
            # there is none), is the presumed intent -- confirm.
            image_format = os.path.splitext(outputimage)[1][1:].lower()
            graph.write(outputimage, format=image_format or "png")
        return graph

    def generate_graph(self, graph_dict, graph, src):
        """Add to graph the bigram edges reachable from src.

        graph_dict -- table as returned by loadCorpus().
        graph      -- pydot graph to extend; it is modified and returned.
        src        -- word to start from.

        At most MAX_VISITS nodes are visited per call.  The visit counter is
        reset here so that every call gets the full budget.
        """
        self.depth = 0
        return self._expand_graph(graph_dict, graph, src)

    def _expand_graph(self, graph_dict, graph, src):
        """Recursive worker for generate_graph(): expand src depth-first."""
        import pydot

        self.depth += 1
        if graph.get_node(src) != []:
            return graph
        if self.depth > self.MAX_VISITS:
            return graph
        if src not in graph_dict:
            return graph
        for dest in graph_dict[src].split("->"):
            dest = dest.strip()
            # An edge that already exists means dest was expanded from here
            # before; skipping it is what stops cycles.
            if graph.get_edge(src, dest):
                continue
            graph.add_edge(pydot.Edge(src, dest))
            graph = self._expand_graph(graph_dict, graph, dest)
        return graph


if __name__ == "__main__":
    import pydot

    ngv = NGramVisualizer()
    graph_dict = ngv.loadCorpus("hi.txt")
    graph = ngv.generate_graph(graph_dict, pydot.Dot(), u"भारत")
    dot_source = graph.to_string()
    if sys.version_info[0] < 3:
        # Python 2 cannot print non-ASCII unicode to a pipe without encoding.
        dot_source = dot_source.encode("utf-8")
    print(dot_source)
deal with sentence breaks --- silpa/modules/ngram/ml.txt | 1 + silpa/modules/ngram/visualizer.py | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 silpa/modules/ngram/ml.txt (limited to 'silpa/modules/ngram') diff --git a/silpa/modules/ngram/ml.txt b/silpa/modules/ngram/ml.txt new file mode 100644 index 0000000..4c48980 --- /dev/null +++ b/silpa/modules/ngram/ml.txt @@ -0,0 +1 @@ +കടലില്‍ ജീവിക്കുന്ന ഒരു സസ്തനിയാണ് നീലത്തിമിംഗലം. ബലീന്‍ തിമിംഗലങ്ങളുടെ ഒരു ഉപജാതിയാണിവ. ലോകത്ത് ഇന്നുവരെയുള്ളതില്‍ ഏറ്റവും വലിയ ജീവിയായി കണക്കാക്കപ്പെടുന്ന നീലത്തിമിംഗലങ്ങള്‍ക്ക് 33 മീ. നീളവും 181 മെട്രിക് ടണിലധികം ഭാരവും ഉണ്ടാകാം. നീണ്ട ശരീരപ്രകൃതിയുള്ള നീലത്തിമിംഗലങ്ങളുടെ ശരീരം നീലകലര്‍ന്ന ചാരനിറത്തോടെയാണുണ്ടാവുക, ശരീരത്തിനടിഭാഗത്തേക്ക് നിറം കുറവായിരിക്കും. നീലത്തിമിംഗലങ്ങള്‍ക്ക് കുറഞ്ഞത് മൂന്നുപജാതികളെങ്കിലും ഉണ്ടെന്നു കരുതുന്നു. diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py index 0debe28..d46baeb 100644 --- a/silpa/modules/ngram/visualizer.py +++ b/silpa/modules/ngram/visualizer.py @@ -26,10 +26,12 @@ import codecs class NGramVisualizer: depth=0 def loadCorpus(self, corpus_file_name): + limiters = [".","!","?",",",";"] graph_dict = dict() line = [] line_number = 0 rule_number = 0 + corpus="" corpus_file = codecs. 
open(corpus_file_name,encoding='utf-8', errors='ignore') while 1: line_number = line_number +1 @@ -42,10 +44,22 @@ class NGramVisualizer: line = text.strip() if(line == ""): continue + corpus=corpus+" "+line + sentences=[] + sentence = "" + start = 0 + for index in range(0,len(corpus)): + for delimit in limiters: + if corpus[index] == delimit: + sentence = corpus[start:index] + sentences.append(sentence) + start = index+1 + for line in sentences: words=line.split(" ") word_count=len(words) prev_word="" for word in words: + #print word word=word.strip() if(prev_word==""): prev_word=word @@ -56,7 +70,8 @@ class NGramVisualizer: else: graph_dict[prev_word]=word prev_word=word - prev_word="" + prev_word="" + return graph_dict def generate_full_graph(self, start_word, graph_dict,outputimage): @@ -95,8 +110,8 @@ class NGramVisualizer: if __name__ == "__main__": ngv=NGramVisualizer () - graph_dict=ngv.loadCorpus ("hi.txt") + graph_dict=ngv.loadCorpus ("ml.txt") graph=pydot.Dot() - graph=ngv.generate_graph(graph_dict, graph,u"भारत") + graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്") print graph.to_string().encode("utf-8") #graph.write("ngvgraph-hi.png","dot", "raw" ) -- cgit From bba7b4bc146bb4623c6cc6ad27d70baf2e02497a Mon Sep 17 00:00:00 2001 From: Jinesh K J Date: Mon, 6 Apr 2009 03:37:15 +0530 Subject: corpus is now stored using pickle and can be reused --- silpa/modules/ngram/ngram_ml.txt | 219 ++++++++++++++++++++++++++++++++++++ silpa/modules/ngram/sample_ngram.py | 64 +++++++++++ 2 files changed, 283 insertions(+) create mode 100644 silpa/modules/ngram/ngram_ml.txt create mode 100644 silpa/modules/ngram/sample_ngram.py (limited to 'silpa/modules/ngram') diff --git a/silpa/modules/ngram/ngram_ml.txt b/silpa/modules/ngram/ngram_ml.txt new file mode 100644 index 0000000..c265b1c --- /dev/null +++ b/silpa/modules/ngram/ngram_ml.txt @@ -0,0 +1,219 @@ +(dp0 +V\u0d35\u0d32\u0d3f\u0d2f +p1 +V\u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f +p2 
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 +p3 +V\u0d36\u0d30\u0d40\u0d30\u0d02 +p4 +sV\u0d2e\u0d31\u0d4d\u0d31\u0d4d +p5 +V\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d -> \u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d +p6 +sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d +p7 +V33 -> \u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d +p8 +sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28 +p9 +V\u0d12\u0d30\u0d41 +p10 +sV\u0d06\u0d23\u0d41 +p11 +V\u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d +p12 +sV\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d +p13 +V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +p14 +sV\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28 +p15 +V\u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15 +p16 +sV\u0d36\u0d30\u0d40\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d28\u0d1f\u0d3f\u0d2d\u0d3e\u0d17\u0d24\u0d4d\u0d24\u0d47\u0d15\u0d4d\u0d15\u0d4d +p17 +V\u0d28\u0d3f\u0d31\u0d02 +p18 +sV\u0d28\u0d40\u0d23\u0d4d\u0d1f +p19 +V\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33 +p20 +sV\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02 +p21 +V\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41 +p22 +sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +p23 +V\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d +p24 
+sV\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 +p25 +V\u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d +p26 +sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 +p27 +V\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 +p28 +sV\u0d1a\u0d46\u0d31\u0d3f\u0d2f +p29 +V\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 +p30 +sV\u0d15\u0d1f\u0d32\u0d3f\u0d32\u0d4d\u200d +p31 +g9 +sg2 +V\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 +p32 +sV\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 +p33 +V\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d +p34 +sV\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 +p35 +V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f +p36 +sg32 +g7 +sV\u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d +p37 +g21 +sV\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d +p38 +V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 +p39 +sV181 +p40 +V\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d +p41 +sV\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02 +p42 +g1 +sV\u0d15\u0d23\u0d4d\u0d1f\u0d41 +p43 +V\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 +p44 +sV\u0d26\u0d15\u0d4d\u0d37\u0d3f\u0d23 +p45 +V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d 
+p46 +sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d +p47 +V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 +p48 +sV\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d +p49 +V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 +p50 +sV\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d +p51 +g42 +sV\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02 +p52 +V\u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02 +p53 +sV\u0d28\u0d40\u0d33\u0d35\u0d41\u0d02 +p54 +g40 +sV\u0d2a\u0d4b\u0d32\u0d46 +p55 +V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 +p56 +sg41 +V\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02 +p57 +sV\u0d1a\u0d46\u0d31\u0d41\u0d2e\u0d24\u0d4d\u0d38\u0d4d\u0d2f\u0d19\u0d4d\u0d19\u0d33\u0d47\u0d2f\u0d41\u0d02 +p58 +V\u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f +p59 +sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 +p60 +V\u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41 +p61 +sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d +p62 +V\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 +p63 +sV\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d +p64 +V\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d +p65 +sg10 +V\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35 
+p66 +sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 +p67 +V\u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 +p68 +sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 +p69 +V\u0d12\u0d30\u0d41 +p70 +sV\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 +p71 +V\u0d2c\u0d3f -> \u0d2c\u0d3f +p72 +sg22 +V\u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41 +p73 +sV\u0d07\u0d28\u0d4d\u0d24\u0d4d\u0d2f\u0d28\u0d4d\u200d +p74 +V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d +p75 +sV\u0d2e\u0d38\u0d4d\u0d15\u0d41\u0d32\u0d38\u0d4d +p76 +V(B -> (B +p77 +sg57 +g52 +sV\u0d32\u0d4b\u0d15\u0d24\u0d4d\u0d24\u0d4d +p78 +g51 +sV\u0d07\u0d28\u0d4d\u200d\u0d21\u0d3f\u0d15 +p79 +V\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 +p80 +sV33 +p81 +V\u0d2e\u0d40 +p82 +sg18 +V\u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02 +p83 +sg20 +g3 +sV\u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d +p84 +V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +p85 +sg4 +g15 +sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +p86 +V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 +p87 +sV\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 +p88 
+V\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 +p89 +sV\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d +p90 +V\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 +p91 +sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 +p92 +V\u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46 +p93 +sV\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 +p94 +V\u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d +p95 +sV\u0d1a\u0d46\u0d31\u0d41 +p96 +V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 +p97 +sV\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 +p98 +V\u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41 +p99 +s. \ No newline at end of file diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py new file mode 100644 index 0000000..0f46ad6 --- /dev/null +++ b/silpa/modules/ngram/sample_ngram.py @@ -0,0 +1,64 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +# Ngram +# Copyright 2009 Jinesh K J +# Copyright 2009 Swathantra Malayalam Computing +# http://www.smc.org.in +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# See the GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions email: jinesh.k@gmail.com or smc-discuss@googlegroups.com
# URL: http://www.smc.org.in
import codecs
import pickle
import sys

# pydot and visualizer are imported inside the functions that use them, so
# that argument validation and the usage message work without them.

USAGE = """\
python sample_ngram.py <input file> <corpus path> 1
    Generate the corpus for the given input file if the corpus at
    <corpus path> is missing or empty; otherwise extend the existing
    corpus with the additional file.

python sample_ngram.py <corpus path> <start word> 2
    Print the graph for the given start word from the corpus stored at
    <corpus path>.

This is just a crude attempt, a lot more improvement is to be done.
"""


def _load_graph_dict(corpus):
    """Return the pickled table stored at corpus, or {} if missing or empty."""
    try:
        corpusfile = open(corpus, "rb")
    except IOError:
        return {}
    with corpusfile:
        try:
            # SECURITY: unpickling can execute arbitrary code; only point
            # this at corpus files this program wrote itself.
            return pickle.load(corpusfile)
        except EOFError:
            # A zero-length corpus file counts as "no corpus yet".
            return {}


def genCorpus(infile, corpus):
    """Add the bigrams of infile to the pickled corpus at path corpus.

    infile -- path of the UTF-8 text file to read.
    corpus -- path of the pickle file to create or extend.
    """
    from visualizer import NGramVisualizer

    ngv = NGramVisualizer()
    graph_dict = _load_graph_dict(corpus)
    # NOTE(review): this passes the existing table as a second argument;
    # confirm the installed visualizer.loadCorpus accepts it.
    graph_dict = ngv.loadCorpus(infile, graph_dict)
    with open(corpus, "wb") as corpusfile:
        # Protocol 0 is the Python 2 default the original relied on; pinning
        # it keeps existing corpus files and new ones in the same format.
        pickle.dump(graph_dict, corpusfile, 0)


def printGraph(corpus, start_word):
    """Print, in DOT syntax, the graph reachable from start_word.

    corpus     -- path of a pickle file written by genCorpus().
    start_word -- word to start from; UTF-8 bytes are decoded first.
    Raises IOError if the corpus file cannot be opened.
    """
    import pydot
    from visualizer import NGramVisualizer

    ngv = NGramVisualizer()
    with open(corpus, "rb") as corpusfile:
        # SECURITY: see _load_graph_dict -- trusted corpus files only.
        graph_dict = pickle.load(corpusfile)
    if isinstance(start_word, bytes):
        start_word = start_word.decode("utf-8")
    graph = ngv.generate_graph(graph_dict, pydot.Dot(), start_word)
    dot_source = graph.to_string()
    if sys.version_info[0] < 3:
        # Python 2 cannot print non-ASCII unicode to a pipe without encoding.
        dot_source = dot_source.encode("utf-8")
    print(dot_source)


def main(argv):
    """Dispatch on argv[3] ("1" = genCorpus, "2" = printGraph).

    Returns 0 on success, or 2 after printing USAGE to stderr when the
    arguments are missing or the mode is unknown.
    """
    if len(argv) < 4 or argv[3] not in ("1", "2"):
        sys.stderr.write(USAGE)
        return 2
    if argv[3] == "1":
        genCorpus(argv[1], argv[2])
    else:
        printGraph(argv[1], argv[2])
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
-sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28 +sV\u0d06\u0d23\u0d41 p9 -V\u0d12\u0d30\u0d41 +V\u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d p10 -sV\u0d06\u0d23\u0d41 +sV\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d p11 -V\u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d +V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 p12 -sV\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d +sV\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28 p13 -V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +V\u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15 -> \u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15 p14 -sV\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28 +sV\u0d36\u0d30\u0d40\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d28\u0d1f\u0d3f\u0d2d\u0d3e\u0d17\u0d24\u0d4d\u0d24\u0d47\u0d15\u0d4d\u0d15\u0d4d p15 -V\u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15 +V\u0d28\u0d3f\u0d31\u0d02 -> \u0d28\u0d3f\u0d31\u0d02 p16 
-sV\u0d36\u0d30\u0d40\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d28\u0d1f\u0d3f\u0d2d\u0d3e\u0d17\u0d24\u0d4d\u0d24\u0d47\u0d15\u0d4d\u0d15\u0d4d +sV\u0d28\u0d40\u0d23\u0d4d\u0d1f p17 -V\u0d28\u0d3f\u0d31\u0d02 +V\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33 -> \u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33 p18 -sV\u0d28\u0d40\u0d23\u0d4d\u0d1f +sV\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02 p19 -V\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33 +V\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41 -> \u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41 p20 -sV\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02 +sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 p21 -V\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41 +V\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d p22 -sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +sV\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 p23 -V\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d +V\u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d p24 -sV\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 +sV\u0d1a\u0d46\u0d31\u0d3f\u0d2f p25 
-V\u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d +V\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 p26 -sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 +sV\u0d15\u0d1f\u0d32\u0d3f\u0d32\u0d4d\u200d p27 -V\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 +V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28 p28 -sV\u0d1a\u0d46\u0d31\u0d3f\u0d2f +sV\u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f p29 -V\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 +V\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 p30 -sV\u0d15\u0d1f\u0d32\u0d3f\u0d32\u0d4d\u200d +sV\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 p31 -g9 -sg2 -V\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 +V\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d p32 -sV\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 +sV\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 p33 -V\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d 
+V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f p34 -sV\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 +sV\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 p35 -V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f +V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d p36 -sg32 -g7 sV\u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d p37 -g21 -sV\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d +V\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02 p38 -V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 +sV\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d p39 -sV181 +V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 p40 -V\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d +sV181 p41 -sV\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02 +V\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d -> \u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d p42 -g1 -sV\u0d15\u0d23\u0d4d\u0d1f\u0d41 +sV\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02 p43 -V\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 +V\u0d35\u0d32\u0d3f\u0d2f -> \u0d35\u0d32\u0d3f\u0d2f p44 -sV\u0d26\u0d15\u0d4d\u0d37\u0d3f\u0d23 
+sV\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33 p45 -V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d +V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 p46 -sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d +sV\u0d15\u0d23\u0d4d\u0d1f\u0d41 p47 -V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 +V\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 p48 -sV\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d +sV\u0d26\u0d15\u0d4d\u0d37\u0d3f\u0d23 p49 -V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 +V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d p50 -sV\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d +sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d p51 -g42 -sV\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02 +V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 p52 -V\u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02 +sV\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d p53 -sV\u0d28\u0d40\u0d33\u0d35\u0d41\u0d02 +V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 p54 -g40 
-sV\u0d2a\u0d4b\u0d32\u0d46 +sV\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d p55 -V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 +V\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02 -> \u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02 p56 -sg41 -V\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02 +sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 p57 -sV\u0d1a\u0d46\u0d31\u0d41\u0d2e\u0d24\u0d4d\u0d38\u0d4d\u0d2f\u0d19\u0d4d\u0d19\u0d33\u0d47\u0d2f\u0d41\u0d02 +V\u0d36\u0d30\u0d40\u0d30\u0d02 -> \u0d36\u0d30\u0d40\u0d30\u0d02 p58 -V\u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f +sV\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02 p59 -sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 +V\u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02 -> \u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02 p60 -V\u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41 +sV\u0d2a\u0d4b\u0d32\u0d46 p61 -sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d +V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 p62 -V\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 +sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 p63 -sV\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d +V\u0d12\u0d30\u0d41 -> 
\u0d12\u0d30\u0d41 p64 -V\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d +sV\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d p65 -sg10 -V\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35 +V\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02 -> \u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02 p66 -sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 +sV\u0d1a\u0d46\u0d31\u0d41\u0d2e\u0d24\u0d4d\u0d38\u0d4d\u0d2f\u0d19\u0d4d\u0d19\u0d33\u0d47\u0d2f\u0d41\u0d02 p67 -V\u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 +V\u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f p68 -sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 +sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 p69 -V\u0d12\u0d30\u0d41 +V\u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41 p70 -sV\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 +sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d p71 -V\u0d2c\u0d3f -> \u0d2c\u0d3f +V\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 p72 -sg22 -V\u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41 +sV\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d p73 -sV\u0d07\u0d28\u0d4d\u0d24\u0d4d\u0d2f\u0d28\u0d4d\u200d +V\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> 
\u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d p74 -V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d +sV\u0d12\u0d30\u0d41 p75 -sV\u0d2e\u0d38\u0d4d\u0d15\u0d41\u0d32\u0d38\u0d4d +V\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35 -> \u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35 p76 -V(B -> (B +sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 p77 -sg57 -g52 -sV\u0d32\u0d4b\u0d15\u0d24\u0d4d\u0d24\u0d4d +V\u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 p78 -g51 -sV\u0d07\u0d28\u0d4d\u200d\u0d21\u0d3f\u0d15 +sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 p79 -V\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 +V\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 p80 -sV33 +sV\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41 p81 -V\u0d2e\u0d40 
+V\u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41 -> \u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41 p82 -sg18 -V\u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02 +sV\u0d07\u0d28\u0d4d\u0d24\u0d4d\u0d2f\u0d28\u0d4d\u200d p83 -sg20 -g3 -sV\u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d +V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d p84 -V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +sV\u0d2e\u0d38\u0d4d\u0d15\u0d41\u0d32\u0d38\u0d4d p85 -sg4 -g15 -sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +V(B -> (B -> (B p86 -V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 +sV\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02 p87 -sV\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 +V\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02 -> \u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02 p88 -V\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 +sV\u0d32\u0d4b\u0d15\u0d24\u0d4d\u0d24\u0d4d p89 -sV\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d +V\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d -> 
\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d p90 -V\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 +sV\u0d07\u0d28\u0d4d\u200d\u0d21\u0d3f\u0d15 p91 -sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 +V\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 p92 -V\u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46 +sV33 p93 -sV\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 +V\u0d2e\u0d40 -> \u0d2e\u0d40 p94 -V\u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d +sV\u0d28\u0d3f\u0d31\u0d02 p95 -sV\u0d1a\u0d46\u0d31\u0d41 +V\u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02 -> \u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02 p96 -V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -p97 sV\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 +p97 +V\u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41 p98 -V\u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41 +sV\u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d p99 +V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +p100 +sV\u0d36\u0d30\u0d40\u0d30\u0d02 +p101 
+V\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28 -> \u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28 +p102 +sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 +p103 +V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 +p104 +sV\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 +p105 +V\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 +p106 +sV\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d +p107 +V\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 +p108 +sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 +p109 +V\u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46 +p110 +sV\u0d28\u0d40\u0d33\u0d35\u0d41\u0d02 +p111 +V181 -> 181 +p112 +sV\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 +p113 +V\u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d +p114 +sV\u0d1a\u0d46\u0d31\u0d41 +p115 +V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 +p116 
+sV\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 +p117 +V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f +p118 s. \ No newline at end of file diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py index 0f46ad6..d373aa8 100644 --- a/silpa/modules/ngram/sample_ngram.py +++ b/silpa/modules/ngram/sample_ngram.py @@ -26,17 +26,7 @@ import codecs import pickle import sys from visualizer import NGramVisualizer -def genCorpus(infile,corpus): - ngv=NGramVisualizer () - try: - corpusfile = open(corpus) - except IOError: - graph_dict = dict() - else: - graph_dict = pickle.load(corpusfile) -# corpusfile.close() - graph_dict=ngv.loadCorpus (infile,graph_dict) - pickle.dump(graph_dict,open(corpus,'w')) + def printGraph(corpus,start_word): ngv=NGramVisualizer () graph_dict=pickle.load(open(corpus)) @@ -51,7 +41,7 @@ if __name__ == "__main__": """ python sample_ngram.py 1 this will generate the corpus for the given input file, if corpus specified at - corpus path is empty. Else it will recreate the corpus for the with the additional files. + corpus path is empty. Else it will recreate the corpus with the additional data. python sample_ngram.py 2 this will generate the graph for the given start word in the given corpus at corpus path. @@ -59,6 +49,7 @@ if __name__ == "__main__": This is just a crude attempt, a lot more improvement is to be done. 
""" if sys.argv[3] == "1": - genCorpus(sys.argv[1],sys.argv[2]) + ngv=NGramVisualizer () + ngv.loadCorpus(sys.argv[1],sys.argv[2]) elif sys.argv[3] == "2": printGraph(sys.argv[1],sys.argv[2]) diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py index d46baeb..e2c1136 100644 --- a/silpa/modules/ngram/visualizer.py +++ b/silpa/modules/ngram/visualizer.py @@ -22,20 +22,27 @@ # URL: http://www.smc.org.in import pydot import codecs +import pickle class NGramVisualizer: depth=0 - def loadCorpus(self, corpus_file_name): + def loadCorpus(self,new_file_name,corpus_file_name): limiters = [".","!","?",",",";"] - graph_dict = dict() + try: + corpusfile = open(corpus_file_name) + except IOError: + graph_dict = dict() + else: + graph_dict = pickle.load(corpusfile) + # graph_dict = dict() line = [] line_number = 0 rule_number = 0 corpus="" - corpus_file = codecs. open(corpus_file_name,encoding='utf-8', errors='ignore') + data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore') while 1: line_number = line_number +1 - text = unicode( corpus_file.readline()) + text = unicode( data_file.readline()) if text == "": break if text[0] == '#': @@ -72,7 +79,8 @@ class NGramVisualizer: prev_word=word prev_word="" - return graph_dict + pickle.dump(graph_dict,open(corpus_file_name,'w')) + #return graph_dict def generate_full_graph(self, start_word, graph_dict,outputimage): for key in graph_dict.iterkeys(): @@ -108,10 +116,12 @@ class NGramVisualizer: return graph -if __name__ == "__main__": - ngv=NGramVisualizer () - graph_dict=ngv.loadCorpus ("ml.txt") - graph=pydot.Dot() - graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്") - print graph.to_string().encode("utf-8") +#if __name__ == "__main__": +# ngv=NGramVisualizer () +# graph_dict = dict() +# graph_dict=ngv.loadCorpus ("ml.txt",graph_dict) +# pickle.dump(graph_dict,open('ngram_ml.txt','w')) +# graph=pydot.Dot() +# graph=ngv.generate_graph(graph_dict, 
graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്") +# print graph.to_string().encode("utf-8") #graph.write("ngvgraph-hi.png","dot", "raw" ) -- cgit From 1b62cfff2a910765b700bbea15786a1d62d800ef Mon Sep 17 00:00:00 2001 From: Jinesh K J Date: Tue, 7 Apr 2009 03:10:02 +0530 Subject: Updated the ngram module with option parser --- silpa/modules/ngram/sample_ngram.py | 57 +++++++++++++++++++++++++++---------- silpa/modules/ngram/visualizer.py | 19 +------------ 2 files changed, 43 insertions(+), 33 deletions(-) (limited to 'silpa/modules/ngram') diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py index d373aa8..ec4b184 100644 --- a/silpa/modules/ngram/sample_ngram.py +++ b/silpa/modules/ngram/sample_ngram.py @@ -26,6 +26,27 @@ import codecs import pickle import sys from visualizer import NGramVisualizer +from optparse import OptionParser + +def getData(new_file_name): + line = [] + line_number = 0 + rule_number = 0 + corpus="" + data_file = codecs. open(new_file_name,encoding='utf-8', errors='ignore') + while 1: + line_number = line_number +1 + text = unicode( data_file.readline()) + if text == "": + break + if text[0] == '#': + continue + line_number = line_number +1 + line = text.strip() + if(line == ""): + continue + corpus=corpus+" "+line + return corpus def printGraph(corpus,start_word): ngv=NGramVisualizer () @@ -38,18 +59,24 @@ def printGraph(corpus,start_word): print graph.to_string().encode("utf-8") if __name__ == "__main__": - """ - python sample_ngram.py 1 - this will generate the corpus for the given input file, if corpus specified at - corpus path is empty. Else it will recreate the corpus with the additional data. - - python sample_ngram.py 2 - this will generate the graph for the given start word in the given corpus at corpus path. - - This is just a crude attempt, a lot more improvement is to be done. 
- """ - if sys.argv[3] == "1": - ngv=NGramVisualizer () - ngv.loadCorpus(sys.argv[1],sys.argv[2]) - elif sys.argv[3] == "2": - printGraph(sys.argv[1],sys.argv[2]) + usage = "usage: %prog [options] INPUTDATA CORPUSFILE" + parser = OptionParser(version="%prog 1.0",description="Sample program to add data INPUTDATA to the corpus in CORPUSFILE") + parser.set_usage(usage) + parser.add_option("-s", "--start-word", dest="start_word",action="store_true",default=False,help="Creates a graph beginning from INPUTDATA") + parser.add_option("-f", "--file", action="store_true",default=False,dest="infile",help="Gets Data from file INPUTDATA") + (options, args) = parser.parse_args() + if len(args) != 2 : + parser.error("incorrect number of arguments") + if options.infile and options.start_word: + parser.error("options -f and -s are mutually exclusive") + ngv=NGramVisualizer () + if options.infile: + data = getData(args[0]) + # print data + ngv.loadCorpus(data,args[1]) + elif options.start_word: + printGraph(args[1],args[0]) + else: + # print args[0] + data = args[0].decode("utf-8") + ngv.loadCorpus(data,args[1]) diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py index e2c1136..585c11f 100644 --- a/silpa/modules/ngram/visualizer.py +++ b/silpa/modules/ngram/visualizer.py @@ -26,7 +26,7 @@ import pickle class NGramVisualizer: depth=0 - def loadCorpus(self,new_file_name,corpus_file_name): + def loadCorpus(self,corpus,corpus_file_name): limiters = [".","!","?",",",";"] try: corpusfile = open(corpus_file_name) @@ -35,23 +35,6 @@ class NGramVisualizer: else: graph_dict = pickle.load(corpusfile) # graph_dict = dict() - line = [] - line_number = 0 - rule_number = 0 - corpus="" - data_file = codecs. 
open(new_file_name,encoding='utf-8', errors='ignore') - while 1: - line_number = line_number +1 - text = unicode( data_file.readline()) - if text == "": - break - if text[0] == '#': - continue - line_number = line_number +1 - line = text.strip() - if(line == ""): - continue - corpus=corpus+" "+line sentences=[] sentence = "" start = 0 -- cgit From 8c71596c75ce7ca1d716fc0f93e24b9ca821f081 Mon Sep 17 00:00:00 2001 From: Santhosh Thottingal Date: Thu, 16 Apr 2009 20:49:49 +0530 Subject: NGRAM model for Malayalam --- silpa/modules/ngram/ngram.py | 347 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 347 insertions(+) create mode 100644 silpa/modules/ngram/ngram.py (limited to 'silpa/modules/ngram') diff --git a/silpa/modules/ngram/ngram.py b/silpa/modules/ngram/ngram.py new file mode 100644 index 0000000..8f2d65b --- /dev/null +++ b/silpa/modules/ngram/ngram.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Ngram +# Copyright 2008-2009 Santhosh Thottingal +# http://www.smc.org.in +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+# + +import codecs +import pickle +import pydot +import os,sys +from optparse import OptionParser +VERSION=0.1 +MAX_TREE_DEPTH=1000 +PICKLED_TREE="ngram.pyo" +class NgramNode: + def __init__(self, node_value="*", rank=1, child_list=None): + self.node_value=node_value + self.rank=rank + self.child_list=child_list + self.desc="Start Node" + def setNode(self, node_value="*", rank=None,childs=None, child_list=None): + self.node_value=node_value + self.rank=rank + self.child_list=child_list + def getName(self): + return self.node_value + def getDesc(self): + return self.desc + def setDesc(self,desc): + self.desc = desc + return self.desc + def getRank(self): + return self.rank + def setRank(self, rank): + self.rank = rank + def incrRank(self, incr=1): + self.rank = self.rank + incr + return self.rank + def getChildList(self): + if(self.child_list!=None): + return self.child_list + else: + return None + def getChildByName(self,child_name): + if(self.child_list==None): + return None + for child in self.child_list: + if(child.getName()==child_name): + return child + def childIndex(self,childnode): + if(self.child_list==None): + return -1 + for child in self.child_list: + if(child.getName()==childnode.getName()): + return self.child_list.index(child) + return -2 + def printChildList(self): + if(self.child_list==None): + return None + for child in self.child_list: + print child, + def addChildNode(self, node): + if(node!=None): + if(self.child_list==None): + self.child_list=[] + #Check whether this node is already present in the Ngram Tree + member_index=self.childIndex(node) + if(member_index>=0): + #Node already present.Incrementing Rank + self.child_list[member_index].incrRank() + else: + self.child_list.append(node) + #Keep it sorted as per the ranks + self.child_list.sort() + def removeChildNode(self, node): + if(node!=None & self.child_list!=None): + self.child_list.remove(node) + def __str__(self): + return "Node: %s[%d]" % (self.node_value, self.rank) + 
'''Recursively traverse through the tree and print the nodes-Depth First Traversal''' + def toString(self): + print "Node: %s[%d]" % (self.node_value, self.rank) + child_list=self.getChildList() + if(child_list!=None): + for child_node in child_list : + child_node.toString() + '''Defining the less than operater of the object''' + def __lt__(self, node): + return self.getRank() < node.getRank() + '''Defining the greater than operater of the object''' + def __gt__(self, node): + return self.getRank() > node.getRank() + '''Defining the equal-to operater of the object''' + def __eq__(self, node): + if(node==None): + return False + return (self.getName() == node.getName()) & (self.getRank() == node.getRank()) + '''Defining the comparison of two object instances. Required for sorting the list of objects''' + def __cmp__(self, node): + if(node==None): + return 1 + if(self.getName()==node.getName()): + return cmp(self.getRank(), node.getRank()) + else: + return 1 + + +#Syllable Node Class +#Extends NgramNode class +class SyllableNode(NgramNode): + def __str__(self): + return ("Syllable: %s[%d]" % (self.node_value, self.rank )).encode('utf-8') +#Word Node Class +#Extends NgramNode class +class WordNode(NgramNode): + def __str__(self): + return ("Word: %s[%d]" % (self.node_value, self.rank )).encode('utf-8') + +class NGram: + def __init__(self, text=None, language=None): + self.text=None + self.language=None + try: + #Try loading picked tree object + self.ngrams=pickle.load(open(PICKLED_TREE)) + print "Loaded the ngram from " + PICKLED_TREE + except: + #Initialize with empty node + self.ngrams=NgramNode() + print "New one" + self.search_depth=0 + def getRoot(self, node_name=None): + if(node_name==None): + return self.ngrams + else: + return self.searchNode(node_name) + + def searchNodeByName(self, node_name, current_node=None, depth=MAX_TREE_DEPTH): + if(current_node==None): + current_node=self.getRoot() + self.search_depth = 0 + if(self.search_depth==depth): + return None + 
if(current_node.getName() == node_name): + print "Found at depth", self.search_depth + return current_node + else: + child_list=current_node.getChildList() + if(child_list==None): + return None + else: + child_list=child_list + self.search_depth = self.search_depth+1 + for child_node in child_list : + result_node=self.searchNodeByName(node_name,child_node, depth) + if(result_node!=None): + return result_node + def printNgram(self, current_node=None): + if(current_node==None): + current_node=self.getRoot() + print current_node + child_list=current_node.getChildList() + + if(child_list==None): + return None + else: + child_list.sort() + for child_node in child_list : + self.printNgram(child_node) + def toDot(self, graph , current_node=None): + if(current_node==None): + current_node=self.getRoot() + child_list=current_node.getChildList() + if(child_list!=None): + key=current_node.getName() + for child_node in child_list: + value=child_node.getName() + if((key!=None) & ord(key[len(key)-1])<=0x0901 & len(key)>1): + key=key[0:len(key)-1] + if(value!=None): + if((ord(value[len(value)-1])<=0x0901) & len(value)>1): + value=value[0:len(value)-2] + graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8'))) + self.toDot(graph,child_node) + def toGraph(self, output_image_file): + graph=pydot.Dot() + self.toDot(graph) + #print graph.to_string().encode('utf-8') + graph.write(output_image_file,"dot", "png" ) + + def addSyllables(self,text, window_size=2): + words=text.split(" ") + ngrams = [] + for word in words: + #TODO-Normalize before taking ngram!!! 
+ word = "*"+word+"]" + syllables = self.syllabalize_ml(word) + syllable_count = len(syllables) + window_start = 0 + window_end = 0 + while window_start + window_size <= syllable_count: + if(window_start + window_size < syllable_count): + window_end = window_start + window_size + else: + window_end = syllable_count + ngrams.append(syllables[window_start:window_end]) + window_start = window_start+1 + return ngrams + '''Syllabalize a given Malayalam string. Based on ml-split code by Baiju M''' + def syllabalize_ml(self,text): + signs = [ + u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41', + u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48', + u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d'] + limiters = ['.','\"','\'','`','!',';',',','?', ']'] + chandrakkala = u'\u0d4d' + lst_chars = [] + for char in text: + if char in limiters: + lst_chars.append(char) + elif char in signs: + lst_chars[-1] = lst_chars[-1] + char + else: + try: + if lst_chars[-1][-1] == chandrakkala : + lst_chars[-1] = lst_chars[-1] + char + else: + lst_chars.append(char) + except IndexError: + lst_chars.append(char) + + return lst_chars + def addWords(self,text, window_size=2): + text = "* "+text+" ]" + words = text.split(" ") + ngrams = [] + word_count = len(words) + window_start = 0 + window_end = 0 + while window_start + window_size <= word_count: + if(window_start + window_size < word_count): + window_end = window_start + window_size + else: + window_end = word_count + words[window_start:window_end] + ngrams.append(words[window_start:window_end]) + window_start = window_start+1 + return ngrams + def populateSyllableNgram(self, text): + ngrams = self.addSyllables(text) + for ngram in ngrams: + ngram_str="" + for item in ngram: + if(item.strip()>""): + if(ngram_str==""): + ngram_str=ngram_str+ item + else: + + if(ngram_str=="["): + parent_node=self.getRoot() + else: + parent_node=self.searchNodeByName(ngram_str,self.getRoot()) + if(parent_node==None): + print "Parent node 
not found for " + item + else: + parent_node.addChildNode(SyllableNode(item)) + print ngram_str+ " -> "+item + #pickle the tree + pickle.dump(self.getRoot(),open(PICKLED_TREE,'w')) + def populateWordNgram(self, text): + ng = NGram () + ngrams = ng.addWords(text) + for ngram in ngrams: + ngram_str="" + for item in ngram: + if(item.strip()>""): + if(ngram_str==""): + ngram_str=ngram_str+ item + else: + if(ngram_str=="*"): + parent_node=self.getRoot() + else: + parent_node=self.searchNodeByName(ngram_str,self.getRoot()) + if(parent_node==None): + print "Parent node not found for " + item + else: + parent_node.addChildNode(WordNode(item)) + print ngram_str+ " -> "+item + #pickle the tree + pickle.dump(self.getRoot(),open(PICKLED_TREE,'w')) +if __name__ == "__main__": + usage = "usage: %prog [options] inputfile" + parser = OptionParser(version="%prog 0.1",description="Malayalama NGram Analyser") + parser.set_usage(usage) + parser.add_option("-g", "--generate-graph", dest="gen_graph",help="Generates a graph in png format to visualize the ngram") + parser.add_option("-p", "--print", action="store_true",default=False,dest="print_ngram",help="Print the Ngram") + parser.add_option("-i", "--input-file", dest="input_file",help="Input File for learning") + parser.add_option("-s", "--suggest-syllables", dest="suggest_syllables",help="Suggest next possible syllables for the given letter/syllable ") + parser.add_option("-w", "--suggest-words", dest="suggest_words",help="Suggest next possible words for the given word ") + (options, args) = parser.parse_args() + + if(options.gen_graph): + ng = NGram () + ng.toGraph(options.gen_graph) + if(options. input_file): + if not os.path.exists(options.input_file): + print "File Doesnot Existis" + sys.exit(1) + else: + corpus_file = codecs. 
open(options.input_file,encoding='utf-8', errors='ignore') + ng = NGram () + while 1: + text = unicode( corpus_file.readline()) + if text == "": + break + text= text + " ]" + ng.populateSyllableNgram(text) + ng.populateWordNgram(text) + print "Populated" + if(options. print_ngram): + ng = NGram () + print ng.getRoot().toString() + if(options. suggest_syllables): + ng = NGram () + print "Searching for" + options.suggest_words + print ng.searchNodeByName(unicode(options. suggest_syllables)) + if(options. suggest_syllables): + ng = NGram () + print "Searching for "+ options.suggest_words + print ng.searchNodeByName(unicode(options. suggest_words)) + + -- cgit From b4c9aab679ee466431a64688226ed870380d5b29 Mon Sep 17 00:00:00 2001 From: Santhosh Thottingal Date: Thu, 16 Apr 2009 20:51:39 +0530 Subject: Ngram model algorithm notes --- silpa/modules/ngram/algorithm | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 silpa/modules/ngram/algorithm (limited to 'silpa/modules/ngram') diff --git a/silpa/modules/ngram/algorithm b/silpa/modules/ngram/algorithm new file mode 100644 index 0000000..495b85a --- /dev/null +++ b/silpa/modules/ngram/algorithm @@ -0,0 +1,23 @@ +We have a TREE data structure. Each node in the tree is an instance of NgramNode. +Each NgramNode objects contains a string value of the node and a Rank +Rank is the incremented frequency of occurance of the corresponding string in the training corpus + +NGramNode is a super class of SyllableNgramNode and WordNgramNode +That means, each node in the tree can be either a syllable or a word. +We have only one tree for both words and syllables as of now + +In the tree, the root node is an empty node with label *. That indicates that all its childs, either syllables or words, +are start of word or sentence respectively. 
+ +Child of a node meaning: +Y is a child ofX means , Y can follow immediately after the occurance of X in the text, Where X,Y are either syllable or word(only one time in a tree route) +X can have any number of childs. +The probability that a node in the list of childs occur in a given context is controlled by Rank(node) +Rank is nothing but integer values incremented based on frequency of occurance. +Higher the rank, higher the probability that the node can follow immediately after X + +Persistance of the populated tree is achieved through pickling the entire tree structure. + +Tree operations: +a) Adding a syllable-ngram, n=2 + -- cgit From 10d9985caf134fcf7ac85de8105de53c5d2442f1 Mon Sep 17 00:00:00 2001 From: Jinesh K J Date: Sat, 18 Apr 2009 22:00:56 +0530 Subject: corrected some spellings --- silpa/modules/ngram/ngram.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'silpa/modules/ngram') diff --git a/silpa/modules/ngram/ngram.py b/silpa/modules/ngram/ngram.py index 8f2d65b..cab2ed9 100644 --- a/silpa/modules/ngram/ngram.py +++ b/silpa/modules/ngram/ngram.py @@ -305,7 +305,7 @@ class NGram: pickle.dump(self.getRoot(),open(PICKLED_TREE,'w')) if __name__ == "__main__": usage = "usage: %prog [options] inputfile" - parser = OptionParser(version="%prog 0.1",description="Malayalama NGram Analyser") + parser = OptionParser(version="%prog 0.1",description="Malayalam NGram Analyser") parser.set_usage(usage) parser.add_option("-g", "--generate-graph", dest="gen_graph",help="Generates a graph in png format to visualize the ngram") parser.add_option("-p", "--print", action="store_true",default=False,dest="print_ngram",help="Print the Ngram") @@ -317,9 +317,9 @@ if __name__ == "__main__": if(options.gen_graph): ng = NGram () ng.toGraph(options.gen_graph) - if(options. 
input_file): + if(options.input_file): if not os.path.exists(options.input_file): - print "File Doesnot Existis" + print "File Doesnot Exist" sys.exit(1) else: corpus_file = codecs. open(options.input_file,encoding='utf-8', errors='ignore') @@ -332,16 +332,16 @@ if __name__ == "__main__": ng.populateSyllableNgram(text) ng.populateWordNgram(text) print "Populated" - if(options. print_ngram): + if(options.print_ngram): ng = NGram () print ng.getRoot().toString() - if(options. suggest_syllables): + if(options.suggest_syllables): ng = NGram () print "Searching for" + options.suggest_words - print ng.searchNodeByName(unicode(options. suggest_syllables)) - if(options. suggest_syllables): + print ng.searchNodeByName(unicode(options.suggest_syllables)) + if(options.suggest_syllables): ng = NGram () print "Searching for "+ options.suggest_words - print ng.searchNodeByName(unicode(options. suggest_words)) + print ng.searchNodeByName(unicode(options.suggest_words)) -- cgit