summaryrefslogtreecommitdiffstats
path: root/silpa/modules/ngram
diff options
context:
space:
mode:
Diffstat (limited to 'silpa/modules/ngram')
-rw-r--r--silpa/modules/ngram/algorithm23
-rw-r--r--silpa/modules/ngram/ml.txt1
-rw-r--r--silpa/modules/ngram/ngram.py347
-rw-r--r--silpa/modules/ngram/ngram_ml.txt238
-rw-r--r--silpa/modules/ngram/sample_ngram.py82
-rw-r--r--silpa/modules/ngram/visualizer.py110
6 files changed, 801 insertions, 0 deletions
diff --git a/silpa/modules/ngram/algorithm b/silpa/modules/ngram/algorithm
new file mode 100644
index 0000000..495b85a
--- /dev/null
+++ b/silpa/modules/ngram/algorithm
@@ -0,0 +1,23 @@
+We have a TREE data structure. Each node in the tree is an instance of NgramNode.
+Each NgramNode object contains the string value of the node and a Rank.
+Rank is the frequency of occurrence of the corresponding string in the training corpus; it is incremented each time the string is seen.
+
+NgramNode is the superclass of SyllableNode and WordNode.
+That means each node in the tree can be either a syllable or a word.
+We have only one tree for both words and syllables as of now.
+
+In the tree, the root node is an empty node with the label *. This indicates that all its children, whether syllables or words,
+are the start of a word or a sentence respectively.
+
+Meaning of "child of a node":
+Y is a child of X means that Y can follow immediately after an occurrence of X in the text, where X and Y are either syllables or words (only one time in a tree route).
+X can have any number of children.
+The probability that a node in the list of children occurs in a given context is controlled by Rank(node).
+Rank is simply an integer value incremented according to the frequency of occurrence.
+The higher the rank, the higher the probability that the node follows immediately after X.
+
+Persistence of the populated tree is achieved by pickling the entire tree structure.
+
+Tree operations:
+a) Adding a syllable-ngram, n=2
+
diff --git a/silpa/modules/ngram/ml.txt b/silpa/modules/ngram/ml.txt
new file mode 100644
index 0000000..4c48980
--- /dev/null
+++ b/silpa/modules/ngram/ml.txt
@@ -0,0 +1 @@
+കടലില്‍ ജീവിക്കുന്ന ഒരു സസ്തനിയാണ് നീലത്തിമിംഗലം. ബലീന്‍ തിമിംഗലങ്ങളുടെ ഒരു ഉപജാതിയാണിവ. ലോകത്ത് ഇന്നുവരെയുള്ളതില്‍ ഏറ്റവും വലിയ ജീവിയായി കണക്കാക്കപ്പെടുന്ന നീലത്തിമിംഗലങ്ങള്‍ക്ക് 33 മീ. നീളവും 181 മെട്രിക് ടണിലധികം ഭാരവും ഉണ്ടാകാം. നീണ്ട ശരീരപ്രകൃതിയുള്ള നീലത്തിമിംഗലങ്ങളുടെ ശരീരം നീലകലര്‍ന്ന ചാരനിറത്തോടെയാണുണ്ടാവുക, ശരീരത്തിനടിഭാഗത്തേക്ക് നിറം കുറവായിരിക്കും. നീലത്തിമിംഗലങ്ങള്‍ക്ക് കുറഞ്ഞത് മൂന്നുപജാതികളെങ്കിലും ഉണ്ടെന്നു കരുതുന്നു.
diff --git a/silpa/modules/ngram/ngram.py b/silpa/modules/ngram/ngram.py
new file mode 100644
index 0000000..cab2ed9
--- /dev/null
+++ b/silpa/modules/ngram/ngram.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Ngram
+# Copyright 2008-2009 Santhosh Thottingal <santhosh.thottingal@gmail.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+
+import codecs
+import pickle
+import pydot
+import os,sys
+from optparse import OptionParser
+VERSION=0.1
+MAX_TREE_DEPTH=1000
+PICKLED_TREE="ngram.pyo"
+class NgramNode:
+    """A node of the n-gram tree.
+
+    A node holds a string value, an integer rank and an optional list of
+    child nodes. ``child_list`` stays None until the first child is added.
+    The rank of a child is incremented every time a node with the same
+    value is added again to the same parent.
+    """
+
+    def __init__(self, node_value="*", rank=1, child_list=None):
+        self.node_value = node_value
+        self.rank = rank
+        self.child_list = child_list
+        self.desc = "Start Node"
+
+    def setNode(self, node_value="*", rank=None, childs=None, child_list=None):
+        """Overwrite the value, rank and child list of this node.
+
+        ``childs`` is not used; it is kept so existing callers keep working.
+        """
+        self.node_value = node_value
+        self.rank = rank
+        self.child_list = child_list
+
+    def getName(self):
+        """Return the string value of this node."""
+        return self.node_value
+
+    def getDesc(self):
+        """Return the free-text description of this node."""
+        return self.desc
+
+    def setDesc(self, desc):
+        """Set the description and return the new value."""
+        self.desc = desc
+        return self.desc
+
+    def getRank(self):
+        """Return the rank of this node."""
+        return self.rank
+
+    def setRank(self, rank):
+        """Set the rank of this node."""
+        self.rank = rank
+
+    def incrRank(self, incr=1):
+        """Increase the rank by ``incr`` and return the new rank."""
+        self.rank = self.rank + incr
+        return self.rank
+
+    def getChildList(self):
+        """Return the list of child nodes, or None when there are none yet."""
+        return self.child_list
+
+    def getChildByName(self, child_name):
+        """Return the first child whose name is ``child_name``, else None."""
+        if self.child_list is None:
+            return None
+        for child in self.child_list:
+            if child.getName() == child_name:
+                return child
+        return None
+
+    def childIndex(self, childnode):
+        """Return the position of the child named like ``childnode``.
+
+        Returns -1 when this node has no child list at all and -2 when the
+        list exists but holds no child with that name.
+        """
+        if self.child_list is None:
+            return -1
+        wanted = childnode.getName()
+        for index, child in enumerate(self.child_list):
+            if child.getName() == wanted:
+                return index
+        return -2
+
+    def printChildList(self):
+        """Print all children on one line; does nothing without children."""
+        if self.child_list is None:
+            return None
+        for child in self.child_list:
+            print child,
+
+    def addChildNode(self, node):
+        """Add ``node`` as a child, or increment the rank of the existing
+        child with the same name. Does nothing when ``node`` is None.
+        """
+        if node is None:
+            return
+        if self.child_list is None:
+            self.child_list = []
+        # Check whether this node is already present among the children.
+        member_index = self.childIndex(node)
+        if member_index >= 0:
+            # Node already present: only its rank changes.
+            self.child_list[member_index].incrRank()
+        else:
+            self.child_list.append(node)
+        # Keep the children ordered by rank (ascending, see __lt__), also
+        # after a rank increment changed the ordering.
+        self.child_list.sort()
+
+    def removeChildNode(self, node):
+        """Remove the first child equal to ``node`` (same name and rank).
+
+        Does nothing when ``node`` is None or there is no child list.
+        Raises ValueError (from list.remove) when no child is equal.
+        """
+        # ``and`` is required here: ``&`` binds tighter than ``!=`` and
+        # would try to evaluate ``None & list``.
+        if node is not None and self.child_list is not None:
+            self.child_list.remove(node)
+
+    def __str__(self):
+        return "Node: %s[%d]" % (self.node_value, self.rank)
+
+    def toString(self):
+        """Recursively print this node and its descendants, depth first."""
+        print "Node: %s[%d]" % (self.node_value, self.rank)
+        child_list = self.getChildList()
+        if child_list is not None:
+            for child_node in child_list:
+                child_node.toString()
+
+    def __lt__(self, node):
+        """Less-than compares ranks only; used when sorting children."""
+        return self.getRank() < node.getRank()
+
+    def __gt__(self, node):
+        """Greater-than compares ranks only."""
+        return self.getRank() > node.getRank()
+
+    def __eq__(self, node):
+        """Two nodes are equal when both name and rank match."""
+        if node is None:
+            return False
+        return (self.getName() == node.getName()
+                and self.getRank() == node.getRank())
+
+    def __cmp__(self, node):
+        """Fallback comparison for operators without a rich method.
+
+        Nodes with the same name are compared by rank. Anything else,
+        including None, compares as "greater" (1), so this is not a
+        total ordering.
+        """
+        if node is None:
+            return 1
+        if self.getName() == node.getName():
+            return cmp(self.getRank(), node.getRank())
+        return 1
+
+
+#Syllable Node Class
+#Extends NgramNode class
+class SyllableNode(NgramNode):
+    """N-gram tree node whose value is a syllable."""
+
+    def __str__(self):
+        # The label is returned UTF-8 encoded.
+        label = "Syllable: %s[%d]" % (self.node_value, self.rank)
+        return label.encode('utf-8')
+#Word Node Class
+#Extends NgramNode class
+class WordNode(NgramNode):
+    """N-gram tree node whose value is a word."""
+
+    def __str__(self):
+        # The label is returned UTF-8 encoded.
+        label = "Word: %s[%d]" % (self.node_value, self.rank)
+        return label.encode('utf-8')
+
+class NGram:
+ def __init__(self, text=None, language=None):
+     """Load the pickled n-gram tree, or start with an empty root node.
+
+     ``text`` and ``language`` are stored on the instance. The tree is
+     read from PICKLED_TREE; when that file is missing or cannot be
+     unpickled a fresh root NgramNode is used instead.
+     """
+     self.text = text
+     self.language = language
+     self.search_depth = 0
+     try:
+         # NOTE: pickle must only be used on trusted files; PICKLED_TREE
+         # is expected to be written by this module itself.
+         pickled_file = open(PICKLED_TREE)
+         try:
+             self.ngrams = pickle.load(pickled_file)
+         finally:
+             pickled_file.close()
+         print "Loaded the ngram from " + PICKLED_TREE
+     except Exception:
+         # Unpickling can fail in many ways (missing file, truncated or
+         # stale data); all of them mean "start from scratch". Unlike a
+         # bare except this does not swallow KeyboardInterrupt/SystemExit.
+         self.ngrams = NgramNode()
+         print "New one"
+ def getRoot(self, node_name=None):
+     """Return the root of the tree, or the node named ``node_name``.
+
+     Without ``node_name`` the root node is returned. Otherwise the tree
+     is searched with searchNodeByName() and its result (a node or None)
+     is returned.
+     """
+     if node_name is None:
+         return self.ngrams
+     return self.searchNodeByName(node_name)
+
+ def searchNodeByName(self, node_name, current_node=None, depth=MAX_TREE_DEPTH):
+     """Depth-first search for the first node named ``node_name``.
+
+     The search starts at ``current_node`` (the root when None). Nodes
+     that lie ``depth`` or more levels below the start node are not
+     examined. Returns the matching node, or None when nothing matches.
+     After a hit ``self.search_depth`` holds the level of the match
+     relative to the start node.
+     """
+     if current_node is None:
+         current_node = self.getRoot()
+     # Reset on every top-level call so earlier searches cannot eat
+     # into the depth budget of this one.
+     self.search_depth = 0
+     return self._searchFrom(node_name, current_node, depth, 0)
+
+ def _searchFrom(self, node_name, node, max_depth, level):
+     """Recursive worker for searchNodeByName; ``level`` is the true depth."""
+     if level >= max_depth:
+         return None
+     if node.getName() == node_name:
+         self.search_depth = level
+         print "Found at depth", level
+         return node
+     child_list = node.getChildList()
+     if child_list is None:
+         return None
+     for child_node in child_list:
+         result_node = self._searchFrom(node_name, child_node, max_depth, level + 1)
+         if result_node is not None:
+             return result_node
+     return None
+ def printNgram(self, current_node=None):
+     """Print ``current_node`` (default: the root) and, recursively, its
+     descendants. Each child list is sorted in place before it is walked.
+     """
+     node = current_node
+     if node is None:
+         node = self.getRoot()
+     print node
+     children = node.getChildList()
+     if children is not None:
+         children.sort()
+         for child in children:
+             self.printNgram(child)
+     return None
+ def toDot(self, graph , current_node=None):
+ """Add one pydot edge per parent/child pair of the subtree to graph.
+
+ Starts at current_node (the root when None) and recurses into every
+ child. Node names are UTF-8 encoded before being handed to pydot.
+ """
+ if(current_node==None):
+ current_node=self.getRoot()
+ child_list=current_node.getChildList()
+ if(child_list!=None):
+ key=current_node.getName()
+ for child_node in child_list:
+ value=child_node.getName()
+ # NOTE(review): `&` binds tighter than `<=` and `>`, so the test below
+ # parses as the chained comparison
+ # ((key!=None) & ord(...)) <= (0x0901 & len(key)) > 1
+ # and not as three and-ed conditions -- confirm the intended trimming rule.
+ if((key!=None) & ord(key[len(key)-1])<=0x0901 & len(key)>1):
+ key=key[0:len(key)-1]
+ if(value!=None):
+ # NOTE(review): same precedence issue: this compares
+ # ((ord(...)<=0x0901) & len(value)) with 1. The slice below also drops
+ # two characters where the key branch drops one -- confirm.
+ if((ord(value[len(value)-1])<=0x0901) & len(value)>1):
+ value=value[0:len(value)-2]
+ graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8')))
+ self.toDot(graph,child_node)
+ def toGraph(self, output_image_file):
+     """Write the whole n-gram tree to ``output_image_file``.
+
+     The tree is converted into a pydot graph by toDot() and handed to
+     pydot's write() together with "dot" and "png".
+     """
+     dot_graph = pydot.Dot()
+     self.toDot(dot_graph)
+     dot_graph.write(output_image_file, "dot", "png")
+
+ def addSyllables(self,text, window_size=2):
+     """Return the syllable n-grams of every space-separated word in text.
+
+     Each word is wrapped as "*" + word + "]", split into syllables with
+     syllabalize_ml() and scanned with a sliding window of
+     ``window_size`` syllables. The result is a list of syllable lists.
+     """
+     # TODO: normalize the text before taking n-grams.
+     collected = []
+     for token in text.split(" "):
+         parts = self.syllabalize_ml("*" + token + "]")
+         for begin in range(len(parts) - window_size + 1):
+             collected.append(parts[begin:begin + window_size])
+     return collected
+ def syllabalize_ml(self,text):
+     """Split a Malayalam string into a list of syllables.
+
+     Based on the ml-split code by Baiju M. Vowel signs, anusvara,
+     visarga and chandrakkala are attached to the preceding syllable, a
+     character following a chandrakkala is attached as well (conjunct),
+     and every limiter character becomes a syllable of its own. A sign
+     at the very start of the text has nothing to attach to and starts
+     a syllable by itself.
+     """
+     signs = [
+         u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
+         u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
+         u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
+     limiters = ['.','\"','\'','`','!',';',',','?', ']']
+     chandrakkala = u'\u0d4d'
+     lst_chars = []
+     for char in text:
+         if char in limiters:
+             lst_chars.append(char)
+         elif lst_chars and (char in signs or lst_chars[-1][-1] == chandrakkala):
+             # Continue the current syllable.
+             lst_chars[-1] = lst_chars[-1] + char
+         else:
+             # First character of the text, or start of a new syllable.
+             lst_chars.append(char)
+     return lst_chars
+ def addWords(self,text, window_size=2):
+     """Return the word n-grams of ``text``.
+
+     The text is wrapped as "* " + text + " ]", split on single spaces
+     and scanned with a sliding window of ``window_size`` words. The
+     result is a list of word lists.
+     """
+     tokens = ("* " + text + " ]").split(" ")
+     last_start = len(tokens) - window_size
+     return [tokens[begin:begin + window_size] for begin in range(last_start + 1)]
+ def populateSyllableNgram(self, text):
+     """Add the syllable n-grams of ``text`` to the tree and pickle it.
+
+     The first non-blank syllable of every n-gram names the parent node;
+     each following non-blank syllable is added below it as a
+     SyllableNode. The start marker "*" (added by addSyllables) maps to
+     the root node. Afterwards the tree is written to PICKLED_TREE.
+     """
+     for ngram in self.addSyllables(text):
+         ngram_str = ""
+         for item in ngram:
+             if not item.strip():
+                 continue
+             if ngram_str == "":
+                 ngram_str = item
+                 continue
+             if ngram_str == "*":
+                 # "*" is the start-of-word marker, i.e. the root itself.
+                 parent_node = self.getRoot()
+             else:
+                 parent_node = self.searchNodeByName(ngram_str, self.getRoot())
+             if parent_node is None:
+                 print "Parent node not found for " + item
+             else:
+                 parent_node.addChildNode(SyllableNode(item))
+                 print ngram_str + " -> " + item
+     # Pickle the tree; close the file so the data is flushed to disk.
+     pickled_file = open(PICKLED_TREE, 'w')
+     try:
+         pickle.dump(self.getRoot(), pickled_file)
+     finally:
+         pickled_file.close()
+ def populateWordNgram(self, text):
+     """Add the word n-grams of ``text`` to the tree and pickle it.
+
+     The first non-blank word of every n-gram names the parent node;
+     each following non-blank word is added below it as a WordNode. The
+     start marker "*" (added by addWords) maps to the root node.
+     Afterwards the tree is written to PICKLED_TREE.
+     """
+     # addWords() needs no state, so there is no reason to build (and
+     # unpickle) a second NGram instance for it.
+     for ngram in self.addWords(text):
+         ngram_str = ""
+         for item in ngram:
+             if not item.strip():
+                 continue
+             if ngram_str == "":
+                 ngram_str = item
+                 continue
+             if ngram_str == "*":
+                 parent_node = self.getRoot()
+             else:
+                 parent_node = self.searchNodeByName(ngram_str, self.getRoot())
+             if parent_node is None:
+                 print "Parent node not found for " + item
+             else:
+                 parent_node.addChildNode(WordNode(item))
+                 print ngram_str + " -> " + item
+     # Pickle the tree; close the file so the data is flushed to disk.
+     pickled_file = open(PICKLED_TREE, 'w')
+     try:
+         pickle.dump(self.getRoot(), pickled_file)
+     finally:
+         pickled_file.close()
+if __name__ == "__main__":
+    # Command line front end: learn from a corpus file, print or draw the
+    # stored tree, or look up a syllable/word in it.
+    usage = "usage: %prog [options] inputfile"
+    parser = OptionParser(version="%prog 0.1",description="Malayalam NGram Analyser")
+    parser.set_usage(usage)
+    parser.add_option("-g", "--generate-graph", dest="gen_graph",help="Generates a graph in png format to visualize the ngram")
+    parser.add_option("-p", "--print", action="store_true",default=False,dest="print_ngram",help="Print the Ngram")
+    parser.add_option("-i", "--input-file", dest="input_file",help="Input File for learning")
+    parser.add_option("-s", "--suggest-syllables", dest="suggest_syllables",help="Suggest next possible syllables for the given letter/syllable ")
+    parser.add_option("-w", "--suggest-words", dest="suggest_words",help="Suggest next possible words for the given word ")
+    (options, args) = parser.parse_args()
+
+    if options.gen_graph:
+        ng = NGram()
+        ng.toGraph(options.gen_graph)
+    if options.input_file:
+        if not os.path.exists(options.input_file):
+            print "File Doesnot Exist"
+            sys.exit(1)
+        corpus_file = codecs.open(options.input_file, encoding='utf-8', errors='ignore')
+        ng = NGram()
+        try:
+            for text in corpus_file:
+                text = text + " ]"
+                ng.populateSyllableNgram(text)
+                ng.populateWordNgram(text)
+        finally:
+            corpus_file.close()
+        print "Populated"
+    if options.print_ngram:
+        ng = NGram()
+        # toString() prints the tree itself and returns nothing.
+        ng.getRoot().toString()
+    if options.suggest_syllables:
+        ng = NGram()
+        print "Searching for " + options.suggest_syllables
+        # Command line arguments are byte strings; decode them as UTF-8
+        # (as sample_ngram.py does) so that they match the unicode node names.
+        print ng.searchNodeByName(options.suggest_syllables.decode("utf-8"))
+    if options.suggest_words:
+        ng = NGram()
+        print "Searching for " + options.suggest_words
+        print ng.searchNodeByName(options.suggest_words.decode("utf-8"))
+
+
diff --git a/silpa/modules/ngram/ngram_ml.txt b/silpa/modules/ngram/ngram_ml.txt
new file mode 100644
index 0000000..a4db8f1
--- /dev/null
+++ b/silpa/modules/ngram/ngram_ml.txt
@@ -0,0 +1,238 @@
+(dp0
+V\u0d35\u0d32\u0d3f\u0d2f
+p1
+V\u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f -> \u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f
+p2
+sV\u0d2e\u0d31\u0d4d\u0d31\u0d4d
+p3
+V\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d -> \u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d -> \u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d
+p4
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d
+p5
+V33 -> \u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d -> 33 -> \u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d
+p6
+sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28
+p7
+V\u0d12\u0d30\u0d41 -> \u0d12\u0d30\u0d41
+p8
+sV\u0d06\u0d23\u0d41
+p9
+V\u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d -> \u0d2d\u0d15\u0d4d\u0d37\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d
+p10
+sV\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d
+p11
+V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p12
+sV\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28
+p13
+V\u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15 -> \u0d1a\u0d3e\u0d30\u0d28\u0d3f\u0d31\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46\u0d2f\u0d3e\u0d23\u0d41\u0d23\u0d4d\u0d1f\u0d3e\u0d35\u0d41\u0d15
+p14
+sV\u0d36\u0d30\u0d40\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d28\u0d1f\u0d3f\u0d2d\u0d3e\u0d17\u0d24\u0d4d\u0d24\u0d47\u0d15\u0d4d\u0d15\u0d4d
+p15
+V\u0d28\u0d3f\u0d31\u0d02 -> \u0d28\u0d3f\u0d31\u0d02
+p16
+sV\u0d28\u0d40\u0d23\u0d4d\u0d1f
+p17
+V\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33 -> \u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33
+p18
+sV\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02
+p19
+V\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41 -> \u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41
+p20
+sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p21
+V\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d -> \u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d
+p22
+sV\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02
+p23
+V\u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d05\u0d15\u0d24\u0d4d\u0d24\u0d3e\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d41\u0d23\u0d4d\u0d1f\u0d4d
+p24
+sV\u0d1a\u0d46\u0d31\u0d3f\u0d2f
+p25
+V\u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 -> \u0d28\u0d40\u0d30\u0d3e\u0d33\u0d3f\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02
+p26
+sV\u0d15\u0d1f\u0d32\u0d3f\u0d32\u0d4d\u200d
+p27
+V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28
+p28
+sV\u0d1c\u0d40\u0d35\u0d3f\u0d2f\u0d3e\u0d2f\u0d3f
+p29
+V\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
+p30
+sV\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
+p31
+V\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d -> \u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d
+p32
+sV\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+p33
+V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f
+p34
+sV\u0d15\u0d23\u0d15\u0d4d\u0d15\u0d3e\u0d15\u0d4d\u0d15\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28
+p35
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d
+p36
+sV\u0d15\u0d41\u0d31\u0d1e\u0d4d\u0d1e\u0d24\u0d4d
+p37
+V\u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d42\u0d28\u0d4d\u0d28\u0d41\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d15\u0d33\u0d46\u0d19\u0d4d\u0d15\u0d3f\u0d32\u0d41\u0d02
+p38
+sV\u0d15\u0d41\u0d33\u0d4d\u0d33\u0d28\u0d4d\u200d
+p39
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
+p40
+sV181
+p41
+V\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d -> \u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d
+p42
+sV\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02
+p43
+V\u0d35\u0d32\u0d3f\u0d2f -> \u0d35\u0d32\u0d3f\u0d2f
+p44
+sV\u0d36\u0d30\u0d40\u0d30\u0d2a\u0d4d\u0d30\u0d15\u0d43\u0d24\u0d3f\u0d2f\u0d41\u0d33\u0d4d\u0d33
+p45
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
+p46
+sV\u0d15\u0d23\u0d4d\u0d1f\u0d41
+p47
+V\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28 -> \u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28
+p48
+sV\u0d26\u0d15\u0d4d\u0d37\u0d3f\u0d23
+p49
+V\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+p50
+sV\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+p51
+V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+p52
+sV\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d
+p53
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
+p54
+sV\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d
+p55
+V\u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02 -> \u0d0f\u0d31\u0d4d\u0d31\u0d35\u0d41\u0d02
+p56
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
+p57
+V\u0d36\u0d30\u0d40\u0d30\u0d02 -> \u0d36\u0d30\u0d40\u0d30\u0d02
+p58
+sV\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02
+p59
+V\u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02 -> \u0d09\u0d23\u0d4d\u0d1f\u0d3e\u0d15\u0d3e\u0d02
+p60
+sV\u0d2a\u0d4b\u0d32\u0d46
+p61
+V\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02 -> \u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02
+p62
+sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
+p63
+V\u0d12\u0d30\u0d41 -> \u0d12\u0d30\u0d41
+p64
+sV\u0d2e\u0d46\u0d1f\u0d4d\u0d30\u0d3f\u0d15\u0d4d
+p65
+V\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02 -> \u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02
+p66
+sV\u0d1a\u0d46\u0d31\u0d41\u0d2e\u0d24\u0d4d\u0d38\u0d4d\u0d2f\u0d19\u0d4d\u0d19\u0d33\u0d47\u0d2f\u0d41\u0d02
+p67
+V\u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f -> \u0d1a\u0d46\u0d31\u0d3f\u0d2f
+p68
+sV\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46
+p69
+V\u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41 -> \u0d06\u0d23\u0d41
+p70
+sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+p71
+V\u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41 -> \u0d15\u0d3e\u0d23\u0d2a\u0d4d\u0d2a\u0d46\u0d1f\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d23\u0d4d\u0d1f\u0d41
+p72
+sV\u0d35\u0d1f\u0d15\u0d4d\u0d15\u0d28\u0d4d\u200d
+p73
+V\u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d -> \u0d05\u0d31\u0d4d\u0d31\u0d4d\u0d32\u0d3e\u0d28\u0d4d\u0d31\u0d3f\u0d15\u0d4d -> \u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d
+p74
+sV\u0d12\u0d30\u0d41
+p75
+V\u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35 -> \u0d38\u0d38\u0d4d\u0d24\u0d28\u0d3f\u0d2f\u0d3e\u0d23\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d23\u0d3f\u0d35
+p76
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d02
+p77
+V\u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35 -> \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d2f\u0d3e\u0d23\u0d35
+p78
+sV\u0d28\u0d40\u0d32\u0d24\u0d4d\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d02
+p79
+V\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33 -> \u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33
+p80
+sV\u0d09\u0d23\u0d4d\u0d1f\u0d46\u0d28\u0d4d\u0d28\u0d41
+p81
+V\u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41 -> \u0d15\u0d30\u0d41\u0d24\u0d41\u0d28\u0d4d\u0d28\u0d41
+p82
+sV\u0d07\u0d28\u0d4d\u0d24\u0d4d\u0d2f\u0d28\u0d4d\u200d
+p83
+V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d4d\u200d
+p84
+sV\u0d2e\u0d38\u0d4d\u0d15\u0d41\u0d32\u0d38\u0d4d
+p85
+V(B -> (B -> (B
+p86
+sV\u0d1f\u0d23\u0d3f\u0d32\u0d27\u0d3f\u0d15\u0d02
+p87
+V\u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02 -> \u0d2d\u0d3e\u0d30\u0d35\u0d41\u0d02
+p88
+sV\u0d32\u0d4b\u0d15\u0d24\u0d4d\u0d24\u0d4d
+p89
+V\u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d -> \u0d07\u0d28\u0d4d\u0d28\u0d41\u0d35\u0d30\u0d46\u0d2f\u0d41\u0d33\u0d4d\u0d33\u0d24\u0d3f\u0d32\u0d4d\u200d
+p90
+sV\u0d07\u0d28\u0d4d\u200d\u0d21\u0d3f\u0d15
+p91
+V\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41 -> \u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41
+p92
+sV33
+p93
+V\u0d2e\u0d40 -> \u0d2e\u0d40
+p94
+sV\u0d28\u0d3f\u0d31\u0d02
+p95
+V\u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02 -> \u0d15\u0d41\u0d31\u0d35\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d02
+p96
+sV\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33
+p97
+V\u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41 -> \u0d1a\u0d46\u0d31\u0d41
+p98
+sV\u0d2a\u0d38\u0d2b\u0d3f\u0d15\u0d4d
+p99
+V\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02 -> \u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p100
+sV\u0d36\u0d30\u0d40\u0d30\u0d02
+p101
+V\u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28 -> \u0d28\u0d40\u0d32\u0d15\u0d32\u0d30\u0d4d\u200d\u0d28\u0d4d\u0d28
+p102
+sV\u0d2e\u0d39\u0d3e\u0d38\u0d2e\u0d41\u0d26\u0d4d\u0d30\u0d24\u0d4d\u0d24\u0d3f\u0d32\u0d41\u0d02
+p103
+V\u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28 -> \u0d15\u0d3e\u0d23\u0d41\u0d28\u0d4d\u0d28
+p104
+sV\u0d15\u0d4a\u0d1e\u0d4d\u0d1a\u0d41\u0d2a\u0d4b\u0d32\u0d41\u0d33\u0d4d\u0d33
+p105
+V\u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33 -> \u0d2a\u0d41\u0d31\u0d02\u0d24\u0d4b\u0d1f\u0d41\u0d33\u0d4d\u0d33
+p106
+sV\u0d2c\u0d32\u0d40\u0d28\u0d4d\u200d
+p107
+V\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46 -> \u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d41\u0d1f\u0d46
+p108
+sV\u0d24\u0d3f\u0d2e\u0d3f\u0d02\u0d17\u0d32\u0d19\u0d4d\u0d19\u0d33\u0d46
+p109
+V\u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46 -> \u0d2a\u0d4b\u0d32\u0d46
+p110
+sV\u0d28\u0d40\u0d33\u0d35\u0d41\u0d02
+p111
+V181 -> 181
+p112
+sV\u0d2e\u0d31\u0d4d\u0d31\u0d4a\u0d30\u0d41
+p113
+V\u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d -> \u0d09\u0d2a\u0d1c\u0d3e\u0d24\u0d3f\u0d2f\u0d3e\u0d35\u0d3e\u0d28\u0d3f\u0d1f\u0d2f\u0d41\u0d23\u0d4d\u0d1f\u0d4d
+p114
+sV\u0d1a\u0d46\u0d31\u0d41
+p115
+V\u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46 -> \u0d1c\u0d40\u0d35\u0d3f\u0d15\u0d33\u0d46
+p116
+sV\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28
+p117
+V\u0d2c\u0d3f -> \u0d2c\u0d3f -> \u0d2c\u0d3f
+p118
+s. \ No newline at end of file
diff --git a/silpa/modules/ngram/sample_ngram.py b/silpa/modules/ngram/sample_ngram.py
new file mode 100644
index 0000000..ec4b184
--- /dev/null
+++ b/silpa/modules/ngram/sample_ngram.py
@@ -0,0 +1,82 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Ngram
+# Copyright 2009 Jinesh K J <jinesh.k@gmail.com>
+# Copyright 2009 Swathantra Malayalam Computing <smc-discuss@googlegroups.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# If you find any bugs or have any suggestions email: jinesh.k@gmail.com or smc-discuss@googlegroups.com
+# URL: http://www.smc.org.in
+import pydot
+import codecs
+import pickle
+import sys
+from visualizer import NGramVisualizer
+from optparse import OptionParser
+
+def getData(new_file_name):
+    """Read a UTF-8 text file and return its content as a single string.
+
+    Lines whose first character is '#' and lines that are empty after
+    stripping are skipped. Every remaining line is stripped and appended
+    preceded by one space, so a non-empty result starts with a space.
+    Undecodable bytes are ignored.
+    """
+    lines = []
+    data_file = codecs.open(new_file_name, encoding='utf-8', errors='ignore')
+    try:
+        for text in data_file:
+            if text[0] == '#':
+                continue
+            line = text.strip()
+            if line:
+                lines.append(line)
+    finally:
+        data_file.close()
+    return "".join([" " + line for line in lines])
+
+def printGraph(corpus,start_word):
+    """Print, in DOT syntax, the graph that starts at ``start_word``.
+
+    ``corpus`` is the name of a pickled word dictionary as written by
+    NGramVisualizer.loadCorpus(); ``start_word`` is a UTF-8 encoded byte
+    string.
+    """
+    ngv = NGramVisualizer()
+    # NOTE: pickle must only be used on trusted files.
+    corpus_file = open(corpus)
+    try:
+        graph_dict = pickle.load(corpus_file)
+    finally:
+        corpus_file.close()
+    uni_start_word = start_word.decode("utf-8")
+    graph = ngv.generate_graph(graph_dict, pydot.Dot(), uni_start_word)
+    print graph.to_string().encode("utf-8")
+
+if __name__ == "__main__":
+    # Either add INPUTDATA (literal text, or a file with -f) to the pickled
+    # corpus in CORPUSFILE, or (-s) print the graph starting at INPUTDATA.
+    parser = OptionParser(version="%prog 1.0",description="Sample program to add data INPUTDATA to the corpus in CORPUSFILE")
+    parser.set_usage("usage: %prog [options] INPUTDATA CORPUSFILE")
+    parser.add_option("-s", "--start-word", dest="start_word",action="store_true",default=False,help="Creates a graph beginning from INPUTDATA")
+    parser.add_option("-f", "--file", action="store_true",default=False,dest="infile",help="Gets Data from file INPUTDATA")
+    options, args = parser.parse_args()
+    if len(args) != 2:
+        parser.error("incorrect number of arguments")
+    if options.infile and options.start_word:
+        parser.error("options -f and -s are mutually exclusive")
+    input_data, corpus_name = args
+    ngv = NGramVisualizer()
+    if options.start_word:
+        printGraph(corpus_name, input_data)
+    else:
+        if options.infile:
+            data = getData(input_data)
+        else:
+            data = input_data.decode("utf-8")
+        ngv.loadCorpus(data, corpus_name)
diff --git a/silpa/modules/ngram/visualizer.py b/silpa/modules/ngram/visualizer.py
new file mode 100644
index 0000000..585c11f
--- /dev/null
+++ b/silpa/modules/ngram/visualizer.py
@@ -0,0 +1,110 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Ngram
+# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
+# http://www.smc.org.in
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
+# URL: http://www.smc.org.in
+import pydot
+import codecs
+import pickle
+
+class NGramVisualizer:
+ depth=0
+ def loadCorpus(self,corpus,corpus_file_name):
+     """Add the word bigrams of ``corpus`` to the pickled dictionary.
+
+     ``corpus_file_name`` names a pickled dict that maps a word to the
+     words seen directly after it, joined by " -> ". A missing or
+     unreadable file (IOError on open) starts a new dict. ``corpus`` is
+     cut into sentences at . ! ? , and ; -- text after the last
+     delimiter counts as a sentence too -- and every pair of adjacent
+     words inside a sentence is recorded. Empty tokens, produced by
+     repeated spaces, are ignored. The updated dict is pickled back to
+     ``corpus_file_name``.
+     """
+     limiters = [".","!","?",",",";"]
+     try:
+         corpusfile = open(corpus_file_name)
+     except IOError:
+         graph_dict = dict()
+     else:
+         # NOTE: pickle must only be used on trusted files.
+         try:
+             graph_dict = pickle.load(corpusfile)
+         finally:
+             corpusfile.close()
+     sentences = []
+     start = 0
+     for index, char in enumerate(corpus):
+         if char in limiters:
+             sentences.append(corpus[start:index])
+             start = index + 1
+     if start < len(corpus):
+         # Do not lose a final sentence without closing punctuation.
+         sentences.append(corpus[start:])
+     for line in sentences:
+         prev_word = ""
+         for word in line.split(" "):
+             word = word.strip()
+             if word == "":
+                 continue
+             if prev_word != "":
+                 if prev_word in graph_dict:
+                     graph_dict[prev_word] = graph_dict[prev_word] + " -> " + word
+                 else:
+                     graph_dict[prev_word] = word
+             prev_word = word
+     outfile = open(corpus_file_name, 'w')
+     try:
+         pickle.dump(graph_dict, outfile)
+     finally:
+         outfile.close()
+ def generate_full_graph(self, start_word, graph_dict,outputimage):
+     """Build a pydot graph from ``graph_dict`` and write it as an image.
+
+     ``graph_dict`` maps a word to its following words joined by "->".
+     When ``start_word`` is non-empty only the edges leaving that word
+     are drawn, otherwise every edge is. When ``outputimage`` is given
+     the graph is written there with "dot" in "png" format. Returns the
+     pydot graph.
+     """
+     graph = pydot.Dot()
+     for key in graph_dict.iterkeys():
+         if start_word and key != start_word:
+             continue
+         for value in graph_dict[key].split("->"):
+             value = value.strip()
+             graph.add_edge(pydot.Edge(key.encode('utf-8'), value.encode('utf-8')))
+     if outputimage:
+         graph.write(outputimage, "dot", "png")
+     return graph
+
+ def generate_graph(self, graph_dict, graph, src):
+ """Add the edges reachable from src to graph and return graph.
+
+ graph_dict maps a word to a string of following words separated by
+ "->". For every follower an edge src -> follower is added (unless
+ graph already reports such an edge) and the follower is expanded
+ recursively.
+ """
+ # self.depth counts calls, not nesting: it is incremented here and never
+ # decremented or reset in this method.
+ self.depth=self.depth+1
+ #print self.depth ,src
+ # Stop when graph.get_node(src) returns something other than an empty list.
+ if(graph.get_node(src)!=[]):
+ return graph
+ # Hard limit on the number of calls (see self.depth above).
+ if(self.depth>200):
+ return graph
+ values=[]
+ if(graph_dict.has_key(src)) :
+ values=graph_dict[src].split("->")
+ for dest in values:
+ dest=dest.strip()
+ #print src, dest,graph.get_edge(src,dest)
+ if(graph.get_edge(src,dest)):
+ continue
+ else:
+ graph.add_edge(pydot.Edge(src,dest))
+ graph=self.generate_graph(graph_dict, graph, dest)
+
+ return graph
+
+#if __name__ == "__main__":
+# ngv=NGramVisualizer ()
+# graph_dict = dict()
+# graph_dict=ngv.loadCorpus ("ml.txt",graph_dict)
+# pickle.dump(graph_dict,open('ngram_ml.txt','w'))
+# graph=pydot.Dot()
+# graph=ngv.generate_graph(graph_dict, graph,u"നീലത്തിമിംഗലങ്ങള്‍ക്ക്")
+# print graph.to_string().encode("utf-8")
+ #graph.write("ngvgraph-hi.png","dot", "raw" )