diff options
Diffstat (limited to 'silpa/modules/ngram/ngram.py')
-rw-r--r-- | silpa/modules/ngram/ngram.py | 347 |
1 files changed, 347 insertions, 0 deletions
diff --git a/silpa/modules/ngram/ngram.py b/silpa/modules/ngram/ngram.py new file mode 100644 index 0000000..cab2ed9 --- /dev/null +++ b/silpa/modules/ngram/ngram.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Ngram +# Copyright 2008-2009 Santhosh Thottingal <santhosh.thottingal@gmail.com> +# http://www.smc.org.in +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# + +import codecs +import pickle +import pydot +import os,sys +from optparse import OptionParser +VERSION=0.1 +MAX_TREE_DEPTH=1000 +PICKLED_TREE="ngram.pyo" +class NgramNode: + def __init__(self, node_value="*", rank=1, child_list=None): + self.node_value=node_value + self.rank=rank + self.child_list=child_list + self.desc="Start Node" + def setNode(self, node_value="*", rank=None,childs=None, child_list=None): + self.node_value=node_value + self.rank=rank + self.child_list=child_list + def getName(self): + return self.node_value + def getDesc(self): + return self.desc + def setDesc(self,desc): + self.desc = desc + return self.desc + def getRank(self): + return self.rank + def setRank(self, rank): + self.rank = rank + def incrRank(self, incr=1): + self.rank = self.rank + incr + return self.rank + def getChildList(self): + if(self.child_list!=None): + return self.child_list + else: + return None + def getChildByName(self,child_name): + if(self.child_list==None): + return None + for child in self.child_list: + if(child.getName()==child_name): + return child + def childIndex(self,childnode): + if(self.child_list==None): + return -1 + for child in self.child_list: + if(child.getName()==childnode.getName()): + return self.child_list.index(child) + return -2 + def printChildList(self): + if(self.child_list==None): + return None + for child in self.child_list: + print child, + def addChildNode(self, node): + if(node!=None): + if(self.child_list==None): + self.child_list=[] + #Check whether this node is already present in the Ngram Tree + member_index=self.childIndex(node) + if(member_index>=0): + #Node already present.Incrementing Rank + self.child_list[member_index].incrRank() + else: + self.child_list.append(node) + #Keep it sorted as per the ranks + self.child_list.sort() + def removeChildNode(self, node): + if(node!=None & self.child_list!=None): + self.child_list.remove(node) + def __str__(self): + return "Node: %s[%d]" % (self.node_value, self.rank) + '''Recursively traverse through the tree and print the nodes-Depth First Traversal''' + def toString(self): + print "Node: %s[%d]" % (self.node_value, self.rank) + child_list=self.getChildList() + if(child_list!=None): + for child_node in child_list : + child_node.toString() + '''Defining the less than operater of the object''' + def __lt__(self, node): + return self.getRank() < node.getRank() + '''Defining the greater than operater of the object''' + def __gt__(self, node): + return self.getRank() > node.getRank() + '''Defining the equal-to operater of the object''' + def __eq__(self, node): + if(node==None): + return False + return (self.getName() == node.getName()) & (self.getRank() == node.getRank()) + '''Defining the comparison of two object instances. Required for sorting the list of objects''' + def __cmp__(self, node): + if(node==None): + return 1 + if(self.getName()==node.getName()): + return cmp(self.getRank(), node.getRank()) + else: + return 1 + + +#Syllable Node Class +#Extends NgramNode class +class SyllableNode(NgramNode): + def __str__(self): + return ("Syllable: %s[%d]" % (self.node_value, self.rank )).encode('utf-8') +#Word Node Class +#Extends NgramNode class +class WordNode(NgramNode): + def __str__(self): + return ("Word: %s[%d]" % (self.node_value, self.rank )).encode('utf-8') + +class NGram: + def __init__(self, text=None, language=None): + self.text=None + self.language=None + try: + #Try loading picked tree object + self.ngrams=pickle.load(open(PICKLED_TREE)) + print "Loaded the ngram from " + PICKLED_TREE + except: + #Initialize with empty node + self.ngrams=NgramNode() + print "New one" + self.search_depth=0 + def getRoot(self, node_name=None): + if(node_name==None): + return self.ngrams + else: + return self.searchNode(node_name) + + def searchNodeByName(self, node_name, current_node=None, depth=MAX_TREE_DEPTH): + if(current_node==None): + current_node=self.getRoot() + self.search_depth = 0 + if(self.search_depth==depth): + return None + if(current_node.getName() == node_name): + print "Found at depth", self.search_depth + return current_node + else: + child_list=current_node.getChildList() + if(child_list==None): + return None + else: + child_list=child_list + self.search_depth = self.search_depth+1 + for child_node in child_list : + result_node=self.searchNodeByName(node_name,child_node, depth) + if(result_node!=None): + return result_node + def printNgram(self, current_node=None): + if(current_node==None): + current_node=self.getRoot() + print current_node + child_list=current_node.getChildList() + + if(child_list==None): + return None + else: + child_list.sort() + for child_node in child_list : + self.printNgram(child_node) + def toDot(self, graph , current_node=None): + if(current_node==None): + current_node=self.getRoot() + child_list=current_node.getChildList() + if(child_list!=None): + key=current_node.getName() + for child_node in child_list: + value=child_node.getName() + if((key!=None) & ord(key[len(key)-1])<=0x0901 & len(key)>1): + key=key[0:len(key)-1] + if(value!=None): + if((ord(value[len(value)-1])<=0x0901) & len(value)>1): + value=value[0:len(value)-2] + graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8'))) + self.toDot(graph,child_node) + def toGraph(self, output_image_file): + graph=pydot.Dot() + self.toDot(graph) + #print graph.to_string().encode('utf-8') + graph.write(output_image_file,"dot", "png" ) + + def addSyllables(self,text, window_size=2): + words=text.split(" ") + ngrams = [] + for word in words: + #TODO-Normalize before taking ngram!!! + word = "*"+word+"]" + syllables = self.syllabalize_ml(word) + syllable_count = len(syllables) + window_start = 0 + window_end = 0 + while window_start + window_size <= syllable_count: + if(window_start + window_size < syllable_count): + window_end = window_start + window_size + else: + window_end = syllable_count + ngrams.append(syllables[window_start:window_end]) + window_start = window_start+1 + return ngrams + '''Syllabalize a given Malayalam string. Based on ml-split code by Baiju M''' + def syllabalize_ml(self,text): + signs = [ + u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41', + u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48', + u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d'] + limiters = ['.','\"','\'','`','!',';',',','?', ']'] + chandrakkala = u'\u0d4d' + lst_chars = [] + for char in text: + if char in limiters: + lst_chars.append(char) + elif char in signs: + lst_chars[-1] = lst_chars[-1] + char + else: + try: + if lst_chars[-1][-1] == chandrakkala : + lst_chars[-1] = lst_chars[-1] + char + else: + lst_chars.append(char) + except IndexError: + lst_chars.append(char) + + return lst_chars + def addWords(self,text, window_size=2): + text = "* "+text+" ]" + words = text.split(" ") + ngrams = [] + word_count = len(words) + window_start = 0 + window_end = 0 + while window_start + window_size <= word_count: + if(window_start + window_size < word_count): + window_end = window_start + window_size + else: + window_end = word_count + words[window_start:window_end] + ngrams.append(words[window_start:window_end]) + window_start = window_start+1 + return ngrams + def populateSyllableNgram(self, text): + ngrams = self.addSyllables(text) + for ngram in ngrams: + ngram_str="" + for item in ngram: + if(item.strip()>""): + if(ngram_str==""): + ngram_str=ngram_str+ item + else: + + if(ngram_str=="["): + parent_node=self.getRoot() + else: + parent_node=self.searchNodeByName(ngram_str,self.getRoot()) + if(parent_node==None): + print "Parent node not found for " + item + else: + parent_node.addChildNode(SyllableNode(item)) + print ngram_str+ " -> "+item + #pickle the tree + pickle.dump(self.getRoot(),open(PICKLED_TREE,'w')) + def populateWordNgram(self, text): + ng = NGram () + ngrams = ng.addWords(text) + for ngram in ngrams: + ngram_str="" + for item in ngram: + if(item.strip()>""): + if(ngram_str==""): + ngram_str=ngram_str+ item + else: + if(ngram_str=="*"): + parent_node=self.getRoot() + else: + parent_node=self.searchNodeByName(ngram_str,self.getRoot()) + if(parent_node==None): + print "Parent node not found for " + item + else: + parent_node.addChildNode(WordNode(item)) + print ngram_str+ " -> "+item + #pickle the tree + pickle.dump(self.getRoot(),open(PICKLED_TREE,'w')) +if __name__ == "__main__": + usage = "usage: %prog [options] inputfile" + parser = OptionParser(version="%prog 0.1",description="Malayalam NGram Analyser") + parser.set_usage(usage) + parser.add_option("-g", "--generate-graph", dest="gen_graph",help="Generates a graph in png format to visualize the ngram") + parser.add_option("-p", "--print", action="store_true",default=False,dest="print_ngram",help="Print the Ngram") + parser.add_option("-i", "--input-file", dest="input_file",help="Input File for learning") + parser.add_option("-s", "--suggest-syllables", dest="suggest_syllables",help="Suggest next possible syllables for the given letter/syllable ") + parser.add_option("-w", "--suggest-words", dest="suggest_words",help="Suggest next possible words for the given word ") + (options, args) = parser.parse_args() + + if(options.gen_graph): + ng = NGram () + ng.toGraph(options.gen_graph) + if(options.input_file): + if not os.path.exists(options.input_file): + print "File Doesnot Exist" + sys.exit(1) + else: + corpus_file = codecs. open(options.input_file,encoding='utf-8', errors='ignore') + ng = NGram () + while 1: + text = unicode( corpus_file.readline()) + if text == "": + break + text= text + " ]" + ng.populateSyllableNgram(text) + ng.populateWordNgram(text) + print "Populated" + if(options.print_ngram): + ng = NGram () + print ng.getRoot().toString() + if(options.suggest_syllables): + ng = NGram () + print "Searching for" + options.suggest_words + print ng.searchNodeByName(unicode(options.suggest_syllables)) + if(options.suggest_syllables): + ng = NGram () + print "Searching for "+ options.suggest_words + print ng.searchNodeByName(unicode(options.suggest_words)) + + |