silpa/modules/ngram/ngram.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Ngram
# Copyright 2008-2009 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#

import codecs
import pickle
import pydot
import os,sys
from optparse import OptionParser
VERSION=0.1
MAX_TREE_DEPTH=1000
PICKLED_TREE="ngram.pyo"
class NgramNode:
	def __init__(self, node_value="*", rank=1, child_list=None):
		self.node_value=node_value
		self.rank=rank
		self.child_list=child_list
		self.desc="Start Node"
	def setNode(self, node_value="*", rank=None,childs=None, child_list=None):
		self.node_value=node_value
		self.rank=rank
		self.child_list=child_list
	def getName(self):
		return self.node_value	
	def getDesc(self):
		return self.desc
	def setDesc(self,desc):
		self.desc	=  desc
		return self.desc	
	def getRank(self):
		return self.rank	
	def setRank(self, rank):
		self.rank = rank
	def incrRank(self, incr=1):
		self.rank =  self.rank	+ incr
		return self.rank		
	def getChildList(self):
		if(self.child_list!=None):
			return self.child_list
		else:
			return None	
	def getChildByName(self,child_name):
		if(self.child_list==None):
			return None
		for child in self.child_list:
			if(child.getName()==child_name):
				return child
	def childIndex(self,childnode):
		if(self.child_list==None):
			return -1
		for child in self.child_list:
			if(child.getName()==childnode.getName()):
				return self.child_list.index(child)
		return -2		
	def printChildList(self):
		if(self.child_list==None):
			return None
		for child in self.child_list:
			print child,
	def addChildNode(self, node):
		if(node!=None):
			if(self.child_list==None):
				self.child_list=[]
			#Check whether this node is already present in the Ngram Tree	
			member_index=self.childIndex(node)
			if(member_index>=0):
				#Node already present.Incrementing Rank
				self.child_list[member_index].incrRank()
			else:
				self.child_list.append(node)		
		#Keep it sorted as per the ranks		
		self.child_list.sort()		
	def removeChildNode(self, node):
		if(node!=None & self.child_list!=None):
			self.child_list.remove(node)		
	def __str__(self):
		return "Node:  %s[%d]" % (self.node_value, self.rank)
	'''Recursively traverse through the tree and print the nodes-Depth First Traversal'''	
	def toString(self):
		print "Node:  %s[%d]" % (self.node_value, self.rank)
		child_list=self.getChildList()
		if(child_list!=None):
			for child_node in child_list :
				child_node.toString()
	'''Defining the less than operater of the object'''			
	def __lt__(self, node):
		return 	self.getRank() < node.getRank()
	'''Defining the greater than operater of the object'''
	def __gt__(self, node):
		return 	self.getRank() > node.getRank()
	'''Defining the equal-to operater of the object'''	
	def __eq__(self, node):
		if(node==None):
			return False
		return 	(self.getName() == node.getName()) & (self.getRank() == node.getRank())
	'''Defining the comparison of two object instances. Required for sorting the list of objects'''	
	def __cmp__(self, node):
		if(node==None):
			return 1
		if(self.getName()==node.getName()):
			return cmp(self.getRank(), node.getRank())
		else:
			return 1	
	

#Syllable Node Class
#Extends NgramNode class
class SyllableNode(NgramNode):
	def __str__(self):
		return ("Syllable: %s[%d]" % (self.node_value, self.rank )).encode('utf-8')
#Word Node Class
#Extends NgramNode class
class WordNode(NgramNode):
	def __str__(self):
		return  ("Word: %s[%d]" % (self.node_value, self.rank )).encode('utf-8')

class NGram:
	def __init__(self, text=None, language=None):
		self.text=None
		self.language=None
		try:
			#Try loading picked tree object
			self.ngrams=pickle.load(open(PICKLED_TREE))
			print "Loaded the ngram from " + PICKLED_TREE
		except:	
			#Initialize with empty node
			self.ngrams=NgramNode()
			print "New one"
		self.search_depth=0	
	def getRoot(self, node_name=None):
		if(node_name==None):
			return self.ngrams
		else:
			return self.searchNode(node_name)
				
	def searchNodeByName(self, node_name, current_node=None, depth=MAX_TREE_DEPTH):
		if(current_node==None):
			current_node=self.getRoot()
			self.search_depth = 0
		if(self.search_depth==depth):
			return None
		if(current_node.getName() == node_name):
			print "Found at depth", self.search_depth
			return current_node
		else:
			child_list=current_node.getChildList()
			if(child_list==None):
				return None
			else:
				child_list=child_list	
			self.search_depth = self.search_depth+1				
			for child_node in child_list :
				result_node=self.searchNodeByName(node_name,child_node, depth)
				if(result_node!=None):
					return result_node
	def printNgram(self, current_node=None):
		if(current_node==None):
			current_node=self.getRoot()
		print current_node
		child_list=current_node.getChildList()
		
		if(child_list==None):
			return None
		else:
			child_list.sort()	
		for child_node in child_list :
			self.printNgram(child_node)
	def toDot(self,  graph , current_node=None):
		if(current_node==None):
			current_node=self.getRoot()
		child_list=current_node.getChildList()
		if(child_list!=None):
			key=current_node.getName()
			for child_node in child_list:
				value=child_node.getName()
				if((key!=None) & ord(key[len(key)-1])<=0x0901 & len(key)>1):
					key=key[0:len(key)-1]
				if(value!=None):	
					if((ord(value[len(value)-1])<=0x0901) & len(value)>1):
						value=value[0:len(value)-2]	
					graph.add_edge(pydot.Edge(key.encode('utf-8'),value.encode('utf-8')))
					self.toDot(graph,child_node)
	def toGraph(self, output_image_file):
		graph=pydot.Dot()
		self.toDot(graph)
		#print graph.to_string().encode('utf-8')
		graph.write(output_image_file,"dot", "png" )
		
	def addSyllables(self,text, window_size=2):
		words=text.split(" ")
		ngrams = []
		for word in words:
			#TODO-Normalize before taking ngram!!!
			word = "*"+word+"]"
			syllables = self.syllabalize_ml(word)
			syllable_count = len(syllables)
			window_start = 0
			window_end = 0
			while window_start + window_size <= syllable_count:
				if(window_start + window_size < syllable_count):
					window_end = window_start + window_size
				else:
					window_end = syllable_count	
				ngrams.append(syllables[window_start:window_end])
				window_start = window_start+1
		return 	ngrams
	'''Syllabalize a given Malayalam string. Based on ml-split code by Baiju M'''		
	def syllabalize_ml(self,text):
		signs = [
		u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
		u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
		u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d']
		limiters = ['.','\"','\'','`','!',';',',','?', ']']
		chandrakkala = u'\u0d4d'
		lst_chars = []
		for char in text:
			if char in limiters:
				lst_chars.append(char)
			elif char in signs:
				lst_chars[-1] = lst_chars[-1] + char
			else:
				try:
					if lst_chars[-1][-1] == chandrakkala :
						lst_chars[-1] = lst_chars[-1] + char
					else:
						lst_chars.append(char)
				except IndexError:
					lst_chars.append(char)

		return lst_chars	
	def addWords(self,text, window_size=2):
		text = "* "+text+" ]"
		words = text.split(" ")
		ngrams = []
		word_count = len(words)
		window_start = 0
		window_end = 0
		while window_start + window_size <= word_count:
			if(window_start + window_size < word_count):
				window_end = window_start + window_size
			else:
				window_end = word_count	
			words[window_start:window_end]	
			ngrams.append(words[window_start:window_end])
			window_start = window_start+1
		return 	ngrams
	def populateSyllableNgram(self, text):
		ngrams = self.addSyllables(text)
		for ngram in ngrams:
			ngram_str=""
			for item in ngram:
				if(item.strip()>""):
					if(ngram_str==""):
						ngram_str=ngram_str+ item 
					else:
						
						if(ngram_str=="["):
							parent_node=self.getRoot()
						else:	
							parent_node=self.searchNodeByName(ngram_str,self.getRoot())
						if(parent_node==None):
							print "Parent node not found for " + item
						else:	
							parent_node.addChildNode(SyllableNode(item))
							print ngram_str+ " -> "+item 	
		#pickle the tree				
		pickle.dump(self.getRoot(),open(PICKLED_TREE,'w'))
 	def populateWordNgram(self, text):
		ng = NGram () 
		ngrams = ng.addWords(text)
		for ngram in ngrams:
			ngram_str=""
			for item in ngram:
				if(item.strip()>""):
					if(ngram_str==""):
						ngram_str=ngram_str+ item 
					else:
						if(ngram_str=="*"):
							parent_node=self.getRoot()
						else:	
							parent_node=self.searchNodeByName(ngram_str,self.getRoot())
						if(parent_node==None):
							print "Parent node not found for " + item
						else:	
							parent_node.addChildNode(WordNode(item))
							print ngram_str+ " -> "+item 	
		#pickle the tree				
		pickle.dump(self.getRoot(),open(PICKLED_TREE,'w'))	
if __name__ == "__main__":
	usage = "usage: %prog [options] inputfile"
	parser = OptionParser(version="%prog 0.1",description="Malayalam NGram Analyser")
	parser.set_usage(usage)
	parser.add_option("-g", "--generate-graph", dest="gen_graph",help="Generates a graph in png format to visualize the ngram")
	parser.add_option("-p", "--print", action="store_true",default=False,dest="print_ngram",help="Print the Ngram")
	parser.add_option("-i", "--input-file", dest="input_file",help="Input File for learning")
	parser.add_option("-s", "--suggest-syllables", dest="suggest_syllables",help="Suggest next possible syllables for the given letter/syllable ")
	parser.add_option("-w", "--suggest-words", dest="suggest_words",help="Suggest next possible words for the given word ")
	(options, args) = parser.parse_args()
	
	if(options.gen_graph):
		ng = NGram () 	
		ng.toGraph(options.gen_graph)
	if(options.input_file):
		if not os.path.exists(options.input_file):
			print "File Doesnot Exist"
			sys.exit(1)
		else:
			corpus_file = codecs. open(options.input_file,encoding='utf-8', errors='ignore')
			ng = NGram () 	
			while 1:
	   			text = unicode( corpus_file.readline())	
	   			if text == "":
					break
				text= text + " ]"	
				ng.populateSyllableNgram(text)
				ng.populateWordNgram(text)
			print "Populated"
	if(options.print_ngram):
		ng = NGram () 	
		print ng.getRoot().toString()
	if(options.suggest_syllables):
		ng = NGram () 	
		print "Searching for" + options.suggest_words
		print ng.searchNodeByName(unicode(options.suggest_syllables))
	if(options.suggest_syllables):
		ng = NGram () 	
		print "Searching  for "+ options.suggest_words
		print ng.searchNodeByName(unicode(options.suggest_words))