silpa/modules/lemmatizer/lemmatizer.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

#! /usr/bin/env python
# -*- coding: utf-8 -*-

import sys  
import codecs  
import os  
import string
import curses.ascii 
from common import SilpaModule
class Lemmatizer(SilpaModule):

	def __init__(self):
		self.rules_file = "./modules/lemmatizer/lemmatizer_ml.rules"
		self.rulesDict = dict()
		
	def lemmatize(self, text):
		result = ""
		self.rulesDict = self.LoadRules()
		words=text.split(" ")
		word_count=len(words)
		result_dict = dict()
		word_iter=0
		word=""
		while word_iter < word_count:
			word = words[word_iter]
			word = self.trim(word)
			word_length = len(word)
			suffix_pos_itr = 2
			word_lemmatized=""
			while suffix_pos_itr < word_length :
				suffix = word[suffix_pos_itr:word_length]
				if suffix in self.rulesDict:
					word_lemmatized= word[0:suffix_pos_itr] +  self.rulesDict[suffix]
					break;
				suffix_pos_itr = suffix_pos_itr+1	
			word_iter = word_iter+1
			if(word_lemmatized==""):
				word_lemmatized=word
			result_dict[ word ] = word_lemmatized
		return result_dict
					
	def LoadRules(self):	
		print "Loading the rules..."
		rules_dict = dict()
		line = []
		line_number = 0
		rule_number = 0
		rules_file = codecs. open(self.rules_file,encoding='utf-8', errors='ignore')
		while 1:
			line_number = line_number +1 
   			text = unicode( rules_file.readline())
			if text == "":
			      break
			if text[0] == '#': 
			      continue  #this is a comment - ignore
			text = text.split("#")[0]   #remove the comment part of the line     
			line_number = line_number +1       
			line = text.strip()  # remove unwanted space
			if(line == ""):
				  continue 
			if(len(line.split("=")) != 2):
					print "[Error] Syntax Error in the Rules. Line number: ",  line_number
				  	print "Line: "+ text
				  	continue 
	 		lhs = line.split("=") [ 0 ]  .strip()
	 		rhs = line.split("=") [ 1 ]  .strip()
	 		if(len(rhs)>0):
	 			if(lhs[0]=='"'):
	 				lhs=lhs[1:len(lhs)] # if the string is "quoted"
	 			if(lhs[len(lhs)-1]=='"'):
	 				lhs=lhs[0:len(lhs)-1] # if the string is "quoted"
	 		if(len(rhs)>0):
	 			if(rhs[0]=='"'):
	 				rhs=rhs[1:len(rhs)]  # if the string is "quoted"
	 			if(rhs[len(rhs)-1]=='"'):
	 				rhs=rhs[0:len(rhs)-1]	 # if the string is "quoted"			
	 		rule_number=rule_number+1
			rules_dict[lhs]=rhs
			#print "[", rule_number ,"] " +lhs + " : " +rhs
		print "Found ",rule_number, " rules."
		return rules_dict
	
	def trim(self,word):
		punctuations=['~','!','@','#','$','%','^','&','*','(',')','-','+','_','=','{','}','|' ,':',';','<','>','\,','.','?']
		word=word.strip()
		index=len(word)-1
		while index>0:
			if word[index] in punctuations:
				word=word[0:index]
			else:
				break 
			index=index-1	
		return word
	def process(self, form):
		response = """
		<h2>Lemmatization</h2></hr>
		<p>Enter the text for lemmatization in the below text area.
		 Language of each  word will be detected. 
		 You can give the text in any language and even with mixed language
		</p>
		<form action="" method="post">
		<textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea>
		<input  type="submit" id="Lemmatize" value="Lemmatize"  name="action" style="width:12em;"/>
		<input type="reset" value="Clear" style="width:12em;"/>
		</br>
		</form>
		"""
		if(form.has_key('input_text')):
			text = action=form['input_text'].value	.decode('utf-8')
			response=response % text
			result_dict = self.lemmatize(text)
			response = response+"<h2>Lemmatization Results</h2></hr>"
			response = response+"<table class=\"table1\"><tr><th>Word</th><th>Lemmatized form</th></tr>"
			for key in result_dict:
				response = response+"<tr><td>"+key+"</td><td>"+result_dict[key]+"</td></tr>"
			response = response+"</table>	"
		else:
			response=response % ""	
		return response
	def get_module_name(self):
		return "Lemmatizer"
	def get_info(self):
		return 	"Malayalam Lemmatizer(Experimental)"
		
def getInstance():
	return Lemmatizer()	
if __name__ == "__main__":
	lemmatizer= Lemmatizer()
	lemmatizer.rules_file="/home/santhosh/www/malayalam.map"
	lemmatizer.lemmatize("മുദ്രാവാക്യവുമായി മുറ്റത്തില്‍")