silpa/modules/lemmatizer/lemmatizer.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132

#! /usr/bin/env python
# -*- coding: utf-8 -*-

import sys  
import codecs  
import os  
import string
import curses.ascii 
class Lemmatizer:
	"""Rule-based lemmatizer that replaces known word suffixes.

	Rules are read from the UTF-8 text file named by ``rules_file``.  Each
	rule is one line of the form ``suffix = replacement``; either side may
	be wrapped in double quotes and ``#`` starts a comment.

	Attributes the caller sets before use:
		rules_file: path of the rules file (required).
		input_filename: UTF-8 file to process when Lemmatize() is called
			without text; standard input is read when it is empty.
		output_filename: optional file that receives the result of a
			file/standard-input run.
		rulesDict: suffix -> replacement mapping, refreshed by Lemmatize().
	"""

	# Characters that trim() removes from the end of a word.
	_PUNCTUATION = '~!@#$%^&*()-+_={}|:;<>,.?'

	def __init__(self):
		self.input_filename = ""
		self.output_filename = ""
		self.rules_file = ""
		self.rulesDict = dict()

	def Lemmatize(self, text=None):
		"""Lemmatize ``text``, or the configured input when ``text`` is None.

		The rules file is reloaded on every call.

		Args:
			text: words separated by single spaces.  Byte strings are
				decoded as UTF-8 so that they can match the (unicode) rule
				keys.  When omitted, ``input_filename`` (or standard input
				if that is empty) is processed line by line instead.

		Returns:
			For ``text``: every lemmatized word followed by one space (so
			the result ends with a space).  For file/standard-input runs:
			one output line per input line, words joined by single spaces;
			the same string is written to ``output_filename`` when set.
		"""
		self.rulesDict = self.LoadRules()
		if text is None:
			return self._lemmatize_stream()
		return "".join(word + " " for word in self._lemmatize_words(text))

	def _lemmatize_stream(self):
		"""Lemmatize the input file (or standard input) line by line."""
		if self.input_filename:
			source = codecs.open(self.input_filename, encoding="utf-8", errors="ignore")
		else:
			# Python 3 exposes the raw bytes as sys.stdin.buffer; on
			# Python 2 sys.stdin itself already yields bytes.
			byte_stream = getattr(sys.stdin, "buffer", sys.stdin)
			source = codecs.getreader("utf-8")(byte_stream, errors="ignore")
		try:
			lines = [
				" ".join(self._lemmatize_words(line.rstrip("\r\n"))) + "\n"
				for line in source
			]
		finally:
			# Standard input is not ours to close.
			if self.input_filename:
				source.close()
		result = "".join(lines)
		if self.output_filename:
			with codecs.open(self.output_filename, mode="w", encoding="utf-8") as output_file:
				output_file.write(result)
		return result

	def _lemmatize_words(self, text):
		"""Return the lemmatized form of each space-separated word in ``text``."""
		if isinstance(text, bytes):
			# Rule keys are unicode; undecoded non-ASCII bytes never match.
			text = text.decode("utf-8", "ignore")
		return [self._lemmatize_word(word) for word in text.split(" ")]

	def _lemmatize_word(self, word):
		"""Trim ``word`` and replace its longest suffix that has a rule."""
		word = self.trim(word)
		# Start at 2 so that at least two characters of the stem are kept;
		# scanning left to right tries the longest suffix first.
		for split_at in range(2, len(word)):
			suffix = word[split_at:]
			if suffix in self.rulesDict:
				return word[:split_at] + self.rulesDict[suffix]
		return word

	def LoadRules(self):
		"""Parse ``rules_file`` and return a suffix -> replacement dict.

		Lines that are empty or contain only a comment are skipped.  A line
		that does not contain exactly one ``=`` is reported on standard
		output together with its 1-based line number and then skipped.
		Later rules override earlier ones with the same suffix.

		Raises:
			IOError/OSError: if the rules file cannot be opened.
		"""
		print("Loading the rules...")
		rules = dict()
		rule_count = 0
		with codecs.open(self.rules_file, encoding="utf-8", errors="ignore") as rules_file:
			for line_number, text in enumerate(rules_file, 1):
				# Drop the comment part; this also covers full-line comments.
				line = text.split("#")[0].strip()
				if not line:
					continue
				parts = line.split("=")
				if len(parts) != 2:
					print("[Error] Syntax Error in the Rules. Line number:  %d" % line_number)
					print("Line: " + text)
					continue
				lhs = self._unquote(parts[0].strip())
				rhs = self._unquote(parts[1].strip())
				rule_count += 1
				rules[lhs] = rhs
		print("Found  %d  rules." % rule_count)
		return rules

	def _unquote(self, value):
		"""Remove one leading and one trailing double quote, if present."""
		if value.startswith('"'):
			value = value[1:]
		if value.endswith('"'):
			value = value[:-1]
		return value

	def trim(self, word):
		"""Strip surrounding whitespace and trailing punctuation from ``word``.

		The first character is never removed, so a token that consists only
		of punctuation keeps its first character (``"..."`` becomes ``"."``).
		"""
		word = word.strip()
		return word[:1] + word[1:].rstrip(self._PUNCTUATION)
if __name__ == "__main__":
	# Demo: lemmatize a short Malayalam sample and print the result.
	lemmatizer = Lemmatizer()
	# An optional first command-line argument overrides the default rules
	# file, which only exists on the original author's machine.
	lemmatizer.rules_file = sys.argv[1] if len(sys.argv) > 1 else "/home/santhosh/www/malayalam.map"
	# Unicode literal: the rules are loaded as unicode, so a Python 2 byte
	# string would never match any rule.
	lemmatized = lemmatizer.Lemmatize(u"മുദ്രാവാക്യവുമായി മുറ്റത്തില്‍")
	if not isinstance(lemmatized, str):
		# Python 2 unicode: encode explicitly so printing does not depend
		# on the encoding of standard output.
		lemmatized = lemmatized.encode("utf-8")
	print(lemmatized)