new modules

author: Santhosh Thottingal <santhosh.thottingal@gmail.com> 2009-03-29 17:59:40 +0530
committer: Santhosh Thottingal <santhosh.thottingal@gmail.com> 2009-03-29 17:59:40 +0530
commit: c5368252e3091368ae55475757ed3134d6f84249 (patch)
tree: 09687b84ab042ba9d339dec9d1d92ecc377d0bac /silpa/modules/lemmatizer/lemmatizer.py
parent: f37edebde2304ee0643804166bf19ffee5c9dba5 (diff)
download: Rachana.git-c5368252e3091368ae55475757ed3134d6f84249.tar.gz
Rachana.git-c5368252e3091368ae55475757ed3134d6f84249.tar.xz
Rachana.git-c5368252e3091368ae55475757ed3134d6f84249.zip
1 file changed, 45 insertions, 46 deletions
diff --git a/silpa/modules/lemmatizer/lemmatizer.py b/silpa/modules/lemmatizer/lemmatizer.py
index 8b9a35c..212cded 100644
--- a/silpa/modules/lemmatizer/lemmatizer.py
+++ b/silpa/modules/lemmatizer/lemmatizer.py
@@ -6,55 +6,19 @@ import codecs
 import os  
 import string
 import curses.ascii 
-class Lemmatizer:
+from common import SilpaModule
+class Lemmatizer(SilpaModule):
 
 	def __init__(self):
-		self.input_filename =""
-		self.output_filename = ""
-		self.rules_file = ""
+		self.rules_file = "./modules/lemmatizer/lemmatizer_ml.rules"
 		self.rulesDict = dict()
 		
-	def Lemmatize(self):
-		result = ""
-		self.rulesDict = self.LoadRules()
-		if self.input_filename :
-			uni_file = codecs.open(self.input_filename, encoding = 'utf-8', errors = 'ignore')
-		else :
-			uni_file = codecs.open(sys.stdin, encoding = 'utf-8', errors = 'ignore')			
-		text = ""
-		if self.output_filename :
-			output_file = codecs.open(self.output_filename, encoding = 'utf-8', errors = 'ignore',  mode='w+')			
-		line_number = 0
-		while 1:
-   			text = uni_file.readline()
-   			line_number = line_number + 1
-			if text == "":
-				break
-			words = text.split(" ")
-			word_count = len(words)
-			word_iter = 0
-			word = ""
-			while word_iter < word_count:
-				word = words[word_iter]
-				word_length = len(word)
-				print word_length
-				suffix_pos_itr = 2
-				while suffix_pos_itr   <  word_length : 
-					suffix = word[suffix_pos_itr:word_length]
-					if suffix in self.rulesDict:
-						word = word[0:suffix_pos_itr] +  self.rulesDict[suffix]
-						break
-					suffix_pos_itr = suffix_pos_itr + 1	
-				word_iter = word_iter + 1
-				print word	
-				result = result + word + ""
-			result="\n"	
-		return result
-	def Lemmatize(self, text):
+	def lemmatize(self, text):
 		result = ""
 		self.rulesDict = self.LoadRules()
 		words=text.split(" ")
 		word_count=len(words)
+		result_dict = dict()
 		word_iter=0
 		word=""
 		while word_iter < word_count:
@@ -62,16 +26,18 @@ class Lemmatizer:
 			word = self.trim(word)
 			word_length = len(word)
 			suffix_pos_itr = 2
+			word_lemmatized=""
 			while suffix_pos_itr < word_length :
 				suffix = word[suffix_pos_itr:word_length]
 				if suffix in self.rulesDict:
-					word= word[0:suffix_pos_itr] +  self.rulesDict[suffix]
+					word_lemmatized= word[0:suffix_pos_itr] +  self.rulesDict[suffix]
 					break;
 				suffix_pos_itr = suffix_pos_itr+1	
 			word_iter = word_iter+1
-			#print word	
-			result = result + word + " "
-		return result
+			if(word_lemmatized==""):
+				word_lemmatized=word
+			result_dict[ word ] = word_lemmatized
+		return result_dict
 					
 	def LoadRules(self):	
 		print "Loading the rules..."
@@ -125,8 +91,41 @@ class Lemmatizer:
 				break 
 			index=index-1	
 		return word
+	def process(self, form):
+		response = """
+		<h2>Lemmatization</h2></hr>
+		<p>Enter the text for lemmatization in the below text area.
+		 Language of each  word will be detected. 
+		 You can give the text in any language and even with mixed language
+		</p>
+		<form action="" method="post">
+		<textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea>
+		<input  type="submit" id="Lemmatize" value="Lemmatize"  name="action" style="width:12em;"/>
+		<input type="reset" value="Clear" style="width:12em;"/>
+		</br>
+		</form>
+		"""
+		if(form.has_key('input_text')):
+			text = action=form['input_text'].value	.decode('utf-8')
+			response=response % text
+			result_dict = self.lemmatize(text)
+			response = response+"<h2>Lemmatization Results</h2></hr>"
+			response = response+"<table class=\"table1\"><tr><th>Word</th><th>Lemmatized form</th></tr>"
+			for key in result_dict:
+				response = response+"<tr><td>"+key+"</td><td>"+result_dict[key]+"</td></tr>"
+			response = response+"</table>	"
+		else:
+			response=response % ""	
+		return response
+	def get_module_name(self):
+		return "Lemmatizer"
+	def get_info(self):
+		return 	"Malayalam Lemmatizer(Experimental)"
+		
+def getInstance():
+	return Lemmatizer()	
 if __name__ == "__main__":
 	lemmatizer= Lemmatizer()
 	lemmatizer.rules_file="/home/santhosh/www/malayalam.map"
-	lemmatizer.Lemmatize("മുദ്രാവാക്യവുമായി മുറ്റത്തില്‍")
+	lemmatizer.lemmatize("മുദ്രാവാക്യവുമായി മുറ്റത്തില്‍")
author	Santhosh Thottingal <santhosh.thottingal@gmail.com>	2009-03-29 17:59:40 +0530
committer	Santhosh Thottingal <santhosh.thottingal@gmail.com>	2009-03-29 17:59:40 +0530
commit	c5368252e3091368ae55475757ed3134d6f84249 (patch)
tree	09687b84ab042ba9d339dec9d1d92ecc377d0bac /silpa/modules/lemmatizer/lemmatizer.py
parent	f37edebde2304ee0643804166bf19ffee5c9dba5 (diff)
download	Rachana.git-c5368252e3091368ae55475757ed3134d6f84249.tar.gz Rachana.git-c5368252e3091368ae55475757ed3134d6f84249.tar.xz Rachana.git-c5368252e3091368ae55475757ed3134d6f84249.zip