From b81278c14258c81c2ca3307491e74c504e84d26b Mon Sep 17 00:00:00 2001 From: Santhosh Thottingal Date: Sun, 7 Jun 2009 11:12:09 +0530 Subject: Soundex Module --- silpa/modules/soundex/soundex.py | 128 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 silpa/modules/soundex/soundex.py (limited to 'silpa/modules/soundex/soundex.py') diff --git a/silpa/modules/soundex/soundex.py b/silpa/modules/soundex/soundex.py new file mode 100644 index 0000000..3325dd1 --- /dev/null +++ b/silpa/modules/soundex/soundex.py @@ -0,0 +1,128 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright 2009 Santhosh Thottingal +# http://www.smc.org.in +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
#
# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in

import charmap
import sys
import re
from common import *


def _escape_html(text):
    """Return *text* with HTML metacharacters replaced by entities."""
    return (text.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;")
                .replace("'", "&#39;"))


def _field_text(form, name):
    """Return the value of form field *name* as text.

    Byte strings are decoded as UTF-8; values that are already text are
    returned unchanged.
    """
    value = form[name].value
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    return value


class Soundex(SilpaModule):
    """'Sounds like' comparison and search built on soundex codes.

    The per-character soundex digits come from the project's ``charmap``
    module; this class only combines them into codes and renders the
    search page.
    """

    # Page shown by process().  The two %s fields receive the (escaped)
    # input text and search key, in that order.
    # NOTE(review): the markup below is a minimal reconstruction -- the only
    # hard requirements visible in this module are the two %s fields and the
    # form field names 'input_text' and 'search_key' read by process().
    # Confirm against the intended page design.
    _PAGE_TEMPLATE = """
<h2>Soundex</h2>
<p>'Sounds like' search across Indian Languages.</p>
<p>Enter the text for searching in the below text area. You can enter the text in say, Hindi and search a Malayalam word in that. If the pronunciation of the search key is similar to any word in the text, it will be highlighted.
 All Indian Languages and English are supported. More about soundex</p>
<form action="" method="post">
<textarea cols="100" rows="25" name="input_text">%s</textarea>
<br/>
Search : <input type="text" name="search_key" value="%s"/>
<input type="submit" value="Find" name="action"/>
<input type="reset" value="Clear"/>
</form>
"""

    def soundexDigit(self, char):
        """Return the soundex digit for a single character.

        The character's language is looked up with ``charmap.language``;
        "en_US" characters use the "soundex_en" table, every other
        language uses the "soundex" table.  Returns 0 when the lookup
        fails (typically because the character is not in the charmap).
        """
        cm = charmap.charmap
        lang = charmap.language(char)
        table = "soundex_en" if lang == "en_US" else "soundex"
        try:
            return cm[table][cm[lang].index(char)]
        except Exception:
            # Best-effort lookup: any failure means "no soundex digit".
            # Unlike a bare ``except:`` this lets KeyboardInterrupt and
            # SystemExit propagate.
            return 0

    def soundex(self, name, len=5, indic=False):
        """Return the soundex code of *name*, padded/truncated to *len*.

        Soundex module conforming to Knuth's algorithm; implementation
        2000-12-24 by Gregory Jorgensen, public domain.

        name  -- the word to encode (lower-cased before encoding).
        len   -- length of the returned code.  The name shadows the
                 builtin but is kept because callers may pass it by
                 keyword.
        indic -- when false the first character of the code is the first
                 character of *name* itself; when true the code consists
                 of soundex digits only.
        """
        sndx = ''
        first_char = ''
        # Translate the characters of name into soundex digits.
        for char in name.lower():
            if not first_char:
                first_char = char  # remember the first letter
            digit = str(self.soundexDigit(char))
            # Consecutive duplicate digits are collapsed into one.
            if not sndx or digit != sndx[-1]:
                sndx += digit

        # Replace the first digit with the first character of the name.
        if not indic:
            sndx = first_char + sndx[1:]

        # Drop every 0, then pad with 0s up to the requested length.
        sndx = sndx.replace('0', '')
        return (sndx + (len * '0'))[:len]

    def compare(self, string1, string2, indic=True):
        """Return True when both strings have the same soundex code.

        With *indic* true the first characters are additionally checked
        with ``charmap.charCompare`` and a negative result means "no
        match".  Empty strings never match in that mode.  Always returns
        a bool.
        """
        if not indic:
            return (self.soundex(string1, indic=False)
                    == self.soundex(string2, indic=False))
        if not string1 or not string2:
            return False
        # NOTE(review): charCompare is defined in charmap; presumably it
        # tells whether the two leading characters are comparable across
        # scripts -- confirm there.
        if charmap.charCompare(string1[0], string2[0]) < 0:
            return False
        return (self.soundex(string1, indic=True)
                == self.soundex(string2, indic=True))

    def process(self, form):
        """Render the search page for the submitted *form*.

        form -- mapping-like object whose items expose ``.value``; the
                fields 'input_text' and 'search_key' are read.

        Returns the HTML page as a string.  When both fields are present
        every whitespace-separated word of the input text is listed and
        the words that sound like the search key are highlighted.
        """
        if 'input_text' not in form:
            return self._PAGE_TEMPLATE % ("", "")

        text = _field_text(form, 'input_text')
        key = ""
        if 'search_key' in form:
            key = _field_text(form, 'search_key').strip()

        # Form values are untrusted: escape them before embedding in HTML.
        page = self._PAGE_TEMPLATE % (_escape_html(text), _escape_html(key))
        if not key:
            return page + "Enter a string to search."

        parts = [page, "<h2>Search Results</h2>", "<p>"]
        # split() without arguments also splits on the newlines a textarea
        # produces and never yields empty words.
        for word in text.split():
            # NOTE(review): this range excludes lowercase ASCII letters, so
            # lowercase English words take the indic comparison path.  Kept
            # as in the original -- confirm whether 'z' was intended.
            ascii_leading = '0' < word[0] < 'Z'
            safe_word = _escape_html(word)
            if self.compare(word, key, not ascii_leading):
                parts.append('<span style="background-color: yellow;">'
                             + safe_word + '</span> ')
            else:
                parts.append(safe_word + ' ')
        parts.append("</p>")
        return "".join(parts)

    def get_module_name(self):
        """Return the display name of this module."""
        return "Soundex"

    def get_info(self):
        """Return a one-line description of this module."""
        return "Soundex Algorithm for Indian Languages and 'sounds like' search across Indian Languages"


def getInstance():
    """Return a new Soundex module instance."""
    return Soundex()