diff options
Diffstat (limited to 'silpa/modules')
-rwxr-xr-x[-rw-r--r--] | silpa/modules/dictionary/dictionary.py | 47 | ||||
-rwxr-xr-x[-rw-r--r--] | silpa/modules/inexactsearch/inexactsearch.py | 102 | ||||
-rwxr-xr-x[-rw-r--r--] | silpa/modules/syllabalizer/syllabalizer.py | 58 |
3 files changed, 143 insertions, 64 deletions
diff --git a/silpa/modules/dictionary/dictionary.py b/silpa/modules/dictionary/dictionary.py index 4de0f76..927c06c 100644..100755 --- a/silpa/modules/dictionary/dictionary.py +++ b/silpa/modules/dictionary/dictionary.py @@ -1,6 +1,6 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- -# English Malayalam Dictionary +# Dictionary # Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com> # http://www.smc.org.in # @@ -24,49 +24,56 @@ from common import * import os -import pickle +from dictdlib import DictDB class Dictionary(SilpaModule): - def lookup_en_ml(self, key): - key=key.lower() - self.dictFile=os.path.dirname(__file__) + "/data/dict.dat" - pickled_dict=open(self.dictFile,'r') - self.dictionary=pickle.load(pickled_dict) - meaning="" - if self.dictionary.has_key(key): - meaningList=self.dictionary[key] - for meaning_item in meaningList: - meaning=meaning+meaning_item.strip() +"<br/>" - else : - meaning="No Meaning found" - return meaning.decode('utf-8') + def getdef(self, word, dictionary): + dict_dir=os.path.join(os.path.dirname(__file__), 'dictionaries') + dictdata=dict_dir+ "/"+dictionary + dict=DictDB(dictdata) + meanings = dict.getdef(word) + meaningstring= "" + if (meanings==None): + meaningstring = "No definition found" + return meaningstring + for meaning in meanings: + meaningstring += meaning + return meaningstring.decode("utf-8") def process(self,form): response = """ - <h2>English Malayalam Dictionary</h2></hr> + <h2>Dictionary</h2></hr> <p>Enter the word to lookup in the dictionary </p> <form action="" method="post"> - <input type="text" value="%s" name="word"/> + <p align="center"> + Word : <input type="text" value="%s" name="word"/> + Dictionary :<select id="dictionary" name="dictionary" style="width:12em;"> + <option value="freedict-eng-hin">English-Hindi</option> + <option value="freedict-eng-mal">English-Malayalam</option> + </select> <input type="hidden" name="action" value="Dictionary"> + </br> <input type="submit" id="Find_Meaning" value="Find Meaning" style="width:12em;"/> </br> + </p> </form> """ if(form.has_key('word')): search_key = form['word'].value + dictionary = form['dictionary'].value response=response % search_key response = response+"<h2>Search Results</h2></hr>" if(search_key==None): response = response+ "Enter a word to find meaning." else: - response = response+ self.lookup_en_ml(search_key) + response = response+ "<pre> "+ self.getdef(search_key,dictionary) + "</pre> " else: response=response % "" return response def get_module_name(self): - return "English Malayalam Dictionary" + return "Dictionary" def get_info(self): - return "English Malayalam Dictionary. Dictionary is compiled by Kerala state IT Mission" + return "Bilingual Dictionaries" def getInstance(): return Dictionary() diff --git a/silpa/modules/inexactsearch/inexactsearch.py b/silpa/modules/inexactsearch/inexactsearch.py index 0d1f35b..67e47d9 100644..100755 --- a/silpa/modules/inexactsearch/inexactsearch.py +++ b/silpa/modules/inexactsearch/inexactsearch.py @@ -1,6 +1,6 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- -# Paralperu +# Approximate Search # Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com> # http://www.smc.org.in # @@ -28,32 +28,11 @@ from common import * class ApproximateSearch(SilpaModule): - def syllabalize_ml(self, text): - signs = [ - u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41', - u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48', - u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d'] - limiters = ['.','\"','\'','`','!',';',',','?'] - - chandrakkala = u'\u0d4d' - lst_chars = [] - for char in text: - if char in limiters: - lst_chars.append(char) - elif char in signs: - lst_chars[-1] = lst_chars[-1] + char - else: - try: - if lst_chars[-1][-1] == chandrakkala: - lst_chars[-1] = lst_chars[-1] + char - else: - lst_chars.append(char) - except IndexError: - lst_chars.append(char) - - return lst_chars - - + def syllabalize(self, text): + mm=ModuleManager() + syllabalizer = mm.getModuleInstance("Syllabalize") + return syllabalizer.syllabalize(text) + def bigram_search(self, str1, str2, syllable_search=False): """Return approximate string comparator measure (between 0.0 and 1.0) using bigrams. @@ -84,10 +63,19 @@ class ApproximateSearch(SilpaModule): # Make a list of bigrams for both strings - - - - - - - - - - - - - - - - - - # - for i in range(1,len(str1)): - bigr1.append(str1[i-1:i+1]) - for i in range(1,len(str2)): - bigr2.append(str2[i-1:i+1]) + if(syllable_search): + str1_syllables = self. syllabalize(str1) + str2_syllables = self. syllabalize(str2) + for i in range(1,len(str1_syllables)): + bigr1.append(str1_syllables[i-1:i+1]) + for i in range(1,len(str2_syllables)): + bigr2.append(str2_syllables[i-1:i+1]) + else: + for i in range(1,len(str1)): + bigr1.append(str1[i-1:i+1]) + for i in range(1,len(str2)): + bigr2.append(str2[i-1:i+1]) + # Compute average number of bigrams - - - - - - - - - - - - - - - - - - - - - # @@ -105,11 +93,22 @@ class ApproximateSearch(SilpaModule): else: short_bigr = bigr2 long_bigr = bigr1 - - for b in short_bigr: - if (b in long_bigr): - common += 1.0 - long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted + if(syllable_search): + for b in short_bigr: + if (b in long_bigr): + if long_bigr.index(b) == short_bigr.index(b) : + common += 1.0 + else: + dislocation=(long_bigr.index(b) - short_bigr.index(b))/ average + if dislocation < 0 : + dislocation = dislocation * -1 + common += 1.0 - dislocation + long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted + else: + for b in short_bigr: + if (b in long_bigr): + common += 1.0 + long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted w = common / average if(w>=0.6): @@ -139,30 +138,47 @@ class ApproximateSearch(SilpaModule): <form action="" method="post"> <textarea cols='100' rows='25' name='input_text' id='input_text'>%s</textarea> <br/> - <input type="text" name="search_key" value="%s"/> - <input type="submit" id="Hyphenate" value="Approximate Search" name="action" style="width:12em;"/> + <p align="center"> + Search :<input type="text" name="search_key" value="%s"/> + Algorithm : <select id="algorithm" name="algorithm" value="%s" style="width:12em;"> + <option value="sb">Syllable Bigram</option> + <option value="lb">Letter Bigram</option> + </select> </br> + <input type="hidden" name="action" value="Approximate Search"> + + <input type="submit" id="ApproximateSearch" value="Search" style="width:12em;"/> + </p> </form> """ + algorithm = 'sb' + if(form.has_key('algorithm')): + algorithm = form['algorithm'].value if(form.has_key('input_text')): text = action=form['input_text'].value .decode('utf-8') if(form.has_key('search_key')): - key = action=form['search_key'].value .decode('utf-8') - response=response % (text,key) + key =form['search_key'].value .decode('utf-8') + response=response % (text,key,algorithm) words=text.split(" ") response = response+"<h2>Search Results</h2></hr>" response = response+"<p>Words in green are with exact match. Words in Yellow are with approximate Match." response = response+" Move your mouse pointer over the words to get more information on matching.</p></hr>" else: response = response+ "Enter a string to search." - return response % (text,"") + return response % (text,"", algorithm) for word in words: word=word.strip() if(word>""): - response = response+ self.bigram_search(word, key) + if word[0]>'0' and word[0]<'Z': + response = response+ self.bigram_search(word, key,False) + else: + if algorithm == 'sb': + response = response+ self.bigram_search(word, key, True) + else: + response = response+ self.bigram_search(word, key, False) response = response+ "<div style='float: left;'> </div>" else: - response=response % ("","") + response=response % ("","","sb") return response def get_module_name(self): return "Approximate Search" diff --git a/silpa/modules/syllabalizer/syllabalizer.py b/silpa/modules/syllabalizer/syllabalizer.py index 706ee77..39c140f 100644..100755 --- a/silpa/modules/syllabalizer/syllabalizer.py +++ b/silpa/modules/syllabalizer/syllabalizer.py @@ -51,6 +51,54 @@ class Syllabalizer(SilpaModule): lst_chars.append(char) return lst_chars + def syllabalize_kn(self,text): + signs = [ + u'\u0c82', u'\u0c83', u'\u0cbd', u'\u0cbe', u'\u0cbf', u'\u0cc0', u'\u0cc1', + u'\u0cc2', u'\u0cc3', u'\u0cc4', u'\u0cc6', u'\u0cc7', u'\u0cc8', + u'\u0cca', u'\u0ccb', u'\u0ccc', u'\u0ccd'] + limiters = ['.','\"','\'','`','!',';',',','?'] + + halant = u'\u0ccd' + lst_chars = [] + for char in text: + if char in limiters: + lst_chars.append(char) + elif char in signs: + lst_chars[-1] = lst_chars[-1] + char + else: + try: + if lst_chars[-1][-1] == halant: + lst_chars[-1] = lst_chars[-1] + char + else: + lst_chars.append(char) + except IndexError: + lst_chars.append(char) + + return lst_chars + def syllabalize_bn(self,text): + signs = [ + u'\u0981', u'\u0982', u'\u0983', u'\u09bd', u'\u09be', u'\u09bf', u'\u09c0', u'\u09c1', + u'\u09c2', u'\u09c3', u'\u09c4', u'\u09c6', u'\u09c7', u'\u09c8', + u'\u09ca', u'\u09cb', u'\u09cc', u'\u09cd', u'\u09d7'] + limiters = ['.','\"','\'','`','!',';',',','?'] + + halant = u'\u09cd' + lst_chars = [] + for char in text: + if char in limiters: + lst_chars.append(char) + elif char in signs: + lst_chars[-1] = lst_chars[-1] + char + else: + try: + if lst_chars[-1][-1] == halant: + lst_chars[-1] = lst_chars[-1] + char + else: + lst_chars.append(char) + except IndexError: + lst_chars.append(char) + + return lst_chars def syllabalize_hi(self,text): signs = [ u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940', u'\u0941', @@ -176,11 +224,19 @@ class Syllabalizer(SilpaModule): def syllabalize(self,text): mm=ModuleManager() ld = mm.getModuleInstance("Detect Language") - lang=ld.detect_lang(text)[text] + lang = None + try: + lang=ld.detect_lang(text)[text] + except: + pass #FIXME if(lang=="ml_IN"): return self.syllabalize_ml(text) if(lang=="hi_IN"): return self.syllabalize_hi(text) + if(lang=="kn_IN"): + return self.syllabalize_kn(text) + if(lang=="bn_IN"): + return self.syllabalize_bn(text) if(lang=="en_US"): return self.syllabalize_en(text) lst_chars=[] |