diff options
-rw-r--r-- | silpa/common/modulemanager.py | 8 | ||||
-rw-r--r-- | silpa/modules/__init__.py | 1 | ||||
-rw-r--r-- | silpa/modules/dictionary/dictionary.py | 8 | ||||
-rw-r--r-- | silpa/modules/fortune/fortune.py | 27 | ||||
-rw-r--r-- | silpa/modules/hyphenator/hyphenator.py | 3 | ||||
-rw-r--r-- | silpa/modules/hyphenator/rules/hyph_bn_IN.dic | 5 | ||||
-rw-r--r-- | silpa/modules/hyphenator/rules/hyph_ml_IN.dic | 2 | ||||
-rw-r--r-- | silpa/modules/inexactsearch/__init__.py | 4 | ||||
-rw-r--r-- | silpa/modules/inexactsearch/inexactsearch.py | 173 | ||||
-rw-r--r-- | silpa/modules/payyans/payyans.py | 3 | ||||
-rw-r--r-- | silpa/modules/transliterator/transliterate.py | 51 | ||||
-rw-r--r-- | silpa/silpa.conf | 7 |
12 files changed, 259 insertions, 33 deletions
diff --git a/silpa/common/modulemanager.py b/silpa/common/modulemanager.py index a3e9f1b..c6185dd 100644 --- a/silpa/common/modulemanager.py +++ b/silpa/common/modulemanager.py @@ -35,15 +35,15 @@ class ModuleManager: def getModulesInfoAsHTML(self): module_dict=getModulesList () response = "<h2>Available Modules</h2></hr>" - response = response+"<table class=\"table1\"><tr><th>Module</th><th>Description</th><th>Status</th></tr>" + response = response+"<table class=\"table1\"><tr><th>Module</th><th>Description</th></tr>" for action in module_dict: module_instance=self.getModuleInstance(action) if(module_instance!=None): - response = response+"<tr><td><a href='?action="+ action +"'>"+module_instance.get_module_name()+"</a></td>" - response = response+"<td>"+module_instance.get_info()+"</td><td>OK</td></tr>" + response = response+"<tr><td><a href='"+ action +"'>"+module_instance.get_module_name()+"</a></td>" + response = response+"<td>"+module_instance.get_info()+"</td></tr>" else: response = response+"<tr><td>"+action.replace("_"," ")+"</td>" - response = response+"<td>Error while retrieving module details</td><td>Fail</td></tr>" + response = response+"<td>Error while retrieving module details</td></tr>" return response+"</table>" if __name__ == '__main__': mm=ModuleManager() diff --git a/silpa/modules/__init__.py b/silpa/modules/__init__.py index 4fe98d1..78f7b19 100644 --- a/silpa/modules/__init__.py +++ b/silpa/modules/__init__.py @@ -10,4 +10,5 @@ from fortune import * from inexactsearch import * from dictionary import * from anagram import * +from normalizer import * diff --git a/silpa/modules/dictionary/dictionary.py b/silpa/modules/dictionary/dictionary.py index 7ba877a..4de0f76 100644 --- a/silpa/modules/dictionary/dictionary.py +++ b/silpa/modules/dictionary/dictionary.py @@ -28,6 +28,7 @@ import pickle class Dictionary(SilpaModule): def lookup_en_ml(self, key): + key=key.lower() self.dictFile=os.path.dirname(__file__) + "/data/dict.dat" pickled_dict=open(self.dictFile,'r') self.dictionary=pickle.load(pickled_dict) @@ -44,14 +45,15 @@ class Dictionary(SilpaModule): <h2>English Malayalam Dictionary</h2></hr> <p>Enter the word to lookup in the dictionary </p> - <form action="" method="get"> + <form action="" method="post"> <input type="text" value="%s" name="word"/> - <input type="submit" id="Find_Meaning" value="Find Meaning" name="action" style="width:12em;"/> + <input type="hidden" name="action" value="Dictionary"> + <input type="submit" id="Find_Meaning" value="Find Meaning" style="width:12em;"/> </br> </form> """ if(form.has_key('word')): - search_key = form['word'].value.decode('utf-8') + search_key = form['word'].value response=response % search_key response = response+"<h2>Search Results</h2></hr>" if(search_key==None): diff --git a/silpa/modules/fortune/fortune.py b/silpa/modules/fortune/fortune.py index cf31aad..bc83b51 100644 --- a/silpa/modules/fortune/fortune.py +++ b/silpa/modules/fortune/fortune.py @@ -1,7 +1,7 @@ # Fortune # -*- coding: utf-8 -*- # -# Copyright © 2008 Santhosh Thottingal +# Copyright © 2009 Santhosh Thottingal <santhosh.thottingal@gmai.com> # Released under the GPLV3+ license import os,random @@ -13,24 +13,24 @@ class Fortune(SilpaModule): for line in infile: line=line.decode("utf-8") if line == "%\n": - yield result - result = [] + continue else: if(pattern==None): result.append(line) else: - if(line.find(pattern)==-1): + if(line.find(pattern)>0): result.append(line) if result: - yield result + return result + - def fortune_ml(self, word): - filename="./modules/fortune/database/fortune-ml" + def fortune_ml(self, pattern): + filename = os.path.join(os.path.dirname(__file__), 'database/fortune-ml') """ Pick a random fortune from a file """ - for index, fortune in enumerate(self.fortunes(file(filename),None)): - if random.random() < (1.0 / (index+1)): - chosen = fortune - + fortunes_list=self.fortunes(file(filename),pattern) + chosen="" + if fortunes_list: + chosen= random.choice(fortunes_list) return "".join(chosen) def process(self, form): @@ -46,9 +46,10 @@ class Fortune(SilpaModule): """ if(form.has_key('input_text')): text = form['input_text'].value .decode('utf-8') + response=response % text else: - text="" - response=response % text + text= None + response=response % "" result = self.fortune_ml(text) response = response+"<h2>Random Quote</h2></hr>" response = response+"<b>"+result+"</b>" diff --git a/silpa/modules/hyphenator/hyphenator.py b/silpa/modules/hyphenator/hyphenator.py index f5e8fe2..889aee3 100644 --- a/silpa/modules/hyphenator/hyphenator.py +++ b/silpa/modules/hyphenator/hyphenator.py @@ -234,9 +234,8 @@ class Hyphenator(SilpaModule): You can give the text in any language and even with mixed language </p> <form action="" method="post"> - <textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea> + <textarea name='input_text' id='id1'>%s</textarea> <input type="submit" id="Hyphenate" value="Hyphenate" name="action" style="width:12em;"/> - <input type="reset" value="Clear" style="width:12em;"/> </br> </form> """ diff --git a/silpa/modules/hyphenator/rules/hyph_bn_IN.dic b/silpa/modules/hyphenator/rules/hyph_bn_IN.dic index 52acee5..42dca20 100644 --- a/silpa/modules/hyphenator/rules/hyph_bn_IN.dic +++ b/silpa/modules/hyphenator/rules/hyph_bn_IN.dic @@ -37,8 +37,9 @@ UTF-8 ৌ1 ৗ1 ্2 -ঃ1 -ং1 +2ঃ1 +2ং1 +2ঁ1 1ন ন্2 2ন্ diff --git a/silpa/modules/hyphenator/rules/hyph_ml_IN.dic b/silpa/modules/hyphenator/rules/hyph_ml_IN.dic index 44b50b9..36a70f2 100644 --- a/silpa/modules/hyphenator/rules/hyph_ml_IN.dic +++ b/silpa/modules/hyphenator/rules/hyph_ml_IN.dic @@ -43,7 +43,7 @@ UTF-8 ൌ1 ൗ1 ്2 -ഃ1 +2ഃ1 2ം1 1ന ന്2 diff --git a/silpa/modules/inexactsearch/__init__.py b/silpa/modules/inexactsearch/__init__.py new file mode 100644 index 0000000..d47e9c0 --- /dev/null +++ b/silpa/modules/inexactsearch/__init__.py @@ -0,0 +1,4 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +import inexactsearch + diff --git a/silpa/modules/inexactsearch/inexactsearch.py b/silpa/modules/inexactsearch/inexactsearch.py new file mode 100644 index 0000000..0d1f35b --- /dev/null +++ b/silpa/modules/inexactsearch/inexactsearch.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +# Paralperu +# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com> +# http://www.smc.org.in +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com +# URL: http://www.smc.org.in + + +import sys +import re +from common import * + +class ApproximateSearch(SilpaModule): + + def syllabalize_ml(self, text): + signs = [ + u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41', + u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48', + u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d'] + limiters = ['.','\"','\'','`','!',';',',','?'] + + chandrakkala = u'\u0d4d' + lst_chars = [] + for char in text: + if char in limiters: + lst_chars.append(char) + elif char in signs: + lst_chars[-1] = lst_chars[-1] + char + else: + try: + if lst_chars[-1][-1] == chandrakkala: + lst_chars[-1] = lst_chars[-1] + char + else: + lst_chars.append(char) + except IndexError: + lst_chars.append(char) + + return lst_chars + + + def bigram_search(self, str1, str2, syllable_search=False): + """Return approximate string comparator measure (between 0.0 and 1.0) + using bigrams. + USAGE: + score = bigram(str1, str2) + + ARGUMENTS: + str1 The first string + str2 The second string + + DESCRIPTION: + Bigrams are two-character sub-strings contained in a string. For example, + 'peter' contains the bigrams: pe,et,te,er. + + This routine counts the number of common bigrams and divides by the + average number of bigrams. The resulting number is returned. + """ + + # Quick check if the strings are the same - - - - - - - - - - - - - - - - - - + # + if (str1 == str2): + result_string = "<div style='float: left; background-color: green;' title=\" Bigram comparator : string1: %s, string2: %s. Exact Match found" % (str1, str2) + result_string = result_string + "\">"+str1+ "</div>" + return result_string + + bigr1 = [] + bigr2 = [] + + # Make a list of bigrams for both strings - - - - - - - - - - - - - - - - - - + # + for i in range(1,len(str1)): + bigr1.append(str1[i-1:i+1]) + for i in range(1,len(str2)): + bigr2.append(str2[i-1:i+1]) + + # Compute average number of bigrams - - - - - - - - - - - - - - - - - - - - - + # + average = (len(bigr1)+len(bigr2)) / 2.0 + if (average == 0.0): + return str1 + + # Get common bigrams - - - - - - - - - - - - - - - - - - - - - - - - - - - - + # + common = 0.0 + + if (len(bigr1) < len(bigr2)): # Count using the shorter bigram list + short_bigr = bigr1 + long_bigr = bigr2 + else: + short_bigr = bigr2 + long_bigr = bigr1 + + for b in short_bigr: + if (b in long_bigr): + common += 1.0 + long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted + + w = common / average + if(w>=0.6): + result_string = "<div style='float: left; background-color: yellow;' title=\" Bigram comparator string 1: %s, string 2: %s" % (str1, str2) + else: + if((w>0.4) & (w<0.6)): + result_string = "<div style='float: left; background-color: grey;' title=\" Bigram comparator string 1: %s, string 2: %s" % (str1, str2) + else: + result_string = "<div style='float: left;' title=\" Bigram comparator string1: %s, string2: %s" % (str1, str2) + result_string = result_string + " Number of bigrams in String1: %i" % (len(bigr1)) + result_string = result_string + " Number of bigrams in String2: %i" % (len(bigr2)) + result_string = result_string + " Average: %i" % (average) + result_string = result_string + " Common: %i" % (common) + result_string = result_string + " Final approximate string weight: " + str(w) + result_string = result_string + "\">"+str1+ "</div>" + return result_string + def process(self,form): + response = """ + <h2>Inexact Search</h2></hr> + <p>The search performed by search engines on Indic text is not effective. + It does not take care of the inflective or agglutinative nature of the language. + This application tries to solve that by using an inexact search algorithm based on maximum common bigram algorithm. + + </p> + <p>Enter the text for searching in the below text area. + </p> + <form action="" method="post"> + <textarea cols='100' rows='25' name='input_text' id='input_text'>%s</textarea> + <br/> + <input type="text" name="search_key" value="%s"/> + <input type="submit" id="Hyphenate" value="Approximate Search" name="action" style="width:12em;"/> + </br> + </form> + """ + if(form.has_key('input_text')): + text = action=form['input_text'].value .decode('utf-8') + if(form.has_key('search_key')): + key = action=form['search_key'].value .decode('utf-8') + response=response % (text,key) + words=text.split(" ") + response = response+"<h2>Search Results</h2></hr>" + response = response+"<p>Words in green are with exact match. Words in Yellow are with approximate Match." + response = response+" Move your mouse pointer over the words to get more information on matching.</p></hr>" + else: + response = response+ "Enter a string to search." + return response % (text,"") + for word in words: + word=word.strip() + if(word>""): + response = response+ self.bigram_search(word, key) + response = response+ "<div style='float: left;'> </div>" + else: + response=response % ("","") + return response + def get_module_name(self): + return "Approximate Search" + def get_info(self): + return "Approximate Search for a string in the given text. Based on bigram search algorithm" + +def getInstance(): + return ApproximateSearch() diff --git a/silpa/modules/payyans/payyans.py b/silpa/modules/payyans/payyans.py index 1702307..8751347 100644 --- a/silpa/modules/payyans/payyans.py +++ b/silpa/modules/payyans/payyans.py @@ -273,7 +273,7 @@ class Payyans(SilpaModule): <p>Enter the text for detecting the language in the below text area. </p> <form action="" method="post"> - <textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea><br/> + <textarea name='input_text' id='id1'>%s</textarea><br/> Select Font : <select id="font" name="%s" style="width:12em;"> <option value="karthika">Karthika</option> <option value="bhavana">Bhavana</option> @@ -282,7 +282,6 @@ class Payyans(SilpaModule): <option value="manorama">Manorama</option> </select> <input type="submit" id="Convert To Unicode" value="%s" name="action" style="width:12em;"/> - <input type="reset" value="Clear" style="width:12em;"/> </br> </form> """ diff --git a/silpa/modules/transliterator/transliterate.py b/silpa/modules/transliterator/transliterate.py index 206a6bf..1b534cf 100644 --- a/silpa/modules/transliterator/transliterate.py +++ b/silpa/modules/transliterator/transliterate.py @@ -1,6 +1,6 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- -# Paralperu +# Any Indian Language to any other Indian language transliterator # Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com> # http://www.smc.org.in # @@ -23,6 +23,48 @@ from common import * class Transliterator(SilpaModule): + def transliterate_ml_en(self, word): + virama=u"്" + #TODO: how to make this more generic so that more languages can be handled here? + #idea1: transliterate any langauge to a common language say hindi and the n do conversion? + #existing transliterate.py can be used? + #idea2: Have dictionaries for each language like english_xx_dict ? + #TODO: complete this + english_ml_dict={u'അ':'a',u'ആ':'a',u'ഇ':'a',u'ഈ':'a',u'ഉ':'a',u'ഊ':'a',u'ഋ':'a',\ + u'എ':'a',u'ഏ':'a',u'ഐ':'a',u'ഒ':'a',u'ഓ':'a',u'ഔ':'a',\ + u'ക':'k',u'ഖ':'kh',u'ഗ':'g',u'ഘ':'gh',u'ങ്ങ':'ng',\ + u'ച':'ch',u'ഛ':'chh',u'ജ':'j',u'ഝ':'jhh',u'ഞ':'nj',\ + u'ട':'t',u'ഠ':'th',u'ഡ':'d',u'ഢ':'dh',u'ണ':'n',\ + u'ത':'th',u'ഥ':'th',u'ദ':'d',u'ധ':'dh',u'ന':'n',\ + u'പ':'p',u'ഫ':'ph',u'ബ':'b',u'ഭ':'bh',u'മ':'m',\ + u'യ':'y',u'ര':'r',u'ല':'l', u'വ':'v', u'റ':'r',\ + u'ശ':'sa',u'ഷ':'sh',u'സ':'s', u'ഹ':'h',u'ള':'l',u'ഴ':'zh',\ + u'ാ':'a',u'ി':'i' ,u'ീ':'ee' ,u'ു':'u',\ + u'ൂ':'uu',u'ൃ':'ri' ,u'െ':'e' ,u'േ':'e',\ + u'ൈ':'ai',u'ൊ':'o' ,u'ോ':'oo' ,u'ൗ':'au'} + word_length =len(word) + index=0 + tx_string="" + while index<word_length: + a_vowel="" + try: + if(index+1<word_length): + if(word[index+1]==virama): + a_vowel="" + else: + if(index+1<word_length): + if (english_ml_dict[word[index+1]] in ['a','e','i','o','u']): + a_vowel="" + else: + a_vowel="a" + if (english_ml_dict[word[index]] in ['a','e','i','o','u']): + a_vowel="" + tx_string=tx_string+ english_ml_dict[word[index]] + a_vowel + except: + tx_string=tx_string+ word[index] + index=index+1 + return tx_string + def transliterate(self,text, target_lang_code): mm=ModuleManager() ld = mm.getModuleInstance("Detect Language") @@ -31,7 +73,9 @@ class Transliterator(SilpaModule): for word in words: if(word.strip()>""): src_lang_code=ld.detect_lang(word)[word] - tx_str = tx_str + if((target_lang_code=="en_US") and (src_lang_code=="ml_IN")): + tx_str=tx_str + self.transliterate_ml_en(word) + continue for chr in word: offset=ord(chr) + self.getOffset(src_lang_code, target_lang_code) if(offset>0): @@ -60,7 +104,7 @@ class Transliterator(SilpaModule): You can give the text in any language and even with mixed language </p> <form action="" method="post"> - <textarea cols='100' rows='25' name='input_text' id='id1'>%s</textarea></br> + <textarea name='input_text' id='id1'>%s</textarea></br> <select id="trans-lang" name="trans-lang" style="width:12em;"> <option value="hi_IN">Hindi</option> <option value="ml_IN">Malayalam</option> @@ -71,6 +115,7 @@ class Transliterator(SilpaModule): <option value="gu_IN">Gujarai</option> <option value="pa_IN">Panjabi</option> <option value="ka_IN">Kannada</option> + <option value="en_US">English</option> </select> <input type="submit" id="Transliterate" value="Transliterate" name="action" style="width:12em;"/> <input type="reset" value="Clear" style="width:12em;"/> diff --git a/silpa/silpa.conf b/silpa/silpa.conf index 068112c..a70a482 100644 --- a/silpa/silpa.conf +++ b/silpa/silpa.conf @@ -1,13 +1,13 @@ # Silpa configuration file # This is a comment # Global properties -SILPA_BASE_URL=http://planet.smc.org.in/exp/silpa/index.py +SILPA_BASE_URL=http://smc.org.in/silpa SILPA_TEMPLATE=./templates/default/silpa.html SILPA_SITE_NAME=Silpa SILPA_LANGUAGE=Silpa SILPA_SITE_ADMIN_NAME=Santhosh SILPA_SITE_ADMIN_EMAIL=santhosh@silpa.org -SILPA_SITE_COPYRIGHT= Copyright 2008-2009. Silpa Team. All rights Reserved. +SILPA_SITE_COPYRIGHT= Copyright 2008-2009. Silpa Team. All rights Reserved. #Silpa Plugin Modules #Format: action=module. For eg: @@ -16,6 +16,7 @@ SILPA_SITE_COPYRIGHT= Copyright 2008-2009. Silpa Team. All rights Reserved. #An example entry #SILPA_ACTION.Sort=modules.sort #comment at the end of line is also allowed SILPA_ACTION.Transliterate=modules.transliterate #transliterator module +SILPA_ACTION.Normalize=modules.normalizer #transliterator module SILPA_ACTION.Stem=modules.stemmer #stemmer module for Indian Languages SILPA_ACTION.Detect_Language=utils #This is also a valid module SILPA_ACTION.Guess_Language=modules.guess_language #This is also a valid module @@ -23,7 +24,7 @@ SILPA_ACTION.To_Unicode=modules.payyans #Ascii to Unicode conversion Module SILPA_ACTION.To_ASCII=modules.payyans #Unicode to Ascii conversion Module SILPA_ACTION.Syllabalize=modules.syllabalizer #Syllabalizer module SILPA_ACTION.Hyphenate=modules.hyphenator #Syllabalizer module -SILPA_ACTION.Find_Meaning=modules.dictionary #English Malayalam Module +SILPA_ACTION.Dictionary=modules.dictionary #English Malayalam Dictionary Module SILPA_ACTION.Fortune=modules.fortune #Syllabalizer module SILPA_ACTION.Anagram=modules.anagram #Anagram module SILPA_ACTION.Approximate_Search=modules.inexactsearch #Approximate search |